author     Michael Kruse <llvm-project@meinersbur.de>   2024-05-25 17:21:09 +0200
committer  Michael Kruse <llvm-project@meinersbur.de>   2024-05-25 17:21:09 +0200
commit     062fdd4f4439c00437fef07488e994a6ff10bb5d (patch)
tree       79297e3188951f7b98d10f3d67a92f4df75bac80
parent     0e864bbd4142cf202aa9ffd66eb67c9528c0f452 (diff)
parent     9da81cee219da78ab44357310a3bcf481bdba26c (diff)
download   llvm-users/meinersbur/ide_folders_llvm.zip
           llvm-users/meinersbur/ide_folders_llvm.tar.gz
           llvm-users/meinersbur/ide_folders_llvm.tar.bz2
Merge branch 'main' into users/meinersbur/ide_folders_llvm
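The metadata above describes a two-parent merge commit. As a minimal sketch, assuming a local clone of llvm-project that already contains the commits listed above (availability of the user branch, and which parent the diffstat below was taken against, are assumptions), the same information can be inspected with stock git:

    # Author/committer metadata for the merge commit
    git log -1 --format=fuller 062fdd4f4439c00437fef07488e994a6ff10bb5d

    # File-level diffstat of the merge relative to its first parent,
    # roughly corresponding to the file list below
    git diff --stat 0e864bbd4142cf202aa9ffd66eb67c9528c0f452 062fdd4f4439c00437fef07488e994a6ff10bb5d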
-rwxr-xr-x.ci/generate-buildkite-pipeline-premerge2
-rw-r--r--bolt/docs/BAT.md5
-rw-r--r--bolt/include/bolt/Core/BinaryContext.h12
-rw-r--r--bolt/include/bolt/Passes/BinaryPasses.h29
-rw-r--r--bolt/include/bolt/Passes/MCF.h41
-rw-r--r--bolt/include/bolt/Passes/StokeInfo.h4
-rw-r--r--bolt/include/bolt/Profile/BoltAddressTranslation.h49
-rw-r--r--bolt/include/bolt/Profile/DataAggregator.h5
-rw-r--r--bolt/lib/Core/BinaryContext.cpp21
-rw-r--r--bolt/lib/Core/BinaryEmitter.cpp4
-rw-r--r--bolt/lib/Core/BinaryFunction.cpp26
-rw-r--r--bolt/lib/Core/DebugNames.cpp4
-rw-r--r--bolt/lib/Core/DynoStats.cpp5
-rw-r--r--bolt/lib/Passes/BinaryFunctionCallGraph.cpp4
-rw-r--r--bolt/lib/Passes/BinaryPasses.cpp51
-rw-r--r--bolt/lib/Passes/CacheMetrics.cpp43
-rw-r--r--bolt/lib/Passes/Inliner.cpp4
-rw-r--r--bolt/lib/Passes/MCF.cpp33
-rw-r--r--bolt/lib/Profile/BoltAddressTranslation.cpp50
-rw-r--r--bolt/lib/Profile/CMakeLists.txt1
-rw-r--r--bolt/lib/Profile/DataAggregator.cpp26
-rw-r--r--bolt/lib/Profile/DataReader.cpp2
-rw-r--r--bolt/lib/Profile/StaleProfileMatching.cpp12
-rw-r--r--bolt/lib/Profile/YAMLProfileReader.cpp17
-rw-r--r--bolt/lib/Profile/YAMLProfileWriter.cpp7
-rw-r--r--bolt/lib/Rewrite/BinaryPassManager.cpp19
-rw-r--r--bolt/lib/Rewrite/DWARFRewriter.cpp3
-rw-r--r--bolt/lib/Rewrite/LinuxKernelRewriter.cpp2
-rw-r--r--bolt/lib/Rewrite/RewriteInstance.cpp21
-rw-r--r--bolt/lib/Target/X86/X86MCPlusBuilder.cpp38
-rw-r--r--bolt/lib/Utils/CommandLineOpts.cpp4
-rw-r--r--bolt/runtime/instr.cpp4
-rw-r--r--bolt/test/X86/bb-with-two-tail-calls.s10
-rw-r--r--bolt/test/X86/bolt-address-translation-yaml.test7
-rw-r--r--bolt/test/X86/bolt-address-translation.test2
-rw-r--r--bolt/test/X86/dwarf5-debug-names-class-type-decl.s670
-rw-r--r--bolt/test/X86/dwarf5-debug-names-enumeration-type-decl.s485
-rw-r--r--bolt/test/X86/dwarf5-debug-names-structure-type-decl.s671
-rw-r--r--bolt/test/X86/ignored-interprocedural-reference.s49
-rw-r--r--bolt/test/X86/register-fragments-bolt-symbols.s8
-rwxr-xr-xbolt/test/link_fdata.py3
-rw-r--r--clang-tools-extra/clang-tidy/bugprone/ForwardingReferenceOverloadCheck.cpp4
-rw-r--r--clang-tools-extra/clang-tidy/bugprone/IncorrectEnableIfCheck.cpp5
-rw-r--r--clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp32
-rw-r--r--clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp8
-rw-r--r--clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp16
-rw-r--r--clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp3
-rw-r--r--clang-tools-extra/clangd/Hover.cpp11
-rw-r--r--clang-tools-extra/clangd/test/infinite-instantiation.test2
-rw-r--r--clang-tools-extra/clangd/unittests/FindTargetTests.cpp4
-rw-r--r--clang-tools-extra/docs/ReleaseNotes.rst7
-rw-r--r--clang-tools-extra/docs/clang-tidy/checks/readability/implicit-bool-conversion.rst4
-rw-r--r--clang-tools-extra/test/clang-tidy/checkers/misc/new-delete-overloads.cpp10
-rw-r--r--clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming-outofline.cpp30
-rw-r--r--clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion.c354
-rw-r--r--clang/CMakeLists.txt5
-rw-r--r--clang/cmake/caches/CrossWinToARMLinux.cmake38
-rw-r--r--clang/cmake/caches/Fuchsia-stage2.cmake1
-rw-r--r--clang/cmake/caches/Fuchsia.cmake7
-rw-r--r--clang/cmake/caches/VectorEngine.cmake4
-rw-r--r--clang/docs/ReleaseNotes.rst56
-rw-r--r--clang/docs/analyzer/checkers.rst115
-rw-r--r--clang/include/clang/AST/ASTContext.h9
-rw-r--r--clang/include/clang/AST/ASTNodeTraverser.h8
-rw-r--r--clang/include/clang/AST/Decl.h2
-rw-r--r--clang/include/clang/AST/DeclTemplate.h28
-rw-r--r--clang/include/clang/AST/RecursiveASTVisitor.h25
-rw-r--r--clang/include/clang/AST/Type.h6
-rw-r--r--clang/include/clang/Basic/Attr.td8
-rw-r--r--clang/include/clang/Basic/AttrDocs.td7
-rw-r--r--clang/include/clang/Basic/BuiltinsAArch64.def2
-rw-r--r--clang/include/clang/Basic/BuiltinsAMDGPU.def1
-rw-r--r--clang/include/clang/Basic/BuiltinsWebAssembly.def2
-rw-r--r--clang/include/clang/Basic/BuiltinsX86.def21
-rw-r--r--clang/include/clang/Basic/DiagnosticCommonKinds.td3
-rw-r--r--clang/include/clang/Basic/DiagnosticDriverKinds.td61
-rw-r--r--clang/include/clang/Basic/DiagnosticFrontendKinds.td23
-rw-r--r--clang/include/clang/Basic/DiagnosticGroups.td6
-rw-r--r--clang/include/clang/Basic/DiagnosticInstallAPIKinds.td4
-rw-r--r--clang/include/clang/Basic/DiagnosticLexKinds.td2
-rw-r--r--clang/include/clang/Basic/DiagnosticParseKinds.td6
-rw-r--r--clang/include/clang/Basic/DiagnosticSemaKinds.td93
-rw-r--r--clang/include/clang/Basic/FileManager.h2
-rw-r--r--clang/include/clang/Basic/LangOptions.def1
-rw-r--r--clang/include/clang/Basic/arm_sve.td6
-rw-r--r--clang/include/clang/Driver/Options.td21
-rw-r--r--clang/include/clang/Parse/Parser.h14
-rw-r--r--clang/include/clang/Sema/Sema.h121
-rw-r--r--clang/include/clang/Sema/SemaOpenMP.h4
-rw-r--r--clang/include/clang/Sema/SemaRISCV.h52
-rw-r--r--clang/include/clang/Sema/SemaX86.h38
-rw-r--r--clang/include/clang/StaticAnalyzer/Checkers/Checkers.td27
-rw-r--r--clang/lib/AST/ASTContext.cpp9
-rw-r--r--clang/lib/AST/ASTDiagnostic.cpp111
-rw-r--r--clang/lib/AST/ASTImporter.cpp11
-rw-r--r--clang/lib/AST/DeclPrinter.cpp7
-rw-r--r--clang/lib/AST/DeclTemplate.cpp32
-rw-r--r--clang/lib/AST/Interp/ByteCodeExprGen.cpp98
-rw-r--r--clang/lib/AST/Interp/Descriptor.cpp7
-rw-r--r--clang/lib/AST/Interp/EvaluationResult.cpp4
-rw-r--r--clang/lib/AST/Interp/Interp.cpp46
-rw-r--r--clang/lib/AST/Interp/Interp.h32
-rw-r--r--clang/lib/AST/Interp/InterpBuiltin.cpp2
-rw-r--r--clang/lib/AST/Interp/Pointer.cpp14
-rw-r--r--clang/lib/AST/Interp/Pointer.h9
-rw-r--r--clang/lib/AST/Interp/Record.cpp2
-rw-r--r--clang/lib/AST/Interp/Record.h4
-rw-r--r--clang/lib/AST/JSONNodeDumper.cpp4
-rw-r--r--clang/lib/AST/ODRDiagsEmitter.cpp19
-rw-r--r--clang/lib/AST/ODRHash.cpp4
-rw-r--r--clang/lib/AST/ParentMap.cpp16
-rw-r--r--clang/lib/AST/TemplateBase.cpp14
-rw-r--r--clang/lib/AST/Type.cpp10
-rw-r--r--clang/lib/AST/TypePrinter.cpp9
-rw-r--r--clang/lib/Analysis/CFG.cpp50
-rw-r--r--clang/lib/Basic/FileManager.cpp48
-rw-r--r--clang/lib/Basic/Targets/Mips.cpp28
-rw-r--r--clang/lib/Basic/Targets/WebAssembly.h3
-rw-r--r--clang/lib/Basic/Targets/X86.cpp21
-rw-r--r--clang/lib/Basic/Targets/X86.h3
-rw-r--r--clang/lib/CodeGen/CGBuiltin.cpp11
-rw-r--r--clang/lib/CodeGen/CGExpr.cpp20
-rw-r--r--clang/lib/CodeGen/CGStmtOpenMP.cpp29
-rw-r--r--clang/lib/CodeGen/CodeGenModule.cpp23
-rw-r--r--clang/lib/CodeGen/CoverageMappingGen.cpp67
-rw-r--r--clang/lib/CodeGen/ItaniumCXXABI.cpp91
-rw-r--r--clang/lib/Driver/Driver.cpp11
-rw-r--r--clang/lib/Driver/ToolChains/AIX.cpp8
-rw-r--r--clang/lib/Driver/ToolChains/Arch/LoongArch.cpp4
-rw-r--r--clang/lib/Driver/ToolChains/Clang.cpp21
-rw-r--r--clang/lib/Driver/ToolChains/Darwin.cpp58
-rw-r--r--clang/lib/Driver/ToolChains/Darwin.h4
-rw-r--r--clang/lib/Driver/ToolChains/HIPSPV.cpp2
-rw-r--r--clang/lib/Driver/ToolChains/ZOS.cpp6
-rw-r--r--clang/lib/ExtractAPI/DeclarationFragments.cpp45
-rw-r--r--clang/lib/Format/UnwrappedLineParser.cpp21
-rw-r--r--clang/lib/Frontend/InitPreprocessor.cpp3
-rw-r--r--clang/lib/Frontend/SerializedDiagnosticPrinter.cpp2
-rw-r--r--clang/lib/Headers/CMakeLists.txt2
-rw-r--r--clang/lib/Headers/avx512erintrin.h271
-rw-r--r--clang/lib/Headers/avx512pfintrin.h92
-rw-r--r--clang/lib/Headers/immintrin.h8
-rw-r--r--clang/lib/Headers/intrin.h2
-rw-r--r--clang/lib/Headers/module.modulemap1
-rw-r--r--clang/lib/Index/IndexDecl.cpp6
-rw-r--r--clang/lib/Parse/ParseDecl.cpp146
-rw-r--r--clang/lib/Parse/ParseDeclCXX.cpp57
-rw-r--r--clang/lib/Parse/ParseObjc.cpp10
-rw-r--r--clang/lib/Parse/ParsePragma.cpp5
-rw-r--r--clang/lib/Sema/CMakeLists.txt1
-rw-r--r--clang/lib/Sema/HLSLExternalSemaSource.cpp58
-rw-r--r--clang/lib/Sema/Sema.cpp6
-rw-r--r--clang/lib/Sema/SemaAvailability.cpp5
-rw-r--r--clang/lib/Sema/SemaCast.cpp5
-rw-r--r--clang/lib/Sema/SemaChecking.cpp1922
-rw-r--r--clang/lib/Sema/SemaDecl.cpp100
-rw-r--r--clang/lib/Sema/SemaDeclAttr.cpp95
-rw-r--r--clang/lib/Sema/SemaDeclCXX.cpp13
-rw-r--r--clang/lib/Sema/SemaExpr.cpp102
-rw-r--r--clang/lib/Sema/SemaExprCXX.cpp3
-rw-r--r--clang/lib/Sema/SemaInit.cpp19
-rw-r--r--clang/lib/Sema/SemaLambda.cpp68
-rw-r--r--clang/lib/Sema/SemaLookup.cpp11
-rw-r--r--clang/lib/Sema/SemaOpenMP.cpp197
-rw-r--r--clang/lib/Sema/SemaOverload.cpp15
-rw-r--r--clang/lib/Sema/SemaRISCV.cpp939
-rw-r--r--clang/lib/Sema/SemaStmtAttr.cpp8
-rw-r--r--clang/lib/Sema/SemaTemplate.cpp222
-rw-r--r--clang/lib/Sema/SemaTemplateDeduction.cpp25
-rw-r--r--clang/lib/Sema/SemaTemplateInstantiate.cpp11
-rw-r--r--clang/lib/Sema/SemaTemplateInstantiateDecl.cpp17
-rw-r--r--clang/lib/Sema/SemaType.cpp6
-rw-r--r--clang/lib/Sema/SemaX86.cpp878
-rw-r--r--clang/lib/Sema/TreeTransform.h14
-rw-r--r--clang/lib/Serialization/ASTReaderDecl.cpp6
-rw-r--r--clang/lib/Serialization/ASTWriterDecl.cpp4
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt3
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/PutenvStackArrayChecker.cpp (renamed from clang/lib/StaticAnalyzer/Checkers/cert/PutenvWithAutoChecker.cpp)26
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/SetgidSetuidOrderChecker.cpp196
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp72
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLocalVarsChecker.cpp68
-rw-r--r--clang/lib/StaticAnalyzer/Core/ExprEngine.cpp56
-rw-r--r--clang/test/AST/Interp/arrays.cpp4
-rw-r--r--clang/test/AST/Interp/builtin-functions.cpp2
-rw-r--r--clang/test/AST/Interp/c.c12
-rw-r--r--clang/test/AST/Interp/cxx03.cpp5
-rw-r--r--clang/test/AST/Interp/cxx98.cpp4
-rw-r--r--clang/test/AST/Interp/eval-order.cpp30
-rw-r--r--clang/test/AST/Interp/objc.mm8
-rw-r--r--clang/test/AST/Interp/records.cpp6
-rw-r--r--clang/test/AST/Interp/unions.cpp67
-rw-r--r--clang/test/AST/ast-dump-decl.cpp4
-rw-r--r--clang/test/AST/ast-dump-default-init-json.cpp6
-rw-r--r--clang/test/AST/ast-dump-default-init.cpp2
-rw-r--r--clang/test/AST/ast-dump-expr-json.cpp2
-rw-r--r--clang/test/AST/ast-dump-expr.cpp2
-rw-r--r--clang/test/AST/ast-dump-stmt-json.cpp244
-rw-r--r--clang/test/AST/attr-counted-by-late-parsed-struct-ptrs.c45
-rw-r--r--clang/test/AST/attr-counted-by-struct-ptrs.c117
-rw-r--r--clang/test/Analysis/Checkers/WebKit/uncounted-local-vars.cpp73
-rw-r--r--clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp30
-rw-r--r--clang/test/Analysis/cert/pos34-c-fp-suppression.cpp51
-rw-r--r--clang/test/Analysis/cert/pos34-c.cpp61
-rw-r--r--clang/test/Analysis/cxx-uninitialized-object.cpp12
-rw-r--r--clang/test/Analysis/cxxnewexpr-callback.cpp4
-rw-r--r--clang/test/Analysis/lifetime-extended-regions.cpp10
-rw-r--r--clang/test/Analysis/putenv-stack-array.c70
-rw-r--r--clang/test/Analysis/setgid-setuid-order-notes.c73
-rw-r--r--clang/test/Analysis/setgid-setuid-order.c257
-rw-r--r--clang/test/CXX/basic/basic.stc/basic.stc.dynamic/basic.stc.dynamic.deallocation/p2.cpp2
-rw-r--r--clang/test/CXX/drs/cwg16xx.cpp2
-rw-r--r--clang/test/CXX/drs/cwg18xx.cpp19
-rw-r--r--clang/test/CXX/drs/cwg28xx.cpp71
-rw-r--r--clang/test/CXX/drs/cwg292.cpp17
-rw-r--r--clang/test/CXX/expr/expr.unary/expr.new/p14.cpp2
-rw-r--r--clang/test/CXX/expr/expr.unary/expr.sizeof/p5-0x.cpp2
-rw-r--r--clang/test/CXX/special/class.temporary/p6.cpp34
-rw-r--r--clang/test/ClangScanDeps/response-file-clang-cl.c56
-rw-r--r--clang/test/CodeGen/SystemZ/sync-builtins-i128-8Al.c8
-rw-r--r--clang/test/CodeGen/X86/avx512er-builtins.c347
-rw-r--r--clang/test/CodeGen/X86/avx512pf-builtins.c100
-rw-r--r--clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c6
-rw-r--r--clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret-bfloat.c57
-rw-r--r--clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret.c253
-rw-r--r--clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret_from_streaming_mode.c35
-rw-r--r--clang/test/CodeGen/aarch64-sve-vector-subscript-ops.c22
-rw-r--r--clang/test/CodeGen/assume_attr.c58
-rw-r--r--clang/test/CodeGen/attr-cpuspecific.c10
-rw-r--r--clang/test/CodeGen/attr-target-x86.c4
-rw-r--r--clang/test/CodeGen/builtins-wasm.c12
-rw-r--r--clang/test/CodeGen/darwin-target-variant.c2
-rw-r--r--clang/test/CodeGen/fat-lto-objects.c2
-rw-r--r--clang/test/CodeGen/function-target-features.c4
-rw-r--r--clang/test/CodeGen/functions.c12
-rw-r--r--clang/test/CodeGen/target-builtin-noerror.c2
-rw-r--r--clang/test/CodeGenCXX/assume_attr.cpp48
-rw-r--r--clang/test/CodeGenCXX/cxx1y-sized-deallocation.cpp10
-rw-r--r--clang/test/CodeGenCXX/cxx1z-aligned-allocation.cpp6
-rw-r--r--clang/test/CodeGenCXX/cxx2a-destroying-delete.cpp4
-rw-r--r--clang/test/CodeGenCXX/cxx2b-deducing-this.cpp63
-rw-r--r--clang/test/CodeGenCXX/delete-two-arg.cpp4
-rw-r--r--clang/test/CodeGenCXX/delete.cpp12
-rw-r--r--clang/test/CodeGenCXX/dllimport.cpp4
-rw-r--r--clang/test/CodeGenCXX/fmv-namespace.cpp93
-rw-r--r--clang/test/CodeGenCXX/new.cpp6
-rw-r--r--clang/test/CodeGenCXX/ps-dllstorage-vtable-rtti.cpp114
-rw-r--r--clang/test/CodeGenCXX/ps4-dllstorage-vtable-rtti.cpp211
-rw-r--r--clang/test/CodeGenCXX/weak-external.cpp2
-rw-r--r--clang/test/CodeGenCoroutines/coro-aligned-alloc-2.cpp2
-rw-r--r--clang/test/CodeGenCoroutines/coro-aligned-alloc.cpp6
-rw-r--r--clang/test/CodeGenCoroutines/coro-alloc.cpp6
-rw-r--r--clang/test/CodeGenCoroutines/coro-cleanup.cpp6
-rw-r--r--clang/test/CodeGenCoroutines/coro-dealloc.cpp2
-rw-r--r--clang/test/CodeGenCoroutines/coro-gro.cpp3
-rw-r--r--clang/test/CodeGenCoroutines/pr56919.cpp9
-rw-r--r--clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl52
-rw-r--r--clang/test/CoverageMapping/builtinmacro.c2
-rw-r--r--clang/test/CoverageMapping/macros.c8
-rw-r--r--clang/test/CoverageMapping/mcdc-scratch-space.c65
-rw-r--r--clang/test/CoverageMapping/templates.cpp3
-rw-r--r--clang/test/Driver/Ofast.c7
-rw-r--r--clang/test/Driver/aarch64-v95a.c8
-rw-r--r--clang/test/Driver/android-unversioned-fallback-warning.cpp8
-rw-r--r--clang/test/Driver/cl-options.c3
-rw-r--r--clang/test/Driver/cl-x86-flags.c10
-rw-r--r--clang/test/Driver/clang_f_opts.c6
-rw-r--r--clang/test/Driver/cuda-cross-compiling.c4
-rw-r--r--clang/test/Driver/dxc_dxv_path.hlsl2
-rw-r--r--clang/test/Driver/fast-math.c24
-rw-r--r--clang/test/Driver/fat-archive-unbundle-ext.c2
-rw-r--r--clang/test/Driver/fatal-warnings.c4
-rw-r--r--clang/test/Driver/fbinutils-version.c14
-rw-r--r--clang/test/Driver/fdirect-access-external-data.c14
-rw-r--r--clang/test/Driver/fembed-bitcode.c10
-rw-r--r--clang/test/Driver/fexcess-precision.c32
-rw-r--r--clang/test/Driver/fextend-args.c2
-rw-r--r--clang/test/Driver/fforce-dwarf-frame.c6
-rw-r--r--clang/test/Driver/fgnuc-version.c18
-rw-r--r--clang/test/Driver/flags.c6
-rw-r--r--clang/test/Driver/flang/msvc-link.f902
-rw-r--r--clang/test/Driver/fmemprof.cpp10
-rw-r--r--clang/test/Driver/fopenmp.c204
-rw-r--r--clang/test/Driver/fortran.f956
-rw-r--r--clang/test/Driver/fpatchable-function-entry.c26
-rw-r--r--clang/test/Driver/frame-pointer-elim.c88
-rw-r--r--clang/test/Driver/freebsd-mips-as.c34
-rw-r--r--clang/test/Driver/freebsd.cpp10
-rw-r--r--clang/test/Driver/fsanitize-coverage.c106
-rw-r--r--clang/test/Driver/fsanitize-ignorelist.c32
-rw-r--r--clang/test/Driver/fsanitize-memory-param-retval.c16
-rw-r--r--clang/test/Driver/fsanitize-metadata-ignorelist.c8
-rw-r--r--clang/test/Driver/fsanitize-object-size.c32
-rw-r--r--clang/test/Driver/fsemantic-interposition.c22
-rw-r--r--clang/test/Driver/fsjlj-exceptions.c4
-rw-r--r--clang/test/Driver/fuse-ld-windows.c8
-rw-r--r--clang/test/Driver/fuse-ld.c28
-rw-r--r--clang/test/Driver/fuzzer.c10
-rw-r--r--clang/test/Driver/fveclib.c34
-rw-r--r--clang/test/Driver/loongarch-mlasx-error.c4
-rw-r--r--clang/test/Driver/loongarch-mlsx-error.c2
-rw-r--r--clang/test/Driver/ms-define-stdc.c11
-rw-r--r--clang/test/Driver/openmp-offload-infer.c2
-rw-r--r--clang/test/Driver/openmp-system-arch.c2
-rw-r--r--clang/test/Driver/tocdata-cc1.c17
-rw-r--r--clang/test/Driver/x-args.c4
-rw-r--r--clang/test/Driver/x86-target-features.c13
-rw-r--r--clang/test/ExtractAPI/non_type_template.cpp44
-rw-r--r--clang/test/Frontend/optimization-remark-options.c4
-rw-r--r--clang/test/Frontend/x86-target-cpu.c10
-rw-r--r--clang/test/InstallAPI/binary-attributes.test4
-rw-r--r--clang/test/Lexer/cxx-features.cpp20
-rw-r--r--clang/test/Misc/diag-template-diffing-cxx11.cpp (renamed from clang/test/Misc/diag-template-diffing.cpp)0
-rw-r--r--clang/test/Misc/diag-template-diffing-cxx26.cpp49
-rw-r--r--clang/test/Misc/pragma-attribute-supported-attributes-list.test1
-rw-r--r--clang/test/Modules/implicit-module-remap.cpp21
-rw-r--r--clang/test/OpenMP/assumes_codegen.cpp80
-rw-r--r--clang/test/OpenMP/assumes_print.cpp6
-rw-r--r--clang/test/OpenMP/assumes_template_print.cpp20
-rw-r--r--clang/test/OpenMP/atomic_messages.c96
-rw-r--r--clang/test/OpenMP/distribute_firstprivate_messages.cpp6
-rw-r--r--clang/test/OpenMP/distribute_parallel_for_firstprivate_messages.cpp18
-rw-r--r--clang/test/OpenMP/distribute_parallel_for_lastprivate_messages.cpp18
-rw-r--r--clang/test/OpenMP/distribute_parallel_for_private_messages.cpp2
-rw-r--r--clang/test/OpenMP/distribute_parallel_for_reduction_messages.cpp20
-rw-r--r--clang/test/OpenMP/distribute_parallel_for_simd_private_messages.cpp2
-rw-r--r--clang/test/OpenMP/distribute_parallel_for_simd_shared_messages.cpp16
-rw-r--r--clang/test/OpenMP/distribute_simd_firstprivate_messages.cpp18
-rw-r--r--clang/test/OpenMP/distribute_simd_lastprivate_messages.cpp18
-rw-r--r--clang/test/OpenMP/distribute_simd_loop_messages.cpp30
-rw-r--r--clang/test/OpenMP/distribute_simd_private_messages.cpp2
-rw-r--r--clang/test/OpenMP/distribute_simd_reduction_messages.cpp20
-rw-r--r--clang/test/OpenMP/reduction_implicit_map.cpp2
-rw-r--r--clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c8
-rw-r--r--clang/test/OpenMP/remarks_parallel_in_target_state_machine.c4
-rw-r--r--clang/test/OpenMP/requires_default_atomic_mem_order_messages.cpp4
-rw-r--r--clang/test/OpenMP/requires_messages.cpp26
-rw-r--r--clang/test/OpenMP/target_device_ancestor_messages.cpp2
-rw-r--r--clang/test/OpenMP/target_firstprivate_messages.cpp2
-rw-r--r--clang/test/OpenMP/target_map_messages.cpp20
-rw-r--r--clang/test/OpenMP/target_parallel_for_private_messages.cpp2
-rw-r--r--clang/test/OpenMP/target_parallel_for_simd_private_messages.cpp2
-rw-r--r--clang/test/OpenMP/target_private_messages.cpp2
-rw-r--r--clang/test/OpenMP/target_simd_private_messages.cpp2
-rw-r--r--clang/test/OpenMP/target_teams_distribute_firstprivate_messages.cpp2
-rw-r--r--clang/test/OpenMP/target_update_messages.cpp4
-rw-r--r--clang/test/OpenMP/teams_distribute_loop_messages.cpp28
-rw-r--r--clang/test/OpenMP/teams_distribute_parallel_for_loop_messages.cpp28
-rw-r--r--clang/test/OpenMP/teams_distribute_parallel_for_simd_loop_messages.cpp28
-rw-r--r--clang/test/OpenMP/teams_distribute_simd_loop_messages.cpp28
-rw-r--r--clang/test/OpenMP/tile_codegen.cpp887
-rw-r--r--clang/test/OpenMP/tile_codegen_for_dependent.cpp130
-rw-r--r--clang/test/OpenMP/tile_codegen_tile_for.cpp218
-rw-r--r--clang/test/PCH/cxx1z-aligned-alloc.cpp10
-rw-r--r--clang/test/Parser/altivec.c24
-rw-r--r--clang/test/Parser/cxx-altivec.cpp24
-rw-r--r--clang/test/Parser/lax-conv.cpp52
-rw-r--r--clang/test/Parser/objcbridge-related-attribute.m4
-rw-r--r--clang/test/Parser/pragma-attribute.cpp2
-rw-r--r--clang/test/Preprocessor/predefined-arch-macros.c12
-rw-r--r--clang/test/Preprocessor/stdc-ms-extension.cpp9
-rw-r--r--clang/test/Preprocessor/x86_target_features.c50
-rw-r--r--clang/test/Profile/misexpect-branch.c8
-rw-r--r--clang/test/Profile/misexpect-switch-default.c2
-rw-r--r--clang/test/Profile/misexpect-switch.c2
-rw-r--r--clang/test/Sema/attr-assume.c14
-rw-r--r--clang/test/Sema/attr-counted-by-late-parsed-off.c26
-rw-r--r--clang/test/Sema/attr-counted-by-late-parsed-struct-ptrs.c254
-rw-r--r--clang/test/Sema/attr-counted-by-struct-ptrs-sizeless-types.c17
-rw-r--r--clang/test/Sema/attr-counted-by-struct-ptrs.c224
-rw-r--r--clang/test/Sema/attr-counted-by-vla-sizeless-types.c11
-rw-r--r--clang/test/Sema/attr-counted-by-vla.c196
-rw-r--r--clang/test/Sema/attr-counted-by.c112
-rw-r--r--clang/test/Sema/attr-objc-bridge-related.m2
-rw-r--r--clang/test/Sema/builtins-x86.c8
-rw-r--r--clang/test/Sema/builtins.c8
-rw-r--r--clang/test/Sema/constant_builtins_vector.cpp4
-rw-r--r--clang/test/Sema/fmv-namespace.cpp12
-rw-r--r--clang/test/Sema/x86-eval-method.c4
-rw-r--r--clang/test/Sema/x86_64-eval-method.c2
-rw-r--r--clang/test/SemaCUDA/device-var-init.cu314
-rw-r--r--clang/test/SemaCUDA/function-overload.cu2
-rw-r--r--clang/test/SemaCUDA/union-init.cu8
-rw-r--r--clang/test/SemaCXX/MicrosoftExtensions.cpp8
-rw-r--r--clang/test/SemaCXX/addr-label-in-coroutines.cpp18
-rw-r--r--clang/test/SemaCXX/builtin-operator-new-delete.cpp2
-rw-r--r--clang/test/SemaCXX/constexpr-default-arg.cpp4
-rw-r--r--clang/test/SemaCXX/cxx11-default-member-initializers.cpp74
-rw-r--r--clang/test/SemaCXX/cxx1y-sized-deallocation.cpp2
-rw-r--r--clang/test/SemaCXX/cxx20-ctad-type-alias.cpp2
-rw-r--r--clang/test/SemaCXX/cxx23-assume.cpp15
-rw-r--r--clang/test/SemaCXX/cxx2b-consteval-propagate.cpp8
-rw-r--r--clang/test/SemaCXX/eval-crashes.cpp6
-rw-r--r--clang/test/SemaCXX/unavailable_aligned_allocation.cpp15
-rw-r--r--clang/test/SemaCXX/warn-thread-safety-analysis.cpp10
-rw-r--r--clang/test/SemaCXX/warn-unsafe-buffer-usage-pragma-misuse.cpp4
-rw-r--r--clang/test/SemaObjC/unguarded-availability.m20
-rw-r--r--clang/test/SemaOpenCL/builtins-amdgcn-gfx940-err.cl14
-rw-r--r--clang/test/SemaOpenCL/vector_swizzle_length.cl4
-rw-r--r--clang/test/SemaTemplate/deduction-guide.cpp16
-rw-r--r--clang/test/SemaTemplate/make_integer_seq.cpp4
-rw-r--r--clang/tools/clang-repl/CMakeLists.txt43
-rw-r--r--clang/tools/clang-scan-deps/ClangScanDeps.cpp9
-rw-r--r--clang/tools/libclang/CIndex.cpp12
-rw-r--r--clang/tools/scan-build-py/tests/functional/exec/CMakeLists.txt6
-rw-r--r--clang/unittests/AST/ASTImporterTest.cpp4
-rw-r--r--clang/unittests/AST/DeclTest.cpp31
-rw-r--r--clang/unittests/Driver/DXCModeTest.cpp19
-rw-r--r--clang/unittests/Format/FormatTest.cpp2
-rw-r--r--clang/unittests/Format/TokenAnnotatorTest.cpp54
-rw-r--r--clang/unittests/Interpreter/CMakeLists.txt43
-rw-r--r--clang/unittests/StaticAnalyzer/CallEventTest.cpp2
-rw-r--r--clang/utils/TableGen/SveEmitter.cpp21
-rw-r--r--clang/utils/analyzer/entrypoint.py2
-rw-r--r--clang/utils/ci/buildkite-pipeline.yml31
-rwxr-xr-xclang/utils/ci/run-buildbot25
-rwxr-xr-xclang/www/cxx_dr_status.html4
-rwxr-xr-xclang/www/cxx_status.html11
-rw-r--r--compiler-rt/cmake/config-ix.cmake15
-rw-r--r--compiler-rt/lib/dfsan/dfsan_allocator.cpp2
-rw-r--r--compiler-rt/lib/dfsan/dfsan_custom.cpp26
-rw-r--r--compiler-rt/lib/lsan/lsan_allocator.cpp2
-rw-r--r--compiler-rt/lib/msan/msan_allocator.cpp2
-rwxr-xr-xcompiler-rt/lib/sanitizer_common/symbolizer/scripts/build_symbolizer.sh1
-rw-r--r--compiler-rt/lib/scudo/standalone/combined.h12
-rw-r--r--compiler-rt/lib/xray/tests/CMakeLists.txt5
-rw-r--r--compiler-rt/test/asan/TestCases/Windows/bitfield_uaf.cpp4
-rw-r--r--compiler-rt/test/asan/TestCases/Windows/calloc_left_oob.cpp2
-rw-r--r--compiler-rt/test/asan/TestCases/Windows/calloc_right_oob.cpp2
-rw-r--r--compiler-rt/test/asan/TestCases/Windows/calloc_uaf.cpp4
-rw-r--r--compiler-rt/test/dfsan/custom.cpp67
-rw-r--r--flang/CMakeLists.txt2
-rw-r--r--flang/docs/Extensions.md4
-rw-r--r--flang/include/flang/Common/Fortran-features.h3
-rw-r--r--flang/include/flang/Common/api-attrs.h22
-rw-r--r--flang/include/flang/Common/visit.h7
-rw-r--r--flang/include/flang/Evaluate/characteristics.h2
-rw-r--r--flang/include/flang/Evaluate/constant.h3
-rw-r--r--flang/include/flang/Evaluate/expression.h3
-rw-r--r--flang/include/flang/Evaluate/type.h3
-rw-r--r--flang/include/flang/Optimizer/Builder/IntrinsicCall.h3
-rw-r--r--flang/include/flang/Optimizer/Builder/Runtime/Numeric.h8
-rw-r--r--flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td10
-rw-r--r--flang/include/flang/Optimizer/HLFIR/Passes.h3
-rw-r--r--flang/include/flang/Optimizer/HLFIR/Passes.td5
-rw-r--r--flang/include/flang/Semantics/scope.h2
-rw-r--r--flang/include/flang/Semantics/semantics.h4
-rw-r--r--flang/include/flang/Semantics/symbol.h1
-rw-r--r--flang/include/flang/Tools/CLOptions.inc7
-rw-r--r--flang/lib/Evaluate/characteristics.cpp17
-rw-r--r--flang/lib/Evaluate/formatting.cpp213
-rw-r--r--flang/lib/Evaluate/shape.cpp8
-rw-r--r--flang/lib/Lower/Bridge.cpp50
-rw-r--r--flang/lib/Lower/OpenMP/ClauseProcessor.cpp5
-rw-r--r--flang/lib/Lower/OpenMP/OpenMP.cpp7
-rw-r--r--flang/lib/Optimizer/Builder/IntrinsicCall.cpp31
-rw-r--r--flang/lib/Optimizer/Builder/Runtime/Numeric.cpp41
-rw-r--r--flang/lib/Optimizer/CodeGen/CodeGen.cpp40
-rw-r--r--flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp18
-rw-r--r--flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp2
-rw-r--r--flang/lib/Optimizer/HLFIR/Transforms/InlineElementals.cpp2
-rw-r--r--flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp11
-rw-r--r--flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp7
-rw-r--r--flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp9
-rw-r--r--flang/lib/Optimizer/Transforms/AddDebugInfo.cpp89
-rw-r--r--flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp53
-rw-r--r--flang/lib/Optimizer/Transforms/DebugTypeGenerator.h4
-rw-r--r--flang/lib/Semantics/check-call.cpp38
-rw-r--r--flang/lib/Semantics/check-declarations.cpp87
-rw-r--r--flang/lib/Semantics/check-omp-structure.cpp59
-rw-r--r--flang/lib/Semantics/check-omp-structure.h1
-rw-r--r--flang/lib/Semantics/expression.cpp74
-rw-r--r--flang/lib/Semantics/mod-file.cpp156
-rw-r--r--flang/lib/Semantics/mod-file.h1
-rw-r--r--flang/lib/Semantics/resolve-names-utils.cpp44
-rw-r--r--flang/lib/Semantics/resolve-names.cpp47
-rw-r--r--flang/lib/Semantics/symbol.cpp12
-rw-r--r--flang/runtime/edit-output.cpp5
-rw-r--r--flang/runtime/external-unit.cpp8
-rw-r--r--flang/runtime/numeric.cpp8
-rw-r--r--flang/runtime/terminator.h2
-rw-r--r--flang/runtime/unit.cpp1
-rw-r--r--flang/test/Driver/fopenmp.f909
-rw-r--r--flang/test/Driver/mlir-pass-pipeline.f907
-rw-r--r--flang/test/Driver/w-arg-unsupported.f9052
-rw-r--r--flang/test/Driver/wextra-ok.f902
-rw-r--r--flang/test/Evaluate/triplets01.f9011
-rw-r--r--flang/test/Fir/basic-program.fir9
-rw-r--r--flang/test/Integration/debug-complex-1.f9026
-rw-r--r--flang/test/Integration/debug-fixed-array-type-2.f9043
-rw-r--r--flang/test/Integration/debug-module-2.f9039
-rw-r--r--flang/test/Lower/CUDA/cuda-data-transfer.cuf28
-rw-r--r--flang/test/Lower/Intrinsics/selected_char_kind.f9017
-rw-r--r--flang/test/Lower/Intrinsics/selected_logical_kind.f9071
-rw-r--r--flang/test/Lower/OpenMP/invalid-reduction-modifier.f904
-rw-r--r--flang/test/Semantics/OpenMP/allocate-clause01.f902
-rw-r--r--flang/test/Semantics/OpenMP/allocate-directive.f902
-rw-r--r--flang/test/Semantics/OpenMP/allocate01.f902
-rw-r--r--flang/test/Semantics/OpenMP/allocate02.f902
-rw-r--r--flang/test/Semantics/OpenMP/allocate03.f902
-rw-r--r--flang/test/Semantics/OpenMP/allocate04.f902
-rw-r--r--flang/test/Semantics/OpenMP/allocate05.f902
-rw-r--r--flang/test/Semantics/OpenMP/allocate06.f902
-rw-r--r--flang/test/Semantics/OpenMP/allocate07.f902
-rw-r--r--flang/test/Semantics/OpenMP/allocate08.f902
-rw-r--r--flang/test/Semantics/OpenMP/allocate09.f902
-rw-r--r--flang/test/Semantics/OpenMP/allocators01.f902
-rw-r--r--flang/test/Semantics/OpenMP/allocators02.f902
-rw-r--r--flang/test/Semantics/OpenMP/allocators03.f902
-rw-r--r--flang/test/Semantics/OpenMP/allocators04.f902
-rw-r--r--flang/test/Semantics/OpenMP/allocators05.f902
-rw-r--r--flang/test/Semantics/OpenMP/allocators06.f902
-rw-r--r--flang/test/Semantics/OpenMP/atomic-hint-clause.f902
-rw-r--r--flang/test/Semantics/OpenMP/atomic.f902
-rw-r--r--flang/test/Semantics/OpenMP/atomic01.f902
-rw-r--r--flang/test/Semantics/OpenMP/atomic02.f902
-rw-r--r--flang/test/Semantics/OpenMP/atomic03.f902
-rw-r--r--flang/test/Semantics/OpenMP/atomic04.f902
-rw-r--r--flang/test/Semantics/OpenMP/atomic05.f902
-rw-r--r--flang/test/Semantics/OpenMP/barrier.f902
-rw-r--r--flang/test/Semantics/OpenMP/clause-validity01.f902
-rw-r--r--flang/test/Semantics/OpenMP/combined-constructs.f902
-rw-r--r--flang/test/Semantics/OpenMP/common-block.f902
-rw-r--r--flang/test/Semantics/OpenMP/compiler-directive.f902
-rw-r--r--flang/test/Semantics/OpenMP/copyin01.f902
-rw-r--r--flang/test/Semantics/OpenMP/copyin02.f902
-rw-r--r--flang/test/Semantics/OpenMP/copyin03.f902
-rw-r--r--flang/test/Semantics/OpenMP/copyin04.f902
-rw-r--r--flang/test/Semantics/OpenMP/copyin05.f902
-rw-r--r--flang/test/Semantics/OpenMP/copying.f902
-rw-r--r--flang/test/Semantics/OpenMP/copyprivate01.f902
-rw-r--r--flang/test/Semantics/OpenMP/copyprivate02.f902
-rw-r--r--flang/test/Semantics/OpenMP/copyprivate03.f902
-rw-r--r--flang/test/Semantics/OpenMP/critical-empty.f902
-rw-r--r--flang/test/Semantics/OpenMP/critical-hint-clause.f902
-rw-r--r--flang/test/Semantics/OpenMP/do02.f9021
-rw-r--r--flang/test/Semantics/OpenMP/reduction-modifiers.f9089
-rw-r--r--flang/test/Semantics/OpenMP/sections01.f902
-rw-r--r--flang/test/Semantics/OpenMP/sections02.f902
-rw-r--r--flang/test/Semantics/OpenMP/sections03.f9029
-rw-r--r--flang/test/Semantics/OpenMP/simd-aligned.f902
-rw-r--r--flang/test/Semantics/OpenMP/simd-nontemporal.f902
-rw-r--r--flang/test/Semantics/OpenMP/simd01.f902
-rw-r--r--flang/test/Semantics/OpenMP/simd02.f902
-rw-r--r--flang/test/Semantics/OpenMP/simd03.f904
-rw-r--r--flang/test/Semantics/OpenMP/single01.f902
-rw-r--r--flang/test/Semantics/OpenMP/single02.f902
-rw-r--r--flang/test/Semantics/OpenMP/struct.f902
-rw-r--r--flang/test/Semantics/OpenMP/symbol01.f902
-rw-r--r--flang/test/Semantics/OpenMP/symbol02.f902
-rw-r--r--flang/test/Semantics/OpenMP/symbol03.f902
-rw-r--r--flang/test/Semantics/OpenMP/symbol04.f902
-rw-r--r--flang/test/Semantics/OpenMP/symbol05.f902
-rw-r--r--flang/test/Semantics/OpenMP/symbol06.f902
-rw-r--r--flang/test/Semantics/OpenMP/symbol07.f902
-rw-r--r--flang/test/Semantics/OpenMP/symbol08.f902
-rw-r--r--flang/test/Semantics/OpenMP/symbol09.f902
-rw-r--r--flang/test/Semantics/OpenMP/sync-critical01.f902
-rw-r--r--flang/test/Semantics/OpenMP/sync-critical02.f902
-rw-r--r--flang/test/Semantics/OpenMP/taskloop01.f902
-rw-r--r--flang/test/Semantics/OpenMP/taskloop02.f902
-rw-r--r--flang/test/Semantics/OpenMP/taskloop03.f9025
-rw-r--r--flang/test/Semantics/OpenMP/taskwait.f902
-rw-r--r--flang/test/Semantics/OpenMP/threadprivate01.f902
-rw-r--r--flang/test/Semantics/OpenMP/threadprivate02.f902
-rw-r--r--flang/test/Semantics/OpenMP/threadprivate03.f902
-rw-r--r--flang/test/Semantics/OpenMP/threadprivate04.f902
-rw-r--r--flang/test/Semantics/OpenMP/threadprivate05.f902
-rw-r--r--flang/test/Semantics/OpenMP/threadprivate06.f902
-rw-r--r--flang/test/Semantics/OpenMP/threadprivate07.f902
-rw-r--r--flang/test/Semantics/OpenMP/use_device_addr.f902
-rw-r--r--flang/test/Semantics/OpenMP/use_device_addr1.f902
-rw-r--r--flang/test/Semantics/OpenMP/use_device_ptr.f902
-rw-r--r--flang/test/Semantics/OpenMP/use_device_ptr1.f902
-rw-r--r--flang/test/Semantics/OpenMP/workshare01.f902
-rw-r--r--flang/test/Semantics/OpenMP/workshare02.f902
-rw-r--r--flang/test/Semantics/OpenMP/workshare03.f902
-rw-r--r--flang/test/Semantics/OpenMP/workshare04.f902
-rw-r--r--flang/test/Semantics/OpenMP/workshare05.f902
-rw-r--r--flang/test/Semantics/bind-c12.f904
-rw-r--r--flang/test/Semantics/call05.f906
-rw-r--r--flang/test/Semantics/call39.f9023
-rw-r--r--flang/test/Semantics/modfile03.f9099
-rw-r--r--flang/test/Semantics/procinterface05.f9014
-rw-r--r--flang/test/Semantics/shape.f9010
-rw-r--r--flang/test/Transforms/debug-complex-1.fir39
-rw-r--r--flang/test/Transforms/debug-fixed-array-type.fir34
-rw-r--r--flang/test/Transforms/debug-module-1.fir40
-rw-r--r--flang/test/Transforms/debug-module-2.fir35
-rw-r--r--libc/cmake/modules/LLVMLibCObjectRules.cmake3
-rw-r--r--libc/config/baremetal/arm/entrypoints.txt4
-rw-r--r--libc/config/baremetal/riscv/entrypoints.txt4
-rw-r--r--libc/docs/ctype.rst25
-rw-r--r--libc/docs/fenv.rst117
-rw-r--r--libc/docs/signal.rst170
-rw-r--r--libc/docs/stdbit.rst166
-rw-r--r--libc/docs/threads.rst57
-rw-r--r--libc/src/__support/threads/CMakeLists.txt9
-rw-r--r--libc/src/__support/threads/CndVar.h52
-rw-r--r--libc/src/__support/threads/linux/CMakeLists.txt13
-rw-r--r--libc/src/__support/threads/linux/CndVar.cpp103
-rw-r--r--libc/src/threads/linux/CMakeLists.txt11
-rw-r--r--libc/src/threads/linux/CndVar.h148
-rw-r--r--libc/src/threads/linux/cnd_broadcast.cpp11
-rw-r--r--libc/src/threads/linux/cnd_destroy.cpp7
-rw-r--r--libc/src/threads/linux/cnd_init.cpp9
-rw-r--r--libc/src/threads/linux/cnd_signal.cpp10
-rw-r--r--libc/src/threads/linux/cnd_wait.cpp11
-rw-r--r--libc/startup/baremetal/CMakeLists.txt11
-rw-r--r--libc/startup/baremetal/fini.cpp27
-rw-r--r--libc/startup/baremetal/init.cpp32
-rw-r--r--libc/test/integration/scudo/CMakeLists.txt4
-rw-r--r--libc/utils/docgen/ctype.json28
-rwxr-xr-xlibc/utils/docgen/docgen.py189
-rw-r--r--libc/utils/docgen/fenv.json72
-rw-r--r--libc/utils/docgen/header.py87
-rw-r--r--libc/utils/docgen/signal.json145
-rw-r--r--libc/utils/docgen/stdbit.json176
-rw-r--r--libc/utils/docgen/threads.json54
-rw-r--r--libcxx/docs/ReleaseNotes/19.rst1
-rw-r--r--libcxx/docs/Status/Cxx20Issues.csv2
-rw-r--r--libcxx/docs/Status/Cxx20Papers.csv4
-rw-r--r--libcxx/docs/Status/Cxx23Issues.csv2
-rw-r--r--libcxx/docs/Status/ParallelismProjects.csv2
-rw-r--r--libcxx/include/CMakeLists.txt2
-rw-r--r--libcxx/include/__algorithm/copy_move_common.h1
-rw-r--r--libcxx/include/__algorithm/pstl_copy.h7
-rw-r--r--libcxx/include/__algorithm/pstl_count.h8
-rw-r--r--libcxx/include/__algorithm/pstl_equal.h10
-rw-r--r--libcxx/include/__algorithm/pstl_fill.h8
-rw-r--r--libcxx/include/__algorithm/pstl_find.h18
-rw-r--r--libcxx/include/__algorithm/pstl_generate.h6
-rw-r--r--libcxx/include/__algorithm/pstl_is_partitioned.h2
-rw-r--r--libcxx/include/__algorithm/pstl_merge.h27
-rw-r--r--libcxx/include/__algorithm/pstl_move.h1
-rw-r--r--libcxx/include/__algorithm/pstl_replace.h10
-rw-r--r--libcxx/include/__algorithm/pstl_sort.h16
-rw-r--r--libcxx/include/__atomic/atomic_ref.h360
-rw-r--r--libcxx/include/__atomic/atomic_sync.h1
-rw-r--r--libcxx/include/__atomic/check_memory_order.h4
-rw-r--r--libcxx/include/__atomic/cxx_atomic_impl.h27
-rw-r--r--libcxx/include/__atomic/to_gcc_order.h54
-rw-r--r--libcxx/include/__exception/exception_ptr.h17
-rw-r--r--libcxx/include/__locale4
-rw-r--r--libcxx/include/atomic1
-rw-r--r--libcxx/include/experimental/__simd/scalar.h7
-rw-r--r--libcxx/include/experimental/__simd/simd.h11
-rw-r--r--libcxx/include/experimental/__simd/simd_mask.h11
-rw-r--r--libcxx/include/experimental/__simd/vec_ext.h11
-rw-r--r--libcxx/include/forward_list1
-rw-r--r--libcxx/include/libcxx.imp869
-rw-r--r--libcxx/include/list1
-rw-r--r--libcxx/include/locale53
-rw-r--r--libcxx/include/module.modulemap10
-rw-r--r--libcxx/include/vector64
-rw-r--r--libcxx/modules/std/atomic.inc2
-rw-r--r--libcxx/test/libcxx/atomics/atomics.ref/assert.compare_exchange_strong.pass.cpp58
-rw-r--r--libcxx/test/libcxx/atomics/atomics.ref/assert.compare_exchange_weak.pass.cpp58
-rw-r--r--libcxx/test/libcxx/atomics/atomics.ref/assert.ctor.pass.cpp40
-rw-r--r--libcxx/test/libcxx/atomics/atomics.ref/assert.load.pass.cpp55
-rw-r--r--libcxx/test/libcxx/atomics/atomics.ref/assert.store.pass.cpp63
-rw-r--r--libcxx/test/libcxx/atomics/atomics.ref/assert.wait.pass.cpp55
-rw-r--r--libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp3
-rw-r--r--libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/pstl.exception_handling.pass.cpp58
-rw-r--r--libcxx/test/std/algorithms/alg.modifying.operations/alg.move/pstl.exception_handling.pass.cpp40
-rw-r--r--libcxx/test/std/algorithms/alg.modifying.operations/alg.replace/pstl.exception_handling.pass.cpp118
-rw-r--r--libcxx/test/std/algorithms/alg.modifying.operations/alg.rotate/pstl.exception_handling.pass.cpp43
-rw-r--r--libcxx/test/std/algorithms/alg.modifying.operations/alg.transform/pstl.exception_handling.pass.cpp73
-rw-r--r--libcxx/test/std/algorithms/alg.nonmodifying/alg.all_of/pstl.exception_handling.pass.cpp44
-rw-r--r--libcxx/test/std/algorithms/alg.nonmodifying/alg.any_of/pstl.exception_handling.pass.cpp44
-rw-r--r--libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/pstl.exception_handling.pass.cpp53
-rw-r--r--libcxx/test/std/algorithms/alg.nonmodifying/alg.find/pstl.exception_handling.pass.cpp87
-rw-r--r--libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/pstl.exception_handling.pass.cpp53
-rw-r--r--libcxx/test/std/algorithms/alg.nonmodifying/alg.none_of/pstl.exception_handling.pass.cpp44
-rw-r--r--libcxx/test/std/algorithms/alg.sorting/alg.merge/pstl.exception_handling.pass.cpp51
-rw-r--r--libcxx/test/std/algorithms/alg.sorting/alg.sort/stable.sort/pstl.exception_handling.pass.cpp41
-rw-r--r--libcxx/test/std/algorithms/numeric.ops/reduce/pstl.exception_handling.pass.cpp52
-rw-r--r--libcxx/test/std/algorithms/numeric.ops/transform.reduce/pstl.exception_handling.pass.cpp62
-rw-r--r--libcxx/test/std/algorithms/pstl.exception_handling.pass.cpp339
-rw-r--r--libcxx/test/std/atomics/atomics.ref/assign.pass.cpp50
-rw-r--r--libcxx/test/std/atomics/atomics.ref/bitwise_and_assign.pass.cpp60
-rw-r--r--libcxx/test/std/atomics/atomics.ref/bitwise_or_assign.pass.cpp56
-rw-r--r--libcxx/test/std/atomics/atomics.ref/bitwise_xor_assign.pass.cpp56
-rw-r--r--libcxx/test/std/atomics/atomics.ref/compare_exchange_strong.pass.cpp221
-rw-r--r--libcxx/test/std/atomics/atomics.ref/compare_exchange_weak.pass.cpp226
-rw-r--r--libcxx/test/std/atomics/atomics.ref/convert.pass.cpp45
-rw-r--r--libcxx/test/std/atomics/atomics.ref/ctor.pass.cpp37
-rw-r--r--libcxx/test/std/atomics/atomics.ref/deduction.pass.cpp33
-rw-r--r--libcxx/test/std/atomics/atomics.ref/exchange.pass.cpp45
-rw-r--r--libcxx/test/std/atomics/atomics.ref/fetch_add.pass.cpp113
-rw-r--r--libcxx/test/std/atomics/atomics.ref/fetch_and.pass.cpp69
-rw-r--r--libcxx/test/std/atomics/atomics.ref/fetch_or.pass.cpp68
-rw-r--r--libcxx/test/std/atomics/atomics.ref/fetch_sub.pass.cpp113
-rw-r--r--libcxx/test/std/atomics/atomics.ref/fetch_xor.pass.cpp68
-rw-r--r--libcxx/test/std/atomics/atomics.ref/increment_decrement.pass.cpp97
-rw-r--r--libcxx/test/std/atomics/atomics.ref/is_always_lock_free.pass.cpp71
-rw-r--r--libcxx/test/std/atomics/atomics.ref/load.pass.cpp62
-rw-r--r--libcxx/test/std/atomics/atomics.ref/member_types.pass.cpp132
-rw-r--r--libcxx/test/std/atomics/atomics.ref/notify_all.pass.cpp78
-rw-r--r--libcxx/test/std/atomics/atomics.ref/notify_one.pass.cpp46
-rw-r--r--libcxx/test/std/atomics/atomics.ref/operator_minus_equals.pass.cpp79
-rw-r--r--libcxx/test/std/atomics/atomics.ref/operator_plus_equals.pass.cpp79
-rw-r--r--libcxx/test/std/atomics/atomics.ref/required_alignment.pass.cpp39
-rw-r--r--libcxx/test/std/atomics/atomics.ref/requires-trivially-copyable.verify.cpp26
-rw-r--r--libcxx/test/std/atomics/atomics.ref/store.pass.cpp61
-rw-r--r--libcxx/test/std/atomics/atomics.ref/test_helper.h136
-rw-r--r--libcxx/test/std/atomics/atomics.ref/wait.pass.cpp88
-rw-r--r--libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/types.compile.pass.cpp3
-rw-r--r--libcxx/test/std/containers/associative/map/map.value_compare/types.pass.cpp2
-rw-r--r--libcxx/test/std/containers/associative/multimap/multimap.value_compare/types.pass.cpp2
-rw-r--r--libcxx/test/std/experimental/simd/simd.class/simd_copy.pass.cpp173
-rw-r--r--libcxx/test/std/experimental/simd/simd.mask.class/simd_mask_copy.pass.cpp127
-rw-r--r--libcxx/test/std/iterators/predef.iterators/counted.iterator/implicit_ctad.pass.cpp2
-rw-r--r--libcxx/test/std/iterators/predef.iterators/insert.iterators/back.insert.iterator/implicit_ctad.pass.cpp2
-rw-r--r--libcxx/test/std/iterators/predef.iterators/insert.iterators/front.insert.iterator/implicit_ctad.pass.cpp2
-rw-r--r--libcxx/test/std/iterators/predef.iterators/move.iterators/move.iterator/implicit_ctad.pass.cpp2
-rw-r--r--libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array14.pass.cpp8
-rw-r--r--libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete14.pass.cpp8
-rw-r--r--libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_double.pass.cpp161
-rw-r--r--libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float.pass.cpp161
-rw-r--r--libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long_double.pass.cpp161
-rw-r--r--libcxx/test/std/numerics/numeric.ops/reduce/pstl.reduce.pass.cpp (renamed from libcxx/test/std/algorithms/numeric.ops/reduce/pstl.reduce.pass.cpp)2
-rw-r--r--libcxx/test/std/numerics/numeric.ops/transform.reduce/pstl.transform_reduce.binary.pass.cpp (renamed from libcxx/test/std/algorithms/numeric.ops/transform.reduce/pstl.transform_reduce.binary.pass.cpp)2
-rw-r--r--libcxx/test/std/numerics/numeric.ops/transform.reduce/pstl.transform_reduce.unary.pass.cpp (renamed from libcxx/test/std/algorithms/numeric.ops/transform.reduce/pstl.transform_reduce.unary.pass.cpp)2
-rw-r--r--libcxx/test/std/strings/string.view/string.view.deduct/implicit.pass.cpp2
-rw-r--r--libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_for.pass.cpp146
-rw-r--r--libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_for_pred.pass.cpp200
-rw-r--r--libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_pred.pass.cpp128
-rw-r--r--libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_until.pass.cpp175
-rw-r--r--libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_until_pred.pass.cpp213
-rw-r--r--libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_for.pass.cpp161
-rw-r--r--libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_for_pred.pass.cpp212
-rw-r--r--libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_pred.pass.cpp133
-rw-r--r--libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_until.pass.cpp179
-rw-r--r--libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_until_pred.pass.cpp241
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/implicit_ctad.pass.cpp2
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/implicit_ctad.pass.cpp2
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/implicit_ctad.pass.cpp2
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.cons/mutex.pass.cpp132
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/lock.pass.cpp140
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/try_lock.pass.cpp135
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/implicit_ctad.pass.cpp2
-rw-r--r--libcxx/test/std/utilities/function.objects/func.search/func.search.bm/implicit_ctad.pass.cpp2
-rw-r--r--libcxx/test/std/utilities/function.objects/func.search/func.search.bmh/implicit_ctad.pass.cpp2
-rw-r--r--libcxx/test/std/utilities/function.objects/func.search/func.search.default/implicit_ctad.pass.cpp2
-rw-r--r--libcxx/test/std/utilities/function.objects/operations.implicit_ctad.pass.cpp2
-rw-r--r--libcxx/test/std/utilities/utility/mem.res/mem.res.global/new_delete_resource.pass.cpp2
-rw-r--r--libcxx/utils/libcxx/test/features.py35
-rw-r--r--libcxxabi/include/cxxabi.h8
-rw-r--r--libcxxabi/src/cxa_exception.cpp7
-rw-r--r--libcxxabi/src/cxa_exception.h2
-rw-r--r--libcxxabi/src/cxa_personality.cpp36
-rw-r--r--libunwind/include/__libunwind_config.h4
-rw-r--r--libunwind/src/Unwind-wasm.c4
-rw-r--r--libunwind/src/UnwindCursor.hpp2
-rw-r--r--libunwind/src/UnwindLevel1.c3
-rw-r--r--libunwind/src/UnwindRegistersRestore.S4
-rw-r--r--libunwind/src/UnwindRegistersSave.S4
-rw-r--r--libunwind/src/libunwind.cpp5
-rw-r--r--lld/ELF/Arch/AVR.cpp3
-rw-r--r--lld/ELF/Config.h5
-rw-r--r--lld/ELF/Driver.cpp36
-rw-r--r--lld/ELF/Options.td1
-rw-r--r--lld/ELF/OutputSections.cpp19
-rw-r--r--lld/docs/ld.lld.16
-rw-r--r--lld/test/ELF/aarch64-feature-gcs.s134
-rw-r--r--lld/test/ELF/avr-reloc-error.s5
-rw-r--r--lld/test/ELF/avr-reloc.s12
-rw-r--r--lld/test/ELF/compress-debug-sections-zstd.s29
-rw-r--r--lld/test/ELF/compress-sections-special.s4
-rw-r--r--lld/test/ELF/compress-sections.s24
-rw-r--r--lld/test/ELF/compressed-debug-level.test6
-rw-r--r--lld/test/ELF/linkerscript/compress-debug-sections.s2
-rw-r--r--lld/test/ELF/linkerscript/compress-sections.s8
-rw-r--r--lld/test/wasm/shared64.s14
-rw-r--r--lld/wasm/Driver.cpp10
-rw-r--r--lld/wasm/Symbols.cpp2
-rw-r--r--lld/wasm/Symbols.h5
-rw-r--r--lld/wasm/SyntheticSections.cpp8
-rw-r--r--lld/wasm/Writer.cpp10
-rw-r--r--lldb/cmake/modules/LLDBConfig.cmake20
-rw-r--r--lldb/docs/resources/build.rst1
-rw-r--r--lldb/include/lldb/Symbol/CompilerType.h2
-rw-r--r--lldb/include/lldb/Symbol/TypeSystem.h2
-rw-r--r--lldb/include/lldb/Target/Process.h4
-rw-r--r--lldb/packages/Python/lldbsuite/test/dotest.py25
-rw-r--r--lldb/source/Breakpoint/BreakpointResolverFileLine.cpp10
-rw-r--r--lldb/source/Commands/CommandObjectThread.cpp4
-rw-r--r--lldb/source/Core/CMakeLists.txt3
-rw-r--r--lldb/source/Core/ValueObject.cpp31
-rw-r--r--lldb/source/Core/ValueObjectConstResultImpl.cpp12
-rw-r--r--lldb/source/Host/common/Socket.cpp3
-rw-r--r--lldb/source/Interpreter/CommandInterpreter.cpp10
-rw-r--r--lldb/source/Interpreter/Options.cpp29
-rw-r--r--lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp13
-rw-r--r--lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp18
-rw-r--r--lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp14
-rw-r--r--lldb/source/Plugins/Process/NetBSD/NativeThreadNetBSD.cpp4
-rw-r--r--lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp71
-rw-r--r--lldb/source/Plugins/Process/elf-core/ProcessElfCore.h10
-rw-r--r--lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp9
-rw-r--r--lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.h5
-rw-r--r--lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp40
-rw-r--r--lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h2
-rw-r--r--lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp6
-rw-r--r--lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp23
-rw-r--r--lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h2
-rw-r--r--lldb/source/Symbol/CompilerType.cpp5
-rw-r--r--lldb/source/Symbol/Symbol.cpp18
-rw-r--r--lldb/source/Symbol/SymbolFileOnDemand.cpp5
-rw-r--r--lldb/source/Symbol/TypeSystem.cpp32
-rw-r--r--lldb/source/Target/Target.cpp36
-rw-r--r--lldb/source/Utility/Status.cpp3
-rw-r--r--lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommand.py18
-rw-r--r--lldb/test/API/functionalities/thread/exit_during_expression/main.c2
-rw-r--r--lldb/test/API/lang/c/enum_types/TestEnumTypes.py6
-rw-r--r--lldb/test/API/python_api/debugger/TestDebuggerAPI.py5
-rw-r--r--lldb/test/API/tools/lldb-dap/attach/TestDAP_attach.py2
-rw-r--r--lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_logpoints.py4
-rw-r--r--lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py4
-rw-r--r--lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setExceptionBreakpoints.py1
-rw-r--r--lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setFunctionBreakpoints.py2
-rw-r--r--lldb/test/API/tools/lldb-dap/commands/TestDAP_commands.py2
-rw-r--r--lldb/test/API/tools/lldb-dap/completions/TestDAP_completions.py1
-rw-r--r--lldb/test/API/tools/lldb-dap/console/TestDAP_console.py5
-rw-r--r--lldb/test/API/tools/lldb-dap/console/TestDAP_redirection_to_console.py1
-rw-r--r--lldb/test/API/tools/lldb-dap/coreFile/TestDAP_coreFile.py2
-rw-r--r--lldb/test/API/tools/lldb-dap/databreakpoint/TestDAP_setDataBreakpoints.py3
-rw-r--r--lldb/test/API/tools/lldb-dap/disassemble/TestDAP_disassemble.py1
-rw-r--r--lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py2
-rw-r--r--lldb/test/API/tools/lldb-dap/evaluate/TestDAP_evaluate.py5
-rw-r--r--lldb/test/API/tools/lldb-dap/exception/TestDAP_exception.py1
-rw-r--r--lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py14
-rw-r--r--lldb/test/API/tools/lldb-dap/module/TestDAP_module.py3
-rw-r--r--lldb/test/API/tools/lldb-dap/optimized/TestDAP_optimized.py2
-rw-r--r--lldb/test/API/tools/lldb-dap/restart/TestDAP_restart.py3
-rw-r--r--lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_runInTerminal.py2
-rw-r--r--lldb/test/API/tools/lldb-dap/runInTerminal/TestDAP_runInTerminal.py7
-rw-r--r--lldb/test/API/tools/lldb-dap/stackTrace/TestDAP_stackTrace.py2
-rw-r--r--lldb/test/API/tools/lldb-dap/stackTraceMissingFunctionName/TestDAP_stackTraceMissingFunctionName.py1
-rw-r--r--lldb/test/API/tools/lldb-dap/startDebugging/TestDAP_startDebugging.py1
-rw-r--r--lldb/test/API/tools/lldb-dap/step/TestDAP_step.py1
-rw-r--r--lldb/test/API/tools/lldb-dap/stop-hooks/TestDAP_stop_hooks.py1
-rw-r--r--lldb/test/API/tools/lldb-dap/terminated-event/TestDAP_terminatedEvent.py1
-rw-r--r--lldb/test/API/tools/lldb-dap/threads/TestDAP_threads.py2
-rw-r--r--lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py8
-rw-r--r--lldb/test/Shell/SymbolFile/DWARF/x86/invalid_abbreviation.s47
-rw-r--r--lldb/tools/lldb-dap/DAP.h2
-rw-r--r--lldb/tools/lldb-dap/lldb-dap.cpp13
-rw-r--r--llvm/CMakeLists.txt2
-rw-r--r--llvm/cmake/config-ix.cmake33
-rw-r--r--llvm/cmake/modules/AddLLVM.cmake14
-rw-r--r--llvm/cmake/modules/FindTerminfo.cmake55
-rw-r--r--llvm/cmake/modules/HandleLLVMOptions.cmake147
-rw-r--r--llvm/cmake/modules/LLVMConfig.cmake.in5
-rw-r--r--llvm/docs/AMDGPUUsage.rst2
-rw-r--r--llvm/docs/GettingInvolved.rst5
-rw-r--r--llvm/docs/LangRef.rst4
-rw-r--r--llvm/docs/ReleaseNotes.rst13
-rw-r--r--llvm/docs/SPIRVUsage.rst8
-rw-r--r--llvm/include/llvm/Analysis/CFG.h12
-rw-r--r--llvm/include/llvm/Analysis/TargetTransformInfo.h2
-rw-r--r--llvm/include/llvm/Analysis/VecFuncs.def16
-rw-r--r--llvm/include/llvm/BinaryFormat/ELF.h4
-rw-r--r--llvm/include/llvm/Bitcode/BitcodeWriter.h9
-rw-r--r--llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h3
-rw-r--r--llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h11
-rw-r--r--llvm/include/llvm/CodeGen/MachineInstr.h6
-rw-r--r--llvm/include/llvm/CodeGen/ValueTypes.h6
-rw-r--r--llvm/include/llvm/CodeGen/ValueTypes.td5
-rw-r--r--llvm/include/llvm/CodeGenTypes/MachineValueType.h8
-rw-r--r--llvm/include/llvm/Config/config.h.cmake3
-rw-r--r--llvm/include/llvm/Frontend/OpenMP/ClauseT.h2
-rw-r--r--llvm/include/llvm/IR/ConstantRange.h9
-rw-r--r--llvm/include/llvm/IR/IRBuilder.h7
-rw-r--r--llvm/include/llvm/IR/IntrinsicsAMDGPU.td36
-rw-r--r--llvm/include/llvm/IR/IntrinsicsSPIRV.td1
-rw-r--r--llvm/include/llvm/IR/IntrinsicsWebAssembly.td8
-rw-r--r--llvm/include/llvm/IR/IntrinsicsX86.td84
-rw-r--r--llvm/include/llvm/IR/VPIntrinsics.def4
-rw-r--r--llvm/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h5
-rw-r--r--llvm/include/llvm/MCA/InstrBuilder.h3
-rw-r--r--llvm/include/llvm/Object/ObjectFile.h1
-rw-r--r--llvm/include/llvm/Option/ArgList.h8
-rw-r--r--llvm/include/llvm/ProfileData/InstrProf.h30
-rw-r--r--llvm/include/llvm/ProfileData/InstrProfReader.h2
-rw-r--r--llvm/include/llvm/ProfileData/InstrProfWriter.h3
-rw-r--r--llvm/include/llvm/Support/Error.h24
-rw-r--r--llvm/include/llvm/Target/GlobalISel/Combine.td85
-rw-r--r--llvm/include/llvm/TargetParser/X86TargetParser.def33
-rw-r--r--llvm/include/llvm/Transforms/IPO/FunctionImport.h6
-rw-r--r--llvm/lib/Analysis/CFG.cpp74
-rw-r--r--llvm/lib/Analysis/LoopAccessAnalysis.cpp99
-rw-r--r--llvm/lib/Analysis/ScalarEvolution.cpp21
-rw-r--r--llvm/lib/Analysis/TargetLibraryInfo.cpp10
-rw-r--r--llvm/lib/Analysis/TargetTransformInfo.cpp2
-rw-r--r--llvm/lib/Bitcode/Writer/BitcodeWriter.cpp38
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp4
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h4
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp73
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h10
-rw-r--r--llvm/lib/CodeGen/AtomicExpandPass.cpp34
-rw-r--r--llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp64
-rw-r--r--llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp7
-rw-r--r--llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp6
-rw-r--r--llvm/lib/CodeGen/GlobalISel/Utils.cpp24
-rw-r--r--llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp12
-rw-r--r--llvm/lib/CodeGen/LiveRangeEdit.cpp2
-rw-r--r--llvm/lib/CodeGen/MachineScheduler.cpp21
-rw-r--r--llvm/lib/CodeGen/RegisterPressure.cpp6
-rw-r--r--llvm/lib/CodeGen/ScheduleDAG.cpp4
-rw-r--r--llvm/lib/CodeGen/SelectOptimize.cpp82
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp28
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp7
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp18
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp8
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp7
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp120
-rw-r--r--llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp2
-rw-r--r--llvm/lib/CodeGen/ValueTypes.cpp17
-rw-r--r--llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp4
-rw-r--r--llvm/lib/IR/ConstantRange.cpp22
-rw-r--r--llvm/lib/IR/IRBuilder.cpp4
-rw-r--r--llvm/lib/IR/MDBuilder.cpp14
-rw-r--r--llvm/lib/IR/Module.cpp2
-rw-r--r--llvm/lib/LTO/LTO.cpp8
-rw-r--r--llvm/lib/LTO/ThinLTOCodeGenerator.cpp10
-rw-r--r--llvm/lib/MCA/InstrBuilder.cpp20
-rw-r--r--llvm/lib/ProfileData/InstrProf.cpp98
-rw-r--r--llvm/lib/ProfileData/InstrProfReader.cpp150
-rw-r--r--llvm/lib/ProfileData/InstrProfWriter.cpp71
-rw-r--r--llvm/lib/ProfileData/MemProf.cpp6
-rw-r--r--llvm/lib/ProfileData/MemProfReader.cpp18
-rw-r--r--llvm/lib/Support/CMakeLists.txt11
-rw-r--r--llvm/lib/Support/Error.cpp5
-rw-r--r--llvm/lib/Support/Unix/Process.inc60
-rw-r--r--llvm/lib/Support/raw_socket_stream.cpp23
-rw-r--r--llvm/lib/Target/AArch64/AArch64CallingConvention.cpp29
-rw-r--r--llvm/lib/Target/AArch64/AArch64Combine.td3
-rw-r--r--llvm/lib/Target/AArch64/AArch64Features.td46
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp29
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.td2
-rw-r--r--llvm/lib/Target/AArch64/AArch64PointerAuth.cpp10
-rw-r--r--llvm/lib/Target/AArch64/AArch64PointerAuth.h12
-rw-r--r--llvm/lib/Target/AArch64/AArch64Subtarget.cpp27
-rw-r--r--llvm/lib/Target/AArch64/AArch64Subtarget.h21
-rw-r--r--llvm/lib/Target/AArch64/AArch64SystemOperands.td2
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetMachine.cpp27
-rw-r--r--llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp3
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp49
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp32
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h6
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp11
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp17
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp38
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp65
-rw-r--r--llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp51
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSubtarget.h3
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp12
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h9
-rw-r--r--llvm/lib/Target/AMDGPU/SIDefines.h2
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp40
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.h2
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.cpp51
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp45
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h3
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h35
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp540
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h79
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt1
-rw-r--r--llvm/lib/Target/Mips/Mips32r6InstrInfo.td14
-rw-r--r--llvm/lib/Target/Mips/MipsISelLowering.cpp14
-rw-r--r--llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp22
-rw-r--r--llvm/lib/Target/PowerPC/PPCFastISel.cpp14
-rw-r--r--llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp29
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrInfo.cpp6
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrInfo.td2
-rw-r--r--llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp4
-rw-r--r--llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp15
-rw-r--r--llvm/lib/Target/RISCV/RISCVFeatures.td3
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.cpp14
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.td4
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoV.td5
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td13
-rw-r--r--llvm/lib/Target/RISCV/RISCVProcessors.td15
-rw-r--r--llvm/lib/Target/RISCV/RISCVSchedSiFive7.td4
-rw-r--r--llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td1
-rw-r--r--llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td5
-rw-r--r--llvm/lib/Target/RISCV/RISCVScheduleV.td6
-rw-r--r--llvm/lib/Target/RISCV/RISCVSubtarget.h4
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp8
-rw-r--r--llvm/lib/Target/SPIRV/CMakeLists.txt1
-rw-r--r--llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp13
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp35
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVBuiltins.td9
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp4
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp23
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp22
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVISelLowering.h9
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVInlineAsmLowering.cpp46
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVInlineAsmLowering.h33
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp13
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVInstrInfo.h1
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVInstrInfo.td15
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp16
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp3
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp150
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVRegisterBanks.td2
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVRegisterInfo.td8
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp1
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVSubtarget.h6
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td5
-rw-r--r--llvm/lib/Target/VE/VVPNodes.def4
-rw-r--r--llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h2
-rw-r--r--llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp1
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp4
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp12
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp17
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td24
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td5
-rw-r--r--llvm/lib/Target/X86/X86.td12
-rw-r--r--llvm/lib/Target/X86/X86FixupBWInsts.cpp7
-rw-r--r--llvm/lib/Target/X86/X86FlagsCopyLowering.cpp500
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp145
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.h12
-rw-r--r--llvm/lib/Target/X86/X86Instr3DNow.td3
-rw-r--r--llvm/lib/Target/X86/X86InstrAVX512.td91
-rw-r--r--llvm/lib/Target/X86/X86InstrFragments.td8
-rw-r--r--llvm/lib/Target/X86/X86InstrFragmentsSIMD.td11
-rw-r--r--llvm/lib/Target/X86/X86InstrPredicates.td3
-rw-r--r--llvm/lib/Target/X86/X86IntrinsicsInfo.h27
-rw-r--r--llvm/lib/Target/X86/X86Subtarget.h8
-rw-r--r--llvm/lib/TargetParser/Host.cpp9
-rw-r--r--llvm/lib/TargetParser/RISCVISAInfo.cpp8
-rw-r--r--llvm/lib/TargetParser/X86TargetParser.cpp13
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroElide.cpp6
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroFrame.cpp82
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroSplit.cpp2
-rw-r--r--llvm/lib/Transforms/IPO/Attributor.cpp2
-rw-r--r--llvm/lib/Transforms/IPO/AttributorAttributes.cpp10
-rw-r--r--llvm/lib/Transforms/IPO/FunctionImport.cpp5
-rw-r--r--llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp16
-rw-r--r--llvm/lib/Transforms/IPO/OpenMPOpt.cpp4
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp3
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp11
-rw-r--r--llvm/lib/Transforms/InstCombine/InstructionCombining.cpp33
-rw-r--r--llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp8
-rw-r--r--llvm/lib/Transforms/Scalar/ConstraintElimination.cpp6
-rw-r--r--llvm/lib/Transforms/Utils/SCCPSolver.cpp21
-rw-r--r--llvm/lib/Transforms/Utils/SimplifyCFG.cpp40
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp53
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp239
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.cpp59
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.h17
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp11
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.h3
-rw-r--r--llvm/test/Analysis/CostModel/AArch64/cast.ll2
-rw-r--r--llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll32
-rw-r--r--llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll1621
-rw-r--r--llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll56
-rw-r--r--llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll37
-rw-r--r--llvm/test/Analysis/LoopAccessAnalysis/pr64637.ll30
-rw-r--r--llvm/test/Analysis/LoopAccessAnalysis/select-dependence.ll55
-rw-r--r--llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll346
-rw-r--r--llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll121
-rw-r--r--llvm/test/Analysis/ScalarEvolution/exit-count-non-strict.ll50
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/combine-integer.mir252
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir42
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-vhadd.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/bitfield-insert.ll11
-rw-r--r--llvm/test/CodeGen/AArch64/hadd-combine.ll67
-rw-r--r--llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll146
-rw-r--r--llvm/test/CodeGen/AArch64/neon-dotreduce.ll536
-rw-r--r--llvm/test/CodeGen/AArch64/pr58431.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/selectopt-not.ll326
-rw-r--r--llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll197
-rw-r--r--llvm/test/CodeGen/AArch64/sve-calling-convention.ll124
-rw-r--r--llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll8
-rw-r--r--llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/sve-pr92779.ll36
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll6
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll6
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll6
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll6
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll6
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll6
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll6
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/trunc-to-tbl.ll118
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir89
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll43
-rw-r--r--llvm/test/CodeGen/AMDGPU/call-defs-mode-register.ll57
-rw-r--r--llvm/test/CodeGen/AMDGPU/dpp_combine.ll15
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmaximum3.ll3349
-rw-r--r--llvm/test/CodeGen/AMDGPU/fminimum3.ll3349
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp_to_sint.ll395
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp_to_uint.ll395
-rw-r--r--llvm/test/CodeGen/AMDGPU/kernel_code_t_recurse.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll34
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.exp.ll1592
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.exp10.ll1592
-rw-r--r--llvm/test/CodeGen/AMDGPU/sad.ll369
-rw-r--r--llvm/test/CodeGen/AMDGPU/shl.ll216
-rw-r--r--llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/trap-abis.ll150
-rw-r--r--llvm/test/CodeGen/Mips/mipsr6-minmaxnum.ll32
-rw-r--r--llvm/test/CodeGen/Mips/msa/f16-llvm-ir.ll73
-rw-r--r--llvm/test/CodeGen/PowerPC/aix-tocdata-fastisel.ll15
-rw-r--r--llvm/test/CodeGen/PowerPC/ctrloop-le.ll15
-rw-r--r--llvm/test/CodeGen/PowerPC/toc-data-no-data-sections.ll18
-rw-r--r--llvm/test/CodeGen/PowerPC/toc-data.ll75
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv32.ll60
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv64.ll36
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-constbarrier-rv32.mir32
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-constbarrier-rv64.mir33
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-div-rv32.mir90
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-div-rv64.mir90
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-frem-rv32.mir130
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-frem-rv64.mir130
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-lshr-rv64.mir26
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv32.mir404
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv64.mir358
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/libcalls.ll51
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/shift.ll48
-rw-r--r--llvm/test/CodeGen/RISCV/attributes.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/loop-strength-reduce-loop-invar.ll14
-rw-r--r--llvm/test/CodeGen/RISCV/mul.ll149
-rw-r--r--llvm/test/CodeGen/RISCV/pr90730.ll22
-rw-r--r--llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll12
-rw-r--r--llvm/test/CodeGen/RISCV/rv64zba.ll12
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll (renamed from llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-costrained-sdnode.ll)0
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll (renamed from llvm/test/CodeGen/RISCV/rvv/fround-costrained-sdnode.ll)0
-rw-r--r--llvm/test/CodeGen/RISCV/sextw-removal.ll19
-rw-r--r--llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_inline_assembly/inline_asm.ll93
-rw-r--r--llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll59
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll124
-rw-r--r--llvm/test/CodeGen/WebAssembly/fast-isel-call-indirect64.ll14
-rw-r--r--llvm/test/CodeGen/WebAssembly/function-pointer64.ll5
-rw-r--r--llvm/test/CodeGen/WebAssembly/half-precision.ll20
-rw-r--r--llvm/test/CodeGen/X86/abds-vector-128.ll350
-rw-r--r--llvm/test/CodeGen/X86/abds-vector-256.ll72
-rw-r--r--llvm/test/CodeGen/X86/abdu-vector-128.ll280
-rw-r--r--llvm/test/CodeGen/X86/abdu-vector-256.ll72
-rw-r--r--llvm/test/CodeGen/X86/avx512-cmp-kor-sequence.ll2
-rw-r--r--llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll24
-rw-r--r--llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll24
-rw-r--r--llvm/test/CodeGen/X86/avx512er-intrinsics.ll306
-rw-r--r--llvm/test/CodeGen/X86/coalescer-add-implicit-def-subreg-to-reg-regression.ll45
-rw-r--r--llvm/test/CodeGen/X86/combine-srem.ll4
-rw-r--r--llvm/test/CodeGen/X86/crc32-target-feature.ll4
-rw-r--r--llvm/test/CodeGen/X86/fat-lto-section.ll2
-rw-r--r--llvm/test/CodeGen/X86/freeze-binary.ll6
-rw-r--r--llvm/test/CodeGen/X86/funnel-shift.ll821
-rw-r--r--llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.ll7
-rw-r--r--llvm/test/CodeGen/X86/issue76416.ll78
-rw-r--r--llvm/test/CodeGen/X86/midpoint-int-vec-128.ll669
-rw-r--r--llvm/test/CodeGen/X86/midpoint-int-vec-256.ll154
-rw-r--r--llvm/test/CodeGen/X86/misched-critical-path.ll35
-rw-r--r--llvm/test/CodeGen/X86/opt-pipeline.ll2
-rw-r--r--llvm/test/CodeGen/X86/pmul.ll11
-rw-r--r--llvm/test/CodeGen/X86/pr59305.ll69
-rw-r--r--llvm/test/CodeGen/X86/pr90703.ll21
-rw-r--r--llvm/test/CodeGen/X86/pr90844.ll17
-rw-r--r--llvm/test/CodeGen/X86/pr92720.ll15
-rw-r--r--llvm/test/CodeGen/X86/pr93000.ll44
-rw-r--r--llvm/test/CodeGen/X86/prefetch.ll17
-rw-r--r--llvm/test/CodeGen/X86/shrink_vmul.ll4
-rw-r--r--llvm/test/CodeGen/X86/speculative-load-hardening-gather.ll22
-rw-r--r--llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll1
-rw-r--r--llvm/test/CodeGen/X86/stack-frame-layout-remarks.ll2
-rw-r--r--llvm/test/CodeGen/X86/unfoldMemoryOperand.mir2
-rw-r--r--llvm/test/DebugInfo/X86/debug-names-types.ll24
-rw-r--r--llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll15
-rw-r--r--llvm/test/Linker/darwin-target-variant.ll42
-rw-r--r--llvm/test/MC/AArch64/FP8/system-regs.s22
-rw-r--r--llvm/test/MC/AArch64/SVE/condition-codes.s (renamed from llvm/test/MC/AArch64/SVE/condtion-codes.s)0
-rw-r--r--llvm/test/MC/AArch64/SVE/sqdecd-diagnostics.s4
-rw-r--r--llvm/test/MC/AArch64/SVE/sqincp-diagnostics.s32
-rw-r--r--llvm/test/MC/AMDGPU/amd_kernel_code_t.s171
-rw-r--r--llvm/test/MC/MachO/darwin-target-variant-reverse.ll2
-rw-r--r--llvm/test/MC/MachO/darwin-target-variant.ll2
-rw-r--r--llvm/test/MC/RISCV/attribute-arch.s2
-rw-r--r--llvm/test/MC/WebAssembly/simd-encodings.s6
-rw-r--r--llvm/test/ThinLTO/X86/import_callee_declaration.ll74
-rw-r--r--llvm/test/ThinLTO/X86/memprof-tailcall-nonunique.ll41
-rw-r--r--llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll112
-rw-r--r--llvm/test/Transforms/Attributor/issue87856.ll61
-rw-r--r--llvm/test/Transforms/ConstraintElimination/sext-unsigned-predicates.ll6
-rw-r--r--llvm/test/Transforms/ConstraintElimination/transfer-signed-facts-to-unsigned.ll6
-rw-r--r--llvm/test/Transforms/Coroutines/coro-await-suspend-handle-in-ramp.ll59
-rw-r--r--llvm/test/Transforms/Coroutines/coro-debug-frame-variable-inlined.ll (renamed from llvm/test/Transforms/Coroutines/coro-debug-frame-variable-O1.ll)4
-rw-r--r--llvm/test/Transforms/Coroutines/coro-lifetime-end.ll142
-rw-r--r--llvm/test/Transforms/Coroutines/no-suspend.ll2
-rw-r--r--llvm/test/Transforms/CorrelatedValuePropagation/mul.ll6
-rw-r--r--llvm/test/Transforms/EntryExitInstrumenter/mcount-aix.ll12
-rw-r--r--llvm/test/Transforms/EntryExitInstrumenter/mcount.ll157
-rw-r--r--llvm/test/Transforms/FunctionAttrs/nocapture.ll8
-rw-r--r--llvm/test/Transforms/FunctionAttrs/nonnull.ll166
-rw-r--r--llvm/test/Transforms/FunctionAttrs/norecurse.ll31
-rw-r--r--llvm/test/Transforms/FunctionAttrs/read-write-scc.ll4
-rw-r--r--llvm/test/Transforms/FunctionAttrs/willreturn.ll10
-rw-r--r--llvm/test/Transforms/IndVarSimplify/AArch64/widen-loop-comp.ll26
-rw-r--r--llvm/test/Transforms/InstCombine/load-cmp.ll17
-rw-r--r--llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll4
-rw-r--r--llvm/test/Transforms/LoopStrengthReduce/X86/pr40514.ll2
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll271
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/streaming-compatible-sve-no-maximize-bandwidth.ll2
-rw-r--r--llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll36
-rw-r--r--llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization-profitability.ll8
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll70
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll118
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll66
-rw-r--r--llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll189
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/pr23997.ll2
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/pr54634.ll2
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll2
-rw-r--r--llvm/test/Transforms/MemProfContextDisambiguation/tailcall-nonunique.ll35
-rw-r--r--llvm/test/Transforms/OpenMP/custom_state_machines.ll2
-rw-r--r--llvm/test/Transforms/OpenMP/custom_state_machines_pre_lto.ll2
-rw-r--r--llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll10
-rw-r--r--llvm/test/Transforms/OpenMP/spmdization.ll4
-rw-r--r--llvm/test/Transforms/OpenMP/spmdization_guarding.ll4
-rw-r--r--llvm/test/Transforms/OpenMP/spmdization_remarks.ll14
-rw-r--r--llvm/test/Transforms/SCCP/ip-add-range-to-call.ll2
-rw-r--r--llvm/test/Transforms/SCCP/range-mul-nuw-nsw-flags.ll26
-rw-r--r--llvm/test/Transforms/SCCP/range-with-undef.ll118
-rw-r--r--llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll64
-rw-r--r--llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll64
-rw-r--r--llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll13
-rw-r--r--llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll46
-rw-r--r--llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll130
-rw-r--r--llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll144
-rw-r--r--llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll30
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll28
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll28
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll7
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll74
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/hadd.ll74
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/scalarazied-result.ll4
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/vector_gep.ll2
-rw-r--r--llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/streaming-compatible-expand-masked-gather-scatter.ll2
-rw-r--r--llvm/test/Transforms/SimplifyCFG/switch-dead-default-lookup-table.ll61
-rw-r--r--llvm/test/Transforms/SimplifyCFG/switch-dead-default.ll89
-rw-r--r--llvm/test/Transforms/Util/add-TLI-mappings.ll8
-rw-r--r--llvm/test/tools/llvm-driver/symlink-call.test2
-rw-r--r--llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s791
-rw-r--r--llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s812
-rw-r--r--llvm/test/tools/llvm-mca/X86/call-latency.s58
-rw-r--r--llvm/test/tools/llvm-objcopy/tool-options.test6
-rw-r--r--llvm/test/tools/llvm-profdata/show-order-error.proftext27
-rw-r--r--llvm/test/tools/llvm-profdata/show-order.proftext11
-rw-r--r--llvm/test/tools/llvm-profgen/profile-density.test16
-rw-r--r--llvm/test/tools/llvm-readobj/ELF/note-core-ntfile.test53
-rw-r--r--llvm/tools/llvm-cxxfilt/CMakeLists.txt4
-rw-r--r--llvm/tools/llvm-lto/llvm-lto.cpp5
-rw-r--r--llvm/tools/llvm-mca/llvm-mca.cpp7
-rw-r--r--llvm/tools/llvm-objcopy/ObjcopyOptions.cpp6
-rw-r--r--llvm/tools/llvm-profdata/llvm-profdata.cpp43
-rw-r--r--llvm/tools/llvm-profgen/PerfReader.cpp2
-rw-r--r--llvm/tools/llvm-profgen/ProfileGenerator.cpp147
-rw-r--r--llvm/tools/llvm-profgen/ProfileGenerator.h9
-rw-r--r--llvm/tools/llvm-readobj/ELFDumper.cpp3
-rw-r--r--llvm/unittests/IR/ConstantRangeTest.cpp102
-rw-r--r--llvm/unittests/IR/MDBuilderTest.cpp39
-rw-r--r--llvm/unittests/ProfileData/BPFunctionNodeTest.cpp33
-rw-r--r--llvm/unittests/Support/LEB128Test.cpp20
-rw-r--r--llvm/unittests/Support/raw_socket_stream_test.cpp19
-rw-r--r--llvm/unittests/TargetParser/TargetParserTest.cpp6
-rw-r--r--llvm/unittests/tools/llvm-mca/MCATestBase.cpp2
-rw-r--r--llvm/unittests/tools/llvm-mca/X86/TestIncrementalMCA.cpp4
-rw-r--r--llvm/utils/TableGen/Common/CMakeLists.txt1
-rw-r--r--llvm/utils/TableGen/Common/CodeGenTarget.cpp209
-rw-r--r--llvm/utils/UpdateTestChecks/common.py2
-rw-r--r--llvm/utils/gn/README.rst2
-rw-r--r--llvm/utils/gn/build/libs/terminfo/BUILD.gn12
-rw-r--r--llvm/utils/gn/build/libs/terminfo/enable.gni4
-rw-r--r--llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn2
-rw-r--r--llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Checkers/BUILD.gn3
-rw-r--r--llvm/utils/gn/secondary/libcxx/include/BUILD.gn2
-rw-r--r--llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn7
-rw-r--r--llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/llvm/tools/llvm-config/BUILD.gn6
-rw-r--r--mlir/CMakeLists.txt8
-rw-r--r--mlir/docs/PassManagement.md39
-rw-r--r--mlir/include/mlir-c/Debug.h13
-rw-r--r--mlir/include/mlir/Analysis/SliceAnalysis.h5
-rw-r--r--mlir/include/mlir/Analysis/TopologicalSortUtils.h (renamed from mlir/include/mlir/Transforms/TopologicalSortUtils.h)14
-rw-r--r--mlir/include/mlir/Config/mlir-config.h.cmake4
-rw-r--r--mlir/include/mlir/Dialect/Arith/IR/ArithOps.td32
-rw-r--r--mlir/include/mlir/Dialect/CommonFolders.h5
-rw-r--r--mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.h1
-rw-r--r--mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td18
-rw-r--r--mlir/include/mlir/Dialect/LLVMIR/Transforms/TypeConsistency.h73
-rw-r--r--mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td4
-rw-r--r--mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h4
-rw-r--r--mlir/include/mlir/Dialect/Mesh/IR/MeshBase.td2
-rw-r--r--mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h28
-rw-r--r--mlir/include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.h9
-rw-r--r--mlir/include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.td25
-rw-r--r--mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td46
-rw-r--r--mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.td38
-rw-r--r--mlir/include/mlir/Dialect/Polynomial/IR/PolynomialAttributes.td70
-rw-r--r--mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h4
-rw-r--r--mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h4
-rw-r--r--mlir/include/mlir/Dialect/Vector/IR/VectorOps.td80
-rw-r--r--mlir/include/mlir/IR/OpBase.td12
-rw-r--r--mlir/include/mlir/InitAllPasses.h3
-rw-r--r--mlir/include/mlir/Interfaces/TilingInterface.td4
-rw-r--r--mlir/include/mlir/Interfaces/Utils/InferIntRangeCommon.h25
-rw-r--r--mlir/include/mlir/Pass/PassManager.h39
-rw-r--r--mlir/include/mlir/Transforms/RegionUtils.h4
-rw-r--r--mlir/lib/Analysis/CMakeLists.txt2
-rw-r--r--mlir/lib/Analysis/Liveness.cpp4
-rw-r--r--mlir/lib/Analysis/SliceAnalysis.cpp59
-rw-r--r--mlir/lib/Analysis/TopologicalSortUtils.cpp (renamed from mlir/lib/Transforms/Utils/TopologicalSortUtils.cpp)141
-rw-r--r--mlir/lib/Bindings/Python/IRAttributes.cpp77
-rw-r--r--mlir/lib/Bindings/Python/IRCore.cpp15
-rw-r--r--mlir/lib/CAPI/Debug/Debug.cpp18
-rw-r--r--mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp92
-rw-r--r--mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp64
-rw-r--r--mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp22
-rw-r--r--mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp1
-rw-r--r--mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp1
-rw-r--r--mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp22
-rw-r--r--mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp1
-rw-r--r--mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp5
-rw-r--r--mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp3
-rw-r--r--mlir/lib/Dialect/Index/IR/InferIntRangeInterfaceImpls.cpp22
-rw-r--r--mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt1
-rw-r--r--mlir/lib/Dialect/LLVMIR/Transforms/TypeConsistency.cpp575
-rw-r--r--mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp6
-rw-r--r--mlir/lib/Dialect/Linalg/Transforms/BlockPackMatmul.cpp3
-rw-r--r--mlir/lib/Dialect/Linalg/Transforms/MeshShardingInterfaceImpl.cpp26
-rw-r--r--mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp17
-rw-r--r--mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp243
-rw-r--r--mlir/lib/Dialect/Mesh/IR/MeshOps.cpp94
-rw-r--r--mlir/lib/Dialect/Mesh/Interfaces/ShardingInterface.cpp119
-rw-r--r--mlir/lib/Dialect/Mesh/Transforms/ShardingPropagation.cpp231
-rw-r--r--mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp3
-rw-r--r--mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp17
-rw-r--r--mlir/lib/Dialect/Polynomial/IR/PolynomialAttributes.cpp9
-rw-r--r--mlir/lib/Dialect/Polynomial/IR/PolynomialCanonicalization.td41
-rw-r--r--mlir/lib/Dialect/Polynomial/IR/PolynomialOps.cpp86
-rw-r--r--mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp42
-rw-r--r--mlir/lib/Dialect/Tensor/Transforms/EmptyOpPatterns.cpp45
-rw-r--r--mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp6
-rw-r--r--mlir/lib/Dialect/Tensor/Transforms/ReshapePatterns.cpp32
-rw-r--r--mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp99
-rw-r--r--mlir/lib/Pass/IRPrinting.cpp162
-rw-r--r--mlir/lib/Pass/PassManagerOptions.cpp11
-rw-r--r--mlir/lib/Target/LLVM/CMakeLists.txt2
-rw-r--r--mlir/lib/Target/LLVM/NVVM/Target.cpp34
-rw-r--r--mlir/lib/Target/LLVMIR/Dialect/OpenACC/OpenACCToLLVMIRTranslation.cpp2
-rw-r--r--mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp108
-rw-r--r--mlir/lib/Target/LLVMIR/ModuleTranslation.cpp2
-rw-r--r--mlir/lib/Transforms/Mem2Reg.cpp2
-rw-r--r--mlir/lib/Transforms/SROA.cpp1
-rw-r--r--mlir/lib/Transforms/TopologicalSort.cpp2
-rw-r--r--mlir/lib/Transforms/Utils/CMakeLists.txt1
-rw-r--r--mlir/lib/Transforms/Utils/RegionUtils.cpp19
-rw-r--r--mlir/lib/Transforms/ViewOpGraph.cpp2
-rw-r--r--mlir/python/mlir/dialects/linalg/__init__.py5
-rw-r--r--mlir/python/mlir/dialects/linalg/opdsl/lang/emitter.py10
-rw-r--r--mlir/test/Analysis/DataFlow/test-next-access.mlir4
-rw-r--r--mlir/test/Analysis/test-liveness.mlir24
-rw-r--r--mlir/test/Analysis/test-topoligical-sort.mlir53
-rw-r--r--mlir/test/Analysis/test-toposort.mlir (renamed from mlir/test/Transforms/test-toposort.mlir)0
-rw-r--r--mlir/test/CMakeLists.txt2
-rw-r--r--mlir/test/Conversion/ArithToEmitC/arith-to-emitc-unsupported.mlir7
-rw-r--r--mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir63
-rw-r--r--mlir/test/Conversion/BufferizationToMemRef/bufferization-to-memref.mlir8
-rw-r--r--mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir4
-rw-r--r--mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir12
-rw-r--r--mlir/test/Conversion/PDLToPDLInterp/pdl-to-pdl-interp-matcher.mlir6
-rw-r--r--mlir/test/Conversion/SPIRVToLLVM/spirv-storage-class-mapping.mlir2
-rw-r--r--mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir11
-rw-r--r--mlir/test/Dialect/Affine/slicing-utils.mlir160
-rw-r--r--mlir/test/Dialect/Arith/canonicalize.mlir8
-rw-r--r--mlir/test/Dialect/Arith/int-range-interface.mlir135
-rw-r--r--mlir/test/Dialect/Arith/int-range-opts.mlir4
-rw-r--r--mlir/test/Dialect/Arith/unsigned-when-equivalent.mlir4
-rw-r--r--mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir32
-rw-r--r--mlir/test/Dialect/Bufferization/Transforms/lower-deallocations-func.mlir8
-rw-r--r--mlir/test/Dialect/Bufferization/Transforms/lower-deallocations.mlir8
-rw-r--r--mlir/test/Dialect/GPU/barrier-elimination.mlir2
-rw-r--r--mlir/test/Dialect/GPU/ops.mlir2
-rw-r--r--mlir/test/Dialect/GPU/outlining.mlir2
-rw-r--r--mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir2
-rw-r--r--mlir/test/Dialect/LLVMIR/nvvm.mlir14
-rw-r--r--mlir/test/Dialect/LLVMIR/type-consistency.mlir533
-rw-r--r--mlir/test/Dialect/Linalg/block-pack-matmul.mlir29
-rw-r--r--mlir/test/Dialect/Linalg/data-layout-propagation.mlir2
-rw-r--r--mlir/test/Dialect/Linalg/mesh-sharding-propagation.mlir34
-rw-r--r--mlir/test/Dialect/Linalg/transform-tile-reduction.mlir50
-rw-r--r--mlir/test/Dialect/Math/expand-math.mlir2
-rw-r--r--mlir/test/Dialect/Mesh/sharding-propagation.mlir38
-rw-r--r--mlir/test/Dialect/Mesh/spmdization.mlir15
-rw-r--r--mlir/test/Dialect/OpenMP/invalid.mlir3
-rw-r--r--mlir/test/Dialect/OpenMP/ops.mlir10
-rw-r--r--mlir/test/Dialect/Polynomial/canonicalization.mlir57
-rw-r--r--mlir/test/Dialect/Polynomial/ops.mlir12
-rw-r--r--mlir/test/Dialect/SCF/transform-ops.mlir6
-rw-r--r--mlir/test/Dialect/SPIRV/IR/logical-ops.mlir12
-rw-r--r--mlir/test/Dialect/SPIRV/IR/structure-ops.mlir4
-rw-r--r--mlir/test/Dialect/Tensor/canonicalize.mlir1
-rw-r--r--mlir/test/Dialect/Tensor/fold-empty-op.mlir73
-rw-r--r--mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir2
-rw-r--r--mlir/test/Dialect/Tensor/fold-reassociative-reshapes.mlir102
-rw-r--r--mlir/test/Dialect/Tensor/simplify-pack-unpack.mlir128
-rw-r--r--mlir/test/Dialect/Vector/invalid.mlir56
-rw-r--r--mlir/test/Dialect/Vector/ops.mlir42
-rw-r--r--mlir/test/IR/parser.mlir2
-rw-r--r--mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir9
-rw-r--r--mlir/test/Pass/ir-printing-file-tree.mlir41
-rw-r--r--mlir/test/Target/LLVMIR/Import/global-variables.ll2
-rw-r--r--mlir/test/Target/LLVMIR/Import/metadata-loop.ll2
-rw-r--r--mlir/test/Target/LLVMIR/llvmir-debug.mlir2
-rw-r--r--mlir/test/Target/LLVMIR/omptarget-array-sectioning-host.mlir2
-rw-r--r--mlir/test/Transforms/test-convert-func-op.mlir12
-rw-r--r--mlir/test/lib/Analysis/CMakeLists.txt1
-rw-r--r--mlir/test/lib/Analysis/TestSlice.cpp33
-rw-r--r--mlir/test/lib/Analysis/TestTopologicalSort.cpp (renamed from mlir/test/lib/Transforms/TestTopologicalSort.cpp)2
-rw-r--r--mlir/test/lib/Conversion/FuncToLLVM/CMakeLists.txt1
-rw-r--r--mlir/test/lib/Conversion/FuncToLLVM/TestConvertFuncOp.cpp93
-rw-r--r--mlir/test/lib/Dialect/Test/TestOpDefs.cpp19
-rw-r--r--mlir/test/lib/Dialect/Test/TestOps.td6
-rw-r--r--mlir/test/lib/Transforms/CMakeLists.txt1
-rw-r--r--mlir/test/lit.cfg.py2
-rw-r--r--mlir/test/lit.site.cfg.py.in2
-rw-r--r--mlir/test/mlir-tblgen/op-decl-and-defs.td21
-rw-r--r--mlir/test/mlir-tblgen/op-operand.td3
-rw-r--r--mlir/test/mlir-tblgen/pattern.mlir8
-rw-r--r--mlir/test/mlir-vulkan-runner/addui_extended.mlir (renamed from mlir/test/mlir-vulkan-runner/iaddcarry_extended.mlir)0
-rw-r--r--mlir/test/python/dialects/transform_structured_ext.py2
-rw-r--r--mlir/test/python/ir/array_attributes.py82
-rw-r--r--mlir/tools/mlir-opt/mlir-opt.cpp2
-rw-r--r--mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp29
-rw-r--r--mlir/tools/mlir-tblgen/RewriterGen.cpp27
-rw-r--r--mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp2
-rw-r--r--offload/CMakeLists.txt22
-rw-r--r--offload/DeviceRTL/include/Utils.h2
-rw-r--r--offload/DeviceRTL/src/Mapping.cpp4
-rw-r--r--offload/DeviceRTL/src/Utils.cpp14
-rw-r--r--offload/cmake/Modules/LibomptargetGetDependencies.cmake8
-rw-r--r--offload/plugins-nextgen/amdgpu/CMakeLists.txt8
-rw-r--r--offload/plugins-nextgen/common/include/JIT.h4
-rw-r--r--offload/plugins-nextgen/common/include/PluginInterface.h12
-rw-r--r--offload/plugins-nextgen/common/src/JIT.cpp16
-rw-r--r--offload/plugins-nextgen/common/src/PluginInterface.cpp34
-rw-r--r--offload/plugins-nextgen/cuda/CMakeLists.txt11
-rw-r--r--offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h9
-rw-r--r--offload/plugins-nextgen/host/CMakeLists.txt4
-rw-r--r--offload/src/PluginManager.cpp34
-rw-r--r--offload/test/offloading/dynamic_module.c2
-rw-r--r--offload/test/offloading/fortran/dump_map_tables.f9038
-rw-r--r--offload/test/offloading/ompx_bare_ballot_sync.c45
-rw-r--r--openmp/cmake/OpenMPTesting.cmake2
-rw-r--r--openmp/docs/SupportAndFAQ.rst9
-rw-r--r--openmp/docs/remarks/OMP121.rst6
-rw-r--r--openmp/docs/remarks/OMP133.rst6
-rw-r--r--openmp/docs/remarks/OptimizationRemarks.rst4
-rw-r--r--openmp/runtime/src/include/ompx.h.var12
-rw-r--r--openmp/runtime/test/lit.cfg4
-rw-r--r--openmp/runtime/test/transform/tile/foreach.cpp228
-rw-r--r--openmp/runtime/test/transform/tile/iterfor.cpp233
-rw-r--r--openmp/runtime/test/transform/tile/parallel-wsloop-collapse-foreach.cpp366
-rw-r--r--openmp/runtime/test/transform/unroll/factor_foreach.cpp162
-rw-r--r--openmp/runtime/test/transform/unroll/factor_intfor.c25
-rw-r--r--openmp/runtime/test/transform/unroll/factor_iterfor.cpp169
-rw-r--r--openmp/runtime/test/transform/unroll/factor_parallel-wsloop-collapse-foreach.cpp199
-rw-r--r--openmp/runtime/test/transform/unroll/factor_parallel-wsloop-collapse-intfor.cpp32
-rw-r--r--openmp/runtime/test/transform/unroll/full_intfor.c25
-rw-r--r--openmp/runtime/test/transform/unroll/heuristic_intfor.c25
-rw-r--r--openmp/runtime/test/transform/unroll/partial_intfor.c25
-rw-r--r--polly/test/CodeGen/20100617.ll2
-rw-r--r--polly/test/CodeGen/20100622.ll4
-rw-r--r--polly/test/CodeGen/20100707.ll2
-rw-r--r--polly/test/CodeGen/20100707_2.ll2
-rw-r--r--polly/test/CodeGen/20100708.ll2
-rw-r--r--polly/test/CodeGen/20100708_2.ll2
-rw-r--r--polly/test/CodeGen/20100713.ll2
-rw-r--r--polly/test/CodeGen/20100713_2.ll2
-rw-r--r--polly/test/CodeGen/20100717.ll2
-rw-r--r--polly/test/CodeGen/20100718-DomInfo-2.ll2
-rw-r--r--polly/test/CodeGen/20100718-DomInfo.ll2
-rw-r--r--polly/test/CodeGen/20100720-MultipleConditions.ll2
-rw-r--r--polly/test/CodeGen/20100809-IndependentBlock.ll2
-rw-r--r--polly/test/CodeGen/20100811-ScalarDependencyBetweenBrAndCnd.ll2
-rw-r--r--polly/test/CodeGen/20101030-Overflow.ll2
-rw-r--r--polly/test/CodeGen/20101103-Overflow3.ll2
-rw-r--r--polly/test/CodeGen/20101103-signmissmatch.ll2
-rw-r--r--polly/test/CodeGen/20110226-Ignore-Dead-Code.ll2
-rw-r--r--polly/test/CodeGen/20110226-PHI-Node-removed.ll2
-rw-r--r--polly/test/CodeGen/20120316-InvalidCast.ll2
-rw-r--r--polly/test/CodeGen/20120403-RHS-type-mismatch.ll2
-rw-r--r--polly/test/CodeGen/20130221.ll2
-rw-r--r--polly/test/CodeGen/20150328-SCEVExpanderIntroducesNewIV.ll2
-rw-r--r--polly/test/CodeGen/Intrinsics/llvm-expect.ll2
-rw-r--r--polly/test/CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll2
-rw-r--r--polly/test/CodeGen/LoopParallelMD/loop_nest_param_parallel.ll2
-rw-r--r--polly/test/CodeGen/LoopParallelMD/single_loop_param_parallel.ll4
-rw-r--r--polly/test/CodeGen/MemAccess/bad_alignment.ll2
-rw-r--r--polly/test/CodeGen/MemAccess/codegen_address_space.ll2
-rw-r--r--polly/test/CodeGen/MemAccess/codegen_constant_offset.ll2
-rw-r--r--polly/test/CodeGen/MemAccess/codegen_simple.ll2
-rw-r--r--polly/test/CodeGen/MemAccess/codegen_simple_float.ll2
-rw-r--r--polly/test/CodeGen/MemAccess/codegen_simple_md.ll4
-rw-r--r--polly/test/CodeGen/MemAccess/codegen_simple_md_float.ll4
-rw-r--r--polly/test/CodeGen/MemAccess/different_types.ll4
-rw-r--r--polly/test/CodeGen/MemAccess/generate-all.ll4
-rw-r--r--polly/test/CodeGen/MemAccess/invariant_base_ptr.ll4
-rw-r--r--polly/test/CodeGen/MemAccess/multiple_types.ll4
-rw-r--r--polly/test/CodeGen/MemAccess/simple.ll2
-rw-r--r--polly/test/CodeGen/MemAccess/update_access_functions.ll4
-rw-r--r--polly/test/CodeGen/OpenMP/alias-metadata.ll2
-rw-r--r--polly/test/CodeGen/OpenMP/floord-as-argument-to-subfunction.ll2
-rw-r--r--polly/test/CodeGen/OpenMP/inlineasm.ll2
-rw-r--r--polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded.ll2
-rw-r--r--polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_different_bb.ll2
-rw-r--r--polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_pass_only_needed.ll2
-rw-r--r--polly/test/CodeGen/OpenMP/invariant_base_pointers_preloaded.ll2
-rw-r--r--polly/test/CodeGen/OpenMP/loop-body-references-outer-iv.ll4
-rw-r--r--polly/test/CodeGen/OpenMP/loop-body-references-outer-values-2.ll4
-rw-r--r--polly/test/CodeGen/OpenMP/loop-body-references-outer-values-3.ll4
-rw-r--r--polly/test/CodeGen/OpenMP/loop-body-references-outer-values.ll4
-rw-r--r--polly/test/CodeGen/OpenMP/loop-bounds-reference-outer-ids.ll4
-rw-r--r--polly/test/CodeGen/OpenMP/mapped-phi-access.ll2
-rw-r--r--polly/test/CodeGen/OpenMP/matmul-parallel.ll4
-rw-r--r--polly/test/CodeGen/OpenMP/recomputed-srem.ll2
-rw-r--r--polly/test/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll12
-rw-r--r--polly/test/CodeGen/OpenMP/reference-other-bb.ll2
-rw-r--r--polly/test/CodeGen/OpenMP/reference-preceeding-loop.ll4
-rw-r--r--polly/test/CodeGen/OpenMP/reference_latest.ll2
-rw-r--r--polly/test/CodeGen/OpenMP/scev-rewriting.ll2
-rw-r--r--polly/test/CodeGen/OpenMP/single_loop.ll18
-rw-r--r--polly/test/CodeGen/OpenMP/single_loop_with_loop_invariant_baseptr.ll4
-rw-r--r--polly/test/CodeGen/OpenMP/single_loop_with_param.ll12
-rw-r--r--polly/test/CodeGen/OpenMP/two-parallel-loops-reference-outer-indvar.ll4
-rw-r--r--polly/test/CodeGen/PHIInExit.ll2
-rw-r--r--polly/test/CodeGen/RuntimeDebugBuilder/combine_different_values.ll2
-rw-r--r--polly/test/CodeGen/RuntimeDebugBuilder/stmt_tracing.ll2
-rw-r--r--polly/test/CodeGen/alias-check-multi-dim.ll2
-rw-r--r--polly/test/CodeGen/alias_metadata_too_many_arrays.ll2
-rw-r--r--polly/test/CodeGen/aliasing_different_base_and_access_type.ll2
-rw-r--r--polly/test/CodeGen/aliasing_different_pointer_types.ll2
-rw-r--r--polly/test/CodeGen/aliasing_multidimensional_access.ll2
-rw-r--r--polly/test/CodeGen/aliasing_parametric_simple_1.ll2
-rw-r--r--polly/test/CodeGen/aliasing_parametric_simple_2.ll2
-rw-r--r--polly/test/CodeGen/aliasing_struct_element.ll2
-rw-r--r--polly/test/CodeGen/alignment.ll2
-rw-r--r--polly/test/CodeGen/annotated_alias_scopes.ll2
-rw-r--r--polly/test/CodeGen/blas_sscal_simplified.ll2
-rw-r--r--polly/test/CodeGen/conflict-between-loop-invariant-code-hosting-and-escape-map-computation.ll2
-rw-r--r--polly/test/CodeGen/constant_condition.ll2
-rw-r--r--polly/test/CodeGen/create-conditional-scop.ll2
-rw-r--r--polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_1.ll2
-rw-r--r--polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_2.ll2
-rw-r--r--polly/test/CodeGen/debug-intrinsics.ll8
-rw-r--r--polly/test/CodeGen/dominance_problem_after_early_codegen_bailout.ll2
-rw-r--r--polly/test/CodeGen/empty_domain_in_context.ll2
-rw-r--r--polly/test/CodeGen/entry_with_trivial_phi.ll2
-rw-r--r--polly/test/CodeGen/entry_with_trivial_phi_other_bb.ll2
-rw-r--r--polly/test/CodeGen/error-stmt-in-non-affine-region.ll2
-rw-r--r--polly/test/CodeGen/error_block_contains_invalid_memory_access.ll2
-rw-r--r--polly/test/CodeGen/exprModDiv.ll8
-rw-r--r--polly/test/CodeGen/hoisted_load_escapes_through_phi.ll4
-rw-r--r--polly/test/CodeGen/hoisting_1.ll2
-rw-r--r--polly/test/CodeGen/hoisting_2.ll2
-rw-r--r--polly/test/CodeGen/inner_scev_sdiv_1.ll2
-rw-r--r--polly/test/CodeGen/inner_scev_sdiv_2.ll2
-rw-r--r--polly/test/CodeGen/inner_scev_sdiv_3.ll2
-rw-r--r--polly/test/CodeGen/inner_scev_sdiv_in_lb.ll4
-rw-r--r--polly/test/CodeGen/inner_scev_sdiv_in_lb_invariant.ll2
-rw-r--r--polly/test/CodeGen/inner_scev_sdiv_in_rtc.ll2
-rw-r--r--polly/test/CodeGen/intrinsics_lifetime.ll2
-rw-r--r--polly/test/CodeGen/intrinsics_misc.ll2
-rw-r--r--polly/test/CodeGen/inv-load-lnt-crash-wrong-order-2.ll2
-rw-r--r--polly/test/CodeGen/inv-load-lnt-crash-wrong-order-3.ll2
-rw-r--r--polly/test/CodeGen/inv-load-lnt-crash-wrong-order.ll2
-rw-r--r--polly/test/CodeGen/invariant-load-dimension.ll4
-rw-r--r--polly/test/CodeGen/invariant-load-preload-base-pointer-origin-first.ll2
-rw-r--r--polly/test/CodeGen/invariant_cannot_handle_void.ll4
-rw-r--r--polly/test/CodeGen/invariant_load.ll2
-rw-r--r--polly/test/CodeGen/invariant_load_address_space.ll2
-rw-r--r--polly/test/CodeGen/invariant_load_alias_metadata.ll2
-rw-r--r--polly/test/CodeGen/invariant_load_base_pointer.ll2
-rw-r--r--polly/test/CodeGen/invariant_load_base_pointer_conditional.ll2
-rw-r--r--polly/test/CodeGen/invariant_load_base_pointer_conditional_2.ll6
-rw-r--r--polly/test/CodeGen/invariant_load_canonicalize_array_baseptrs.ll2
-rw-r--r--polly/test/CodeGen/invariant_load_condition.ll2
-rw-r--r--polly/test/CodeGen/invariant_load_different_sized_types.ll2
-rw-r--r--polly/test/CodeGen/invariant_load_escaping.ll2
-rw-r--r--polly/test/CodeGen/invariant_load_escaping_second_scop.ll2
-rw-r--r--polly/test/CodeGen/invariant_load_in_non_affine_subregion.ll2
-rw-r--r--polly/test/CodeGen/invariant_load_loop_ub.ll2
-rw-r--r--polly/test/CodeGen/invariant_load_not_executed_but_in_parameters.ll2
-rw-r--r--polly/test/CodeGen/invariant_load_outermost.ll2
-rw-r--r--polly/test/CodeGen/invariant_load_parameters_cyclic_dependence.ll4
-rw-r--r--polly/test/CodeGen/invariant_load_ptr_ptr_noalias.ll2
-rw-r--r--polly/test/CodeGen/invariant_load_scalar_dep.ll2
-rw-r--r--polly/test/CodeGen/invariant_load_scalar_escape_alloca_sharing.ll2
-rw-r--r--polly/test/CodeGen/invariant_loads_from_struct_with_different_types_1.ll2
-rw-r--r--polly/test/CodeGen/invariant_loads_from_struct_with_different_types_2.ll2
-rw-r--r--polly/test/CodeGen/invariant_loads_ignore_parameter_bounds.ll2
-rw-r--r--polly/test/CodeGen/invariant_verify_function_failed.ll2
-rw-r--r--polly/test/CodeGen/invariant_verify_function_failed_2.ll4
-rw-r--r--polly/test/CodeGen/issue56692.ll2
-rw-r--r--polly/test/CodeGen/large-numbers-in-boundary-context.ll2
-rw-r--r--polly/test/CodeGen/load_subset_with_context.ll2
-rw-r--r--polly/test/CodeGen/loop-invariant-load-type-mismatch.ll2
-rw-r--r--polly/test/CodeGen/loop_with_condition.ll2
-rw-r--r--polly/test/CodeGen/loop_with_condition_2.ll2
-rw-r--r--polly/test/CodeGen/loop_with_condition_ineq.ll2
-rw-r--r--polly/test/CodeGen/loop_with_condition_nested.ll4
-rw-r--r--polly/test/CodeGen/loop_with_conditional_entry_edge_split_hard_case.ll2
-rw-r--r--polly/test/CodeGen/memcpy_annotations.ll2
-rw-r--r--polly/test/CodeGen/multidim-non-matching-typesize-2.ll2
-rw-r--r--polly/test/CodeGen/multidim-non-matching-typesize.ll2
-rw-r--r--polly/test/CodeGen/multidim_2d_parametric_array_static_loop_bounds.ll2
-rw-r--r--polly/test/CodeGen/multidim_alias_check.ll2
-rw-r--r--polly/test/CodeGen/multiple-codegens.ll1
-rw-r--r--polly/test/CodeGen/multiple-scops-in-a-row.ll2
-rw-r--r--polly/test/CodeGen/multiple-types-invariant-load-2.ll2
-rw-r--r--polly/test/CodeGen/multiple-types-invariant-load.ll2
-rw-r--r--polly/test/CodeGen/multiple_sai_fro_same_base_address.ll4
-rw-r--r--polly/test/CodeGen/no-overflow-tracking.ll4
-rw-r--r--polly/test/CodeGen/no_guard_bb.ll2
-rw-r--r--polly/test/CodeGen/non-affine-dominance-generated-entering.ll2
-rw-r--r--polly/test/CodeGen/non-affine-exit-node-dominance.ll2
-rw-r--r--polly/test/CodeGen/non-affine-phi-node-expansion-2.ll2
-rw-r--r--polly/test/CodeGen/non-affine-phi-node-expansion-3.ll2
-rw-r--r--polly/test/CodeGen/non-affine-phi-node-expansion-4.ll2
-rw-r--r--polly/test/CodeGen/non-affine-phi-node-expansion.ll2
-rw-r--r--polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize-2.ll2
-rw-r--r--polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize.ll2
-rw-r--r--polly/test/CodeGen/non-affine-region-implicit-store.ll2
-rw-r--r--polly/test/CodeGen/non-affine-region-phi-references-in-scop-value.ll2
-rw-r--r--polly/test/CodeGen/non-affine-subregion-dominance-reuse.ll2
-rw-r--r--polly/test/CodeGen/non-affine-switch.ll4
-rw-r--r--polly/test/CodeGen/non-affine-synthesized-in-branch.ll2
-rw-r--r--polly/test/CodeGen/non-affine-update.ll4
-rw-r--r--polly/test/CodeGen/non-hoisted-load-needed-as-base-ptr.ll2
-rw-r--r--polly/test/CodeGen/non_affine_float_compare.ll2
-rw-r--r--polly/test/CodeGen/only_non_affine_error_region.ll2
-rw-r--r--polly/test/CodeGen/openmp_limit_threads.ll12
-rw-r--r--polly/test/CodeGen/out-of-scop-phi-node-use.ll2
-rw-r--r--polly/test/CodeGen/param_div_div_div_2.ll4
-rw-r--r--polly/test/CodeGen/partial_write_array.ll2
-rw-r--r--polly/test/CodeGen/partial_write_emptyset.ll2
-rw-r--r--polly/test/CodeGen/partial_write_full_write_that_appears_partial.ll2
-rw-r--r--polly/test/CodeGen/partial_write_impossible_restriction.ll2
-rw-r--r--polly/test/CodeGen/partial_write_in_region.ll4
-rw-r--r--polly/test/CodeGen/partial_write_in_region_with_loop.ll4
-rw-r--r--polly/test/CodeGen/partial_write_mapped_scalar.ll2
-rw-r--r--polly/test/CodeGen/partial_write_mapped_scalar_subregion.ll2
-rw-r--r--polly/test/CodeGen/perf_monitoring.ll2
-rw-r--r--polly/test/CodeGen/perf_monitoring_cycles_per_scop.ll2
-rw-r--r--polly/test/CodeGen/perf_monitoring_trip_counts_per_scop.ll2
-rw-r--r--polly/test/CodeGen/phi-defined-before-scop.ll2
-rw-r--r--polly/test/CodeGen/phi_after_error_block_outside_of_scop.ll2
-rw-r--r--polly/test/CodeGen/phi_condition_modeling_1.ll2
-rw-r--r--polly/test/CodeGen/phi_condition_modeling_2.ll2
-rw-r--r--polly/test/CodeGen/phi_conditional_simple_1.ll4
-rw-r--r--polly/test/CodeGen/phi_in_exit_early_lnt_failure_1.ll2
-rw-r--r--polly/test/CodeGen/phi_in_exit_early_lnt_failure_2.ll2
-rw-r--r--polly/test/CodeGen/phi_in_exit_early_lnt_failure_3.ll2
-rw-r--r--polly/test/CodeGen/phi_in_exit_early_lnt_failure_5.ll2
-rw-r--r--polly/test/CodeGen/phi_loop_carried_float.ll2
-rw-r--r--polly/test/CodeGen/phi_loop_carried_float_escape.ll8
-rw-r--r--polly/test/CodeGen/phi_scalar_simple_1.ll2
-rw-r--r--polly/test/CodeGen/phi_scalar_simple_2.ll2
-rw-r--r--polly/test/CodeGen/phi_with_multi_exiting_edges_2.ll2
-rw-r--r--polly/test/CodeGen/phi_with_one_exit_edge.ll2
-rw-r--r--polly/test/CodeGen/pointer-type-expressions-2.ll4
-rw-r--r--polly/test/CodeGen/pointer-type-expressions.ll4
-rw-r--r--polly/test/CodeGen/pointer-type-pointer-type-comparison.ll4
-rw-r--r--polly/test/CodeGen/pointer_rem.ll4
-rw-r--r--polly/test/CodeGen/pr25241.ll2
-rw-r--r--polly/test/CodeGen/ptrtoint_as_parameter.ll2
-rw-r--r--polly/test/CodeGen/read-only-scalars.ll4
-rw-r--r--polly/test/CodeGen/reduction.ll2
-rw-r--r--polly/test/CodeGen/reduction_2.ll2
-rw-r--r--polly/test/CodeGen/reduction_simple_binary.ll2
-rw-r--r--polly/test/CodeGen/region-with-instructions.ll2
-rw-r--r--polly/test/CodeGen/region_exiting-domtree.ll2
-rw-r--r--polly/test/CodeGen/region_multiexit_partialwrite.ll2
-rw-r--r--polly/test/CodeGen/run-time-condition-with-scev-parameters.ll4
-rw-r--r--polly/test/CodeGen/run-time-condition.ll2
-rw-r--r--polly/test/CodeGen/scalar-references-used-in-scop-compute.ll2
-rw-r--r--polly/test/CodeGen/scalar-store-from-same-bb.ll4
-rw-r--r--polly/test/CodeGen/scalar_codegen_crash.ll4
-rw-r--r--polly/test/CodeGen/scev-backedgetaken.ll2
-rw-r--r--polly/test/CodeGen/scev-division-invariant-load.ll2
-rw-r--r--polly/test/CodeGen/scev.ll2
-rw-r--r--polly/test/CodeGen/scev_expansion_in_nonaffine.ll2
-rw-r--r--polly/test/CodeGen/scev_looking_through_bitcasts.ll2
-rw-r--r--polly/test/CodeGen/scop_expander_insert_point.ll2
-rw-r--r--polly/test/CodeGen/scop_expander_segfault.ll2
-rw-r--r--polly/test/CodeGen/scop_never_executed_runtime_check_location.ll2
-rw-r--r--polly/test/CodeGen/select-base-pointer.ll2
-rw-r--r--polly/test/CodeGen/sequential_loops.ll2
-rw-r--r--polly/test/CodeGen/simple_loop_non_single_exit.ll2
-rw-r--r--polly/test/CodeGen/simple_loop_non_single_exit_2.ll2
-rw-r--r--polly/test/CodeGen/simple_non_single_entry.ll2
-rw-r--r--polly/test/CodeGen/simple_nonaffine_loop.ll2
-rw-r--r--polly/test/CodeGen/single_do_loop_int_max_iterations.ll2
-rw-r--r--polly/test/CodeGen/single_do_loop_int_param_iterations.ll2
-rw-r--r--polly/test/CodeGen/single_do_loop_ll_max_iterations.ll4
-rw-r--r--polly/test/CodeGen/single_do_loop_one_iteration.ll2
-rw-r--r--polly/test/CodeGen/single_do_loop_scev_replace.ll2
-rw-r--r--polly/test/CodeGen/single_loop.ll2
-rw-r--r--polly/test/CodeGen/single_loop_int_max_iterations.ll2
-rw-r--r--polly/test/CodeGen/single_loop_ll_max_iterations.ll2
-rw-r--r--polly/test/CodeGen/single_loop_one_iteration.ll2
-rw-r--r--polly/test/CodeGen/single_loop_param.ll2
-rw-r--r--polly/test/CodeGen/single_loop_param_less_equal.ll6
-rw-r--r--polly/test/CodeGen/single_loop_param_less_than.ll4
-rw-r--r--polly/test/CodeGen/single_loop_zero_iterations.ll2
-rw-r--r--polly/test/CodeGen/split_edge_of_exit.ll4
-rw-r--r--polly/test/CodeGen/split_edges.ll2
-rw-r--r--polly/test/CodeGen/split_edges_2.ll2
-rw-r--r--polly/test/CodeGen/srem-in-other-bb.ll2
-rw-r--r--polly/test/CodeGen/stack-overflow-in-load-hoisting.ll2
-rw-r--r--polly/test/CodeGen/stmt_split_no_dependence.ll2
-rw-r--r--polly/test/CodeGen/switch-in-non-affine-region.ll4
-rw-r--r--polly/test/CodeGen/synthesizable_phi_write_after_loop.ll2
-rw-r--r--polly/test/CodeGen/test-invalid-operands-for-select-2.ll2
-rw-r--r--polly/test/CodeGen/test-invalid-operands-for-select.ll2
-rw-r--r--polly/test/CodeGen/test.ll2
-rw-r--r--polly/test/CodeGen/two-loops-right-after-each-other-2.ll2
-rw-r--r--polly/test/CodeGen/two-scops-in-row-invalidate-scevs.ll2
-rw-r--r--polly/test/CodeGen/two-scops-in-row.ll4
-rw-r--r--polly/test/CodeGen/udiv_expansion_position.ll2
-rw-r--r--polly/test/CodeGen/uninitialized_scalar_memory.ll2
-rw-r--r--polly/test/CodeGen/unpredictable-loop-unsynthesizable.ll6
-rw-r--r--polly/test/CodeGen/variant_load_empty_domain.ll2
-rw-r--r--polly/test/CodeGen/whole-scop-non-affine-subregion.ll4
-rw-r--r--polly/test/DeLICM/confused_order.ll4
-rw-r--r--polly/test/DeLICM/contradicting_assumed_context_and_domain.ll2
-rw-r--r--polly/test/DeLICM/load-in-cond-inf-loop.ll2
-rw-r--r--polly/test/DeLICM/map_memset_zero.ll4
-rw-r--r--polly/test/DeLICM/nomap_alreadymapped.ll2
-rw-r--r--polly/test/DeLICM/nomap_escaping.ll2
-rw-r--r--polly/test/DeLICM/nomap_occupied.ll2
-rw-r--r--polly/test/DeLICM/nomap_readonly.ll2
-rw-r--r--polly/test/DeLICM/nomap_spuriouswrite.ll2
-rw-r--r--polly/test/DeLICM/nomap_storagesize.ll2
-rw-r--r--polly/test/DeLICM/nomap_writewrite.ll2
-rw-r--r--polly/test/DeLICM/outofquota-reverseDomain.ll2
-rw-r--r--polly/test/DeLICM/pass_existence.ll6
-rw-r--r--polly/test/DeLICM/pr41656.ll2
-rw-r--r--polly/test/DeLICM/pr48783.ll2
-rw-r--r--polly/test/DeLICM/reduction.ll2
-rw-r--r--polly/test/DeLICM/reduction_looprotate_gvnpre_cond1.ll2
-rw-r--r--polly/test/DeLICM/reduction_looprotate_gvnpre_cond2.ll2
-rw-r--r--polly/test/DeLICM/reduction_looprotate_gvnpre_nopreheader.ll2
-rw-r--r--polly/test/DeLICM/reduction_looprotate_licm_nopreheader.ll2
-rw-r--r--polly/test/DeLICM/reduction_looprotate_loopguard_gvnpre.ll2
-rw-r--r--polly/test/DeLICM/reduction_looprotate_loopguard_licm1.ll2
-rw-r--r--polly/test/DeLICM/reduction_looprotate_loopguard_licm2.ll2
-rw-r--r--polly/test/DeLICM/reduction_looprotate_loopguard_licm3.ll2
-rw-r--r--polly/test/DeLICM/reduction_unrelatedunusual.ll2
-rw-r--r--polly/test/DeLICM/reject_loadafterstore.ll2
-rw-r--r--polly/test/DeLICM/reject_outofquota.ll4
-rw-r--r--polly/test/DeLICM/reject_storeafterstore.ll2
-rw-r--r--polly/test/DeLICM/reject_storeinsubregion.ll2
-rw-r--r--polly/test/DeLICM/reject_unusualstore.ll4
-rw-r--r--polly/test/DeLICM/skip_maywrite.ll2
-rw-r--r--polly/test/DeLICM/skip_multiaccess.ll2
-rw-r--r--polly/test/DeLICM/skip_notinloop.ll2
-rw-r--r--polly/test/DeLICM/skip_scalaraccess.ll2
-rw-r--r--polly/test/DeadCodeElimination/chained_iterations.ll4
-rw-r--r--polly/test/DeadCodeElimination/chained_iterations_2.ll4
-rw-r--r--polly/test/DeadCodeElimination/computeout.ll3
-rw-r--r--polly/test/DeadCodeElimination/dead_iteration_elimination.ll1
-rw-r--r--polly/test/DeadCodeElimination/non-affine-affine-mix.ll2
-rw-r--r--polly/test/DeadCodeElimination/non-affine.ll2
-rw-r--r--polly/test/DeadCodeElimination/null_schedule.ll2
-rw-r--r--polly/test/DependenceInfo/computeout.ll6
-rw-r--r--polly/test/DependenceInfo/different_schedule_dimensions.ll4
-rw-r--r--polly/test/DependenceInfo/do_pluto_matmult.ll6
-rw-r--r--polly/test/DependenceInfo/fine_grain_dep_0.ll7
-rw-r--r--polly/test/DependenceInfo/generate_may_write_dependence_info.ll2
-rw-r--r--polly/test/DependenceInfo/infeasible_context.ll5
-rw-r--r--polly/test/DependenceInfo/may_writes_do_not_block_must_writes_for_war.ll2
-rw-r--r--polly/test/DependenceInfo/nonaffine-condition-buildMemoryAccess.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_complex_location.ll6
-rw-r--r--polly/test/DependenceInfo/reduction_dependences_equal_non_reduction_dependences.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_dependences_not_null.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_mixed_reduction_and_non_reduction_dependences.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_multiple_loops_array_sum.ll6
-rw-r--r--polly/test/DependenceInfo/reduction_multiple_loops_array_sum_2.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_multiple_loops_array_sum_3.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_multiple_reductions.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_multiple_reductions_2.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_only_reduction_like_access.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_partially_escaping_intermediate_in_other_stmt.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_privatization_deps.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_privatization_deps_2.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_privatization_deps_3.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_privatization_deps_4.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_privatization_deps_5.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_sequence.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_simple_iv.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_simple_iv_debug_wrapped_dependences.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_simple_privatization_deps_2.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_simple_privatization_deps_w_parameter.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_two_reductions_different_rloops.ll2
-rw-r--r--polly/test/DependenceInfo/sequential_loops.ll79
-rw-r--r--polly/test/ForwardOpTree/atax.ll2
-rw-r--r--polly/test/ForwardOpTree/changed-kind.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_from_region.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_hoisted.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_instruction.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_into_region.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_into_region_redundant_use.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_load.ll1
-rw-r--r--polly/test/ForwardOpTree/forward_load_differentarray.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_load_double_write.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_load_fromloop.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_load_indirect.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_load_memset_after.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_load_memset_before.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_load_tripleuse.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_load_unrelatedunusual.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_phi_load.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_readonly.ll4
-rw-r--r--polly/test/ForwardOpTree/forward_reusue.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_store.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_synthesizable_definloop.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_synthesizable_indvar.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_synthesizable_useinloop.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_transitive.ll2
-rw-r--r--polly/test/ForwardOpTree/jacobi-1d.ll2
-rw-r--r--polly/test/ForwardOpTree/noforward_from_region.ll2
-rw-r--r--polly/test/ForwardOpTree/noforward_load_conditional.ll2
-rw-r--r--polly/test/ForwardOpTree/noforward_load_writebetween.ll2
-rw-r--r--polly/test/ForwardOpTree/noforward_outofquota.ll4
-rw-r--r--polly/test/ForwardOpTree/noforward_partial.ll2
-rw-r--r--polly/test/ForwardOpTree/noforward_phi.ll2
-rw-r--r--polly/test/ForwardOpTree/noforward_selfrefphi.ll2
-rw-r--r--polly/test/ForwardOpTree/noforward_sideffects.ll2
-rw-r--r--polly/test/ForwardOpTree/noforward_synthesizable_unknownit.ll2
-rw-r--r--polly/test/ForwardOpTree/out-of-quota1.ll2
-rw-r--r--polly/test/IstAstInfo/alias_checks_with_empty_context.ll2
-rw-r--r--polly/test/IstAstInfo/alias_simple_1.ll10
-rw-r--r--polly/test/IstAstInfo/alias_simple_2.ll12
-rw-r--r--polly/test/IstAstInfo/alias_simple_3.ll10
-rw-r--r--polly/test/IstAstInfo/aliasing_arrays_with_identical_base.ll2
-rw-r--r--polly/test/IstAstInfo/aliasing_multiple_alias_groups.ll4
-rw-r--r--polly/test/IstAstInfo/aliasing_parametric_simple_1.ll2
-rw-r--r--polly/test/IstAstInfo/aliasing_parametric_simple_2.ll2
-rw-r--r--polly/test/IstAstInfo/dependence_distance_minimal.ll2
-rw-r--r--polly/test/IstAstInfo/domain_bounded_only_with_context.ll2
-rw-r--r--polly/test/IstAstInfo/non_affine_access.ll2
-rw-r--r--polly/test/IstAstInfo/reduction_clauses_onedimensional_access.ll2
-rw-r--r--polly/test/IstAstInfo/reduction_dependences_equal_non_reduction_dependences.ll2
-rw-r--r--polly/test/IstAstInfo/reduction_different_reduction_clauses.ll2
-rw-r--r--polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule.ll2
-rw-r--r--polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule_2.ll2
-rw-r--r--polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions.ll2
-rw-r--r--polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_2.ll2
-rw-r--r--polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_3.ll2
-rw-r--r--polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_4.ll2
-rw-r--r--polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_5.ll2
-rw-r--r--polly/test/IstAstInfo/reduction_multiple_dimensions.ll2
-rw-r--r--polly/test/IstAstInfo/reduction_multiple_dimensions_2.ll2
-rw-r--r--polly/test/IstAstInfo/reduction_multiple_dimensions_3.ll2
-rw-r--r--polly/test/IstAstInfo/reduction_multiple_dimensions_4.ll2
-rw-r--r--polly/test/IstAstInfo/run-time-condition.ll2
-rw-r--r--polly/test/IstAstInfo/runtime_context_with_error_blocks.ll2
-rw-r--r--polly/test/IstAstInfo/simple-run-time-condition.ll2
-rw-r--r--polly/test/IstAstInfo/single_loop_strip_mine.ll4
-rw-r--r--polly/test/IstAstInfo/single_loop_uint_max_iterations.ll2
-rw-r--r--polly/test/IstAstInfo/single_loop_ull_max_iterations.ll2
-rw-r--r--polly/test/JSONExporter/ImportAccesses/ImportAccesses-Bad-relation.ll2
-rw-r--r--polly/test/JSONExporter/ImportAccesses/ImportAccesses-No-accesses-key.ll2
-rw-r--r--polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-MemAcc.ll2
-rw-r--r--polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-statements.ll2
-rw-r--r--polly/test/JSONExporter/ImportAccesses/ImportAccesses-Relation-mispelled.ll2
-rw-r--r--polly/test/JSONExporter/ImportAccesses/ImportAccesses-Statements-mispelled.ll2
-rw-r--r--polly/test/JSONExporter/ImportAccesses/ImportAccesses-Undeclared-ScopArrayInfo.ll2
-rw-r--r--polly/test/JSONExporter/ImportAccesses/ImportAccesses-Wrong-number-dimensions.ll2
-rw-r--r--polly/test/JSONExporter/ImportArrays/ImportArrays-Mispelled-type.ll2
-rw-r--r--polly/test/JSONExporter/ImportArrays/ImportArrays-Negative-size.ll2
-rw-r--r--polly/test/JSONExporter/ImportArrays/ImportArrays-No-name.ll2
-rw-r--r--polly/test/JSONExporter/ImportArrays/ImportArrays-No-sizes-key.ll2
-rw-r--r--polly/test/JSONExporter/ImportArrays/ImportArrays-No-type-key.ll2
-rw-r--r--polly/test/JSONExporter/ImportContext/ImportContext-Context-mispelled.ll2
-rw-r--r--polly/test/JSONExporter/ImportContext/ImportContext-Not-parameter-set.ll2
-rw-r--r--polly/test/JSONExporter/ImportContext/ImportContext-Unvalid-Context.ll2
-rw-r--r--polly/test/JSONExporter/ImportContext/ImportContext-Wrong-dimension.ll2
-rw-r--r--polly/test/JSONExporter/ImportSchedule/ImportSchedule-No-schedule-key.ll2
-rw-r--r--polly/test/JSONExporter/ImportSchedule/ImportSchedule-Schedule-not-valid.ll2
-rw-r--r--polly/test/JSONExporter/ImportSchedule/ImportSchedule-Statements-mispelled.ll2
-rw-r--r--polly/test/JSONExporter/ImportSchedule/ImportSchedule-Wrong-number-statements.ll2
-rw-r--r--polly/test/MaximalStaticExpansion/load_after_store_same_statement.ll2
-rw-r--r--polly/test/MaximalStaticExpansion/read_from_original.ll2
-rw-r--r--polly/test/MaximalStaticExpansion/too_many_writes.ll2
-rw-r--r--polly/test/MaximalStaticExpansion/working_deps_between_inners.ll1
-rw-r--r--polly/test/MaximalStaticExpansion/working_deps_between_inners_phi.ll2
-rw-r--r--polly/test/MaximalStaticExpansion/working_expansion.ll1
-rw-r--r--polly/test/MaximalStaticExpansion/working_expansion_multiple_dependences_per_statement.ll1
-rw-r--r--polly/test/MaximalStaticExpansion/working_expansion_multiple_instruction_per_statement.ll1
-rw-r--r--polly/test/MaximalStaticExpansion/working_phi_expansion.ll2
-rw-r--r--polly/test/MaximalStaticExpansion/working_phi_two_scalars.ll2
-rw-r--r--polly/test/MaximalStaticExpansion/working_value_expansion.ll1
-rw-r--r--polly/test/PruneUnprofitable/prune_only_scalardeps.ll1
-rw-r--r--polly/test/ScheduleOptimizer/2012-03-16-Empty-Domain.ll2
-rw-r--r--polly/test/ScheduleOptimizer/2013-04-11-Empty-Domain-two.ll2
-rw-r--r--polly/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll4
-rw-r--r--polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll4
-rw-r--r--polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll4
-rw-r--r--polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll4
-rw-r--r--polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll4
-rw-r--r--polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll4
-rw-r--r--polly/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll4
-rw-r--r--polly/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll4
-rw-r--r--polly/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll4
-rw-r--r--polly/test/ScheduleOptimizer/ManualOptimization/disable_nonforced.ll2
-rw-r--r--polly/test/ScheduleOptimizer/ManualOptimization/distribute_heuristic.ll4
-rw-r--r--polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_looploc.ll2
-rw-r--r--polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_pragmaloc.ll2
-rw-r--r--polly/test/ScheduleOptimizer/ManualOptimization/unroll_disable.ll2
-rw-r--r--polly/test/ScheduleOptimizer/ManualOptimization/unroll_double.ll2
-rw-r--r--polly/test/ScheduleOptimizer/ManualOptimization/unroll_full.ll2
-rw-r--r--polly/test/ScheduleOptimizer/ManualOptimization/unroll_heuristic.ll4
-rw-r--r--polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial.ll4
-rw-r--r--polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial_followup.ll8
-rw-r--r--polly/test/ScheduleOptimizer/SIMDInParallelFor.ll2
-rw-r--r--polly/test/ScheduleOptimizer/computeout.ll2
-rw-r--r--polly/test/ScheduleOptimizer/ensure-correct-tile-sizes.ll6
-rw-r--r--polly/test/ScheduleOptimizer/focaltech_test_detail_threshold-7bc17e.ll1
-rw-r--r--polly/test/ScheduleOptimizer/full_partial_tile_separation.ll2
-rw-r--r--polly/test/ScheduleOptimizer/line-tiling-2.ll2
-rw-r--r--polly/test/ScheduleOptimizer/line-tiling.ll2
-rw-r--r--polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll2
-rw-r--r--polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll6
-rw-r--r--polly/test/ScheduleOptimizer/one-dimensional-band.ll2
-rw-r--r--polly/test/ScheduleOptimizer/outer_coincidence.ll4
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll6
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll2
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll8
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll4
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_12.ll4
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll4
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_14.ll4
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll2
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll2
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll2
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll2
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll2
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll2
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll2
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_21.ll2
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_22.ll2
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_24.ll2
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_25.ll4
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll12
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll8
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_5.ll8
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_6.ll8
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_7.ll4
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_8.ll4
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_9.ll6
-rw-r--r--polly/test/ScheduleOptimizer/pattern_matching_based_opts_splitmap.ll2
-rw-r--r--polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll2
-rw-r--r--polly/test/ScheduleOptimizer/prevectorization.ll4
-rw-r--r--polly/test/ScheduleOptimizer/rectangular-tiling.ll8
-rw-r--r--polly/test/ScheduleOptimizer/schedule_computeout.ll2
-rw-r--r--polly/test/ScheduleOptimizer/statistics.ll2
-rw-r--r--polly/test/ScheduleOptimizer/tile_after_fusion.ll4
-rw-r--r--polly/test/ScheduleOptimizer/vivid-vbi-gen-vivid_vbi_gen_sliced-before-llvmreduced.ll2
-rw-r--r--polly/test/ScopDetect/aliasing_parametric_simple_1.ll2
-rw-r--r--polly/test/ScopDetect/aliasing_parametric_simple_2.ll2
-rw-r--r--polly/test/ScopDetect/aliasing_simple_1.ll2
-rw-r--r--polly/test/ScopDetect/aliasing_simple_2.ll2
-rw-r--r--polly/test/ScopDetect/base_pointer_load_setNewAccessRelation.ll2
-rw-r--r--polly/test/ScopDetect/base_pointer_setNewAccessRelation.ll2
-rw-r--r--polly/test/ScopDetect/callbr.ll4
-rw-r--r--polly/test/ScopDetect/collective_invariant_loads.ll2
-rw-r--r--polly/test/ScopDetect/cross_loop_non_single_exit.ll2
-rw-r--r--polly/test/ScopDetect/cross_loop_non_single_exit_2.ll2
-rw-r--r--polly/test/ScopDetect/dependency_to_phi_node_outside_of_region.ll2
-rw-r--r--polly/test/ScopDetect/dot-scops-npm.ll2
-rw-r--r--polly/test/ScopDetect/dot-scops.ll2
-rw-r--r--polly/test/ScopDetect/error-block-always-executed.ll2
-rw-r--r--polly/test/ScopDetect/error-block-referenced-from-scop.ll2
-rw-r--r--polly/test/ScopDetect/error-block-unreachable.ll2
-rw-r--r--polly/test/ScopDetect/expand-region-correctly-2.ll2
-rw-r--r--polly/test/ScopDetect/expand-region-correctly.ll2
-rw-r--r--polly/test/ScopDetect/ignore_func_flag_regex.ll2
-rw-r--r--polly/test/ScopDetect/index_from_unpredictable_loop.ll4
-rw-r--r--polly/test/ScopDetect/index_from_unpredictable_loop2.ll4
-rw-r--r--polly/test/ScopDetect/indvars.ll2
-rw-r--r--polly/test/ScopDetect/intrinsics_1.ll2
-rw-r--r--polly/test/ScopDetect/intrinsics_2.ll2
-rw-r--r--polly/test/ScopDetect/intrinsics_3.ll2
-rw-r--r--polly/test/ScopDetect/invalid-latch-conditions.ll6
-rw-r--r--polly/test/ScopDetect/invalidate_scalar_evolution.ll2
-rw-r--r--polly/test/ScopDetect/invariant-load-before-scop.ll2
-rw-r--r--polly/test/ScopDetect/keep_going_expansion.ll2
-rw-r--r--polly/test/ScopDetect/mod_ref_read_pointer.ll4
-rw-r--r--polly/test/ScopDetect/more-than-one-loop.ll4
-rw-r--r--polly/test/ScopDetect/multidim-with-undef-size.ll2
-rw-r--r--polly/test/ScopDetect/multidim.ll2
-rw-r--r--polly/test/ScopDetect/multidim_indirect_access.ll2
-rw-r--r--polly/test/ScopDetect/multidim_two_accesses_different_delinearization.ll2
-rw-r--r--polly/test/ScopDetect/nested_loop_single_exit.ll4
-rw-r--r--polly/test/ScopDetect/non-affine-conditional.ll2
-rw-r--r--polly/test/ScopDetect/non-affine-float-compare.ll2
-rw-r--r--polly/test/ScopDetect/non-affine-loop-condition-dependent-access.ll8
-rw-r--r--polly/test/ScopDetect/non-affine-loop-condition-dependent-access_2.ll6
-rw-r--r--polly/test/ScopDetect/non-affine-loop-condition-dependent-access_3.ll6
-rw-r--r--polly/test/ScopDetect/non-affine-loop.ll10
-rw-r--r--polly/test/ScopDetect/non-beneficial-loops-small-trip-count.ll2
-rw-r--r--polly/test/ScopDetect/non-constant-add-rec-start-expr.ll2
-rw-r--r--polly/test/ScopDetect/non-simple-memory-accesses.ll2
-rw-r--r--polly/test/ScopDetect/non_affine_loop_condition.ll4
-rw-r--r--polly/test/ScopDetect/only-one-affine-loop.ll2
-rw-r--r--polly/test/ScopDetect/only_func_flag.ll2
-rw-r--r--polly/test/ScopDetect/only_func_flag_regex.ll2
-rw-r--r--polly/test/ScopDetect/parametric-multiply-in-scev-2.ll2
-rw-r--r--polly/test/ScopDetect/parametric-multiply-in-scev.ll2
-rw-r--r--polly/test/ScopDetect/phi_with_multi_exiting_edges.ll2
-rw-r--r--polly/test/ScopDetect/profitability-large-basic-blocks.ll12
-rw-r--r--polly/test/ScopDetect/profitability-two-nested-loops.ll2
-rw-r--r--polly/test/ScopDetect/remove_all_children.ll2
-rw-r--r--polly/test/ScopDetect/report-scop-location.ll2
-rw-r--r--polly/test/ScopDetect/restrict-undef-size-scopdetect.ll2
-rw-r--r--polly/test/ScopDetect/run_time_alias_check.ll2
-rw-r--r--polly/test/ScopDetect/scev_remove_max.ll2
-rw-r--r--polly/test/ScopDetect/sequential_loops.ll6
-rw-r--r--polly/test/ScopDetect/simple_loop.ll2
-rw-r--r--polly/test/ScopDetect/simple_loop_non_single_entry.ll2
-rw-r--r--polly/test/ScopDetect/simple_loop_non_single_exit.ll2
-rw-r--r--polly/test/ScopDetect/simple_loop_non_single_exit_2.ll2
-rw-r--r--polly/test/ScopDetect/simple_loop_two_phi_nodes.ll2
-rw-r--r--polly/test/ScopDetect/simple_loop_with_param.ll2
-rw-r--r--polly/test/ScopDetect/simple_loop_with_param_2.ll2
-rw-r--r--polly/test/ScopDetect/simple_non_single_entry.ll2
-rw-r--r--polly/test/ScopDetect/skip_function_attribute.ll2
-rw-r--r--polly/test/ScopDetect/srem_with_parametric_divisor.ll2
-rw-r--r--polly/test/ScopDetect/statistics.ll2
-rw-r--r--polly/test/ScopDetect/switch-in-loop-patch.ll2
-rw-r--r--polly/test/ScopDetectionDiagnostics/ReportAlias-01.ll2
-rw-r--r--polly/test/ScopDetectionDiagnostics/ReportEntry.ll2
-rw-r--r--polly/test/ScopDetectionDiagnostics/ReportFuncCall-01.ll2
-rw-r--r--polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegion.ll2
-rw-r--r--polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegionWithoutDebugLoc.ll2
-rw-r--r--polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll12
-rw-r--r--polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll4
-rw-r--r--polly/test/ScopDetectionDiagnostics/ReportMultipleNonAffineAccesses.ll12
-rw-r--r--polly/test/ScopDetectionDiagnostics/ReportNonAffineAccess-01.ll2
-rw-r--r--polly/test/ScopDetectionDiagnostics/ReportUnprofitable.ll8
-rw-r--r--polly/test/ScopDetectionDiagnostics/ReportUnreachableInExit.ll2
-rw-r--r--polly/test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll2
-rw-r--r--polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll2
-rw-r--r--polly/test/ScopDetectionDiagnostics/loop_partially_in_scop-2.ll2
-rw-r--r--polly/test/ScopDetectionDiagnostics/loop_partially_in_scop.ll2
-rw-r--r--polly/test/ScopInfo/20110312-Fail-without-basicaa.ll2
-rw-r--r--polly/test/ScopInfo/20111108-Parameter-not-detected.ll2
-rw-r--r--polly/test/ScopInfo/2012-03-16-Crash-because-of-unsigned-in-scev.ll2
-rw-r--r--polly/test/ScopInfo/2015-10-04-Crash-in-domain-generation.ll2
-rw-r--r--polly/test/ScopInfo/Alias-0.ll4
-rw-r--r--polly/test/ScopInfo/Alias-1.ll4
-rw-r--r--polly/test/ScopInfo/Alias-2.ll4
-rw-r--r--polly/test/ScopInfo/Alias-3.ll4
-rw-r--r--polly/test/ScopInfo/Alias-4.ll4
-rw-r--r--polly/test/ScopInfo/BoundChecks/single-loop.ll4
-rw-r--r--polly/test/ScopInfo/BoundChecks/two-loops.ll4
-rw-r--r--polly/test/ScopInfo/NonAffine/div_backedge.ll2
-rw-r--r--polly/test/ScopInfo/NonAffine/div_domain.ll2
-rw-r--r--polly/test/ScopInfo/NonAffine/invariant_loads_dependent_in_non_affine_region.ll2
-rw-r--r--polly/test/ScopInfo/NonAffine/modulo_backedge.ll2
-rw-r--r--polly/test/ScopInfo/NonAffine/modulo_domain.ll2
-rw-r--r--polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_1.ll4
-rw-r--r--polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_2.ll6
-rw-r--r--polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_3.ll6
-rw-r--r--polly/test/ScopInfo/NonAffine/non_affine_access_with_range_2.ll2
-rw-r--r--polly/test/ScopInfo/NonAffine/non_affine_but_sdiv.ll2
-rw-r--r--polly/test/ScopInfo/NonAffine/non_affine_but_srem.ll2
-rw-r--r--polly/test/ScopInfo/NonAffine/non_affine_conditional_nested.ll2
-rw-r--r--polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_affine_loop.ll8
-rw-r--r--polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_non_affine_loop.ll12
-rw-r--r--polly/test/ScopInfo/NonAffine/non_affine_float_compare.ll2
-rw-r--r--polly/test/ScopInfo/NonAffine/non_affine_loop_condition.ll6
-rw-r--r--polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll4
-rw-r--r--polly/test/ScopInfo/NonAffine/non_affine_parametric_loop.ll2
-rw-r--r--polly/test/ScopInfo/NonAffine/non_affine_region_guaranteed_non-entry.ll2
-rw-r--r--polly/test/ScopInfo/NonAffine/whole-scop-non-affine-subregion-in-loop.ll2
-rw-r--r--polly/test/ScopInfo/aliasing_conditional_alias_groups_1.ll2
-rw-r--r--polly/test/ScopInfo/aliasing_conditional_alias_groups_2.ll2
-rw-r--r--polly/test/ScopInfo/aliasing_dead_access.ll2
-rw-r--r--polly/test/ScopInfo/aliasing_many_arrays_to_compare.ll8
-rw-r--r--polly/test/ScopInfo/aliasing_many_read_only_acesses.ll2
-rw-r--r--polly/test/ScopInfo/aliasing_multiple_alias_groups.ll4
-rw-r--r--polly/test/ScopInfo/aliasing_with_non_affine_access.ll2
-rw-r--r--polly/test/ScopInfo/allow-all-parameters-dereferencable.ll12
-rw-r--r--polly/test/ScopInfo/assume_gep_bounds.ll4
-rw-r--r--polly/test/ScopInfo/assume_gep_bounds_2.ll2
-rw-r--r--polly/test/ScopInfo/assume_gep_bounds_many.ll4
-rw-r--r--polly/test/ScopInfo/avoid_new_parameters_from_geps.ll2
-rw-r--r--polly/test/ScopInfo/bool-addrec.ll2
-rw-r--r--polly/test/ScopInfo/bounded_loop_assumptions.ll2
-rw-r--r--polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-2.ll4
-rw-r--r--polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-3.ll6
-rw-r--r--polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations.ll6
-rw-r--r--polly/test/ScopInfo/bug_2010_10_22.ll2
-rw-r--r--polly/test/ScopInfo/bug_2011_1_5.ll2
-rw-r--r--polly/test/ScopInfo/bug_scev_not_fully_eval.ll2
-rw-r--r--polly/test/ScopInfo/cfg_consequences.ll2
-rw-r--r--polly/test/ScopInfo/complex-branch-structure.ll2
-rw-r--r--polly/test/ScopInfo/complex-condition.ll2
-rw-r--r--polly/test/ScopInfo/complex-expression.ll2
-rw-r--r--polly/test/ScopInfo/complex-loop-nesting.ll2
-rw-r--r--polly/test/ScopInfo/complex-successor-structure-2.ll2
-rw-r--r--polly/test/ScopInfo/complex-successor-structure-3.ll4
-rw-r--r--polly/test/ScopInfo/complex-successor-structure.ll2
-rw-r--r--polly/test/ScopInfo/complex_domain_binary_condition.ll2
-rw-r--r--polly/test/ScopInfo/complex_execution_context.ll2
-rw-r--r--polly/test/ScopInfo/cond_constant_in_loop.ll2
-rw-r--r--polly/test/ScopInfo/cond_in_loop.ll2
-rw-r--r--polly/test/ScopInfo/condition-after-error-block-2.ll2
-rw-r--r--polly/test/ScopInfo/condition-after-error-block-before-scop.ll2
-rw-r--r--polly/test/ScopInfo/condtion-after-error-block.ll2
-rw-r--r--polly/test/ScopInfo/const_srem_sdiv.ll4
-rw-r--r--polly/test/ScopInfo/constant-non-integer-branch-condition.ll2
-rw-r--r--polly/test/ScopInfo/constant_factor_in_parameter.ll4
-rw-r--r--polly/test/ScopInfo/constant_functions_outside_scop_as_unknown.ll2
-rw-r--r--polly/test/ScopInfo/constant_start_integer.ll2
-rw-r--r--polly/test/ScopInfo/debug_call.ll2
-rw-r--r--polly/test/ScopInfo/delinearize-together-all-data-refs.ll2
-rw-r--r--polly/test/ScopInfo/div_by_zero.ll2
-rw-r--r--polly/test/ScopInfo/do-not-model-error-block-accesses.ll2
-rw-r--r--polly/test/ScopInfo/eager-binary-and-or-conditions.ll4
-rw-r--r--polly/test/ScopInfo/early_exit_for_complex_domains.ll2
-rw-r--r--polly/test/ScopInfo/error-blocks-1.ll2
-rw-r--r--polly/test/ScopInfo/error-blocks-2.ll4
-rw-r--r--polly/test/ScopInfo/escaping_empty_scop.ll2
-rw-r--r--polly/test/ScopInfo/exit-phi-1.ll4
-rw-r--r--polly/test/ScopInfo/exit-phi-2.ll2
-rw-r--r--polly/test/ScopInfo/exit_phi_accesses-2.ll2
-rw-r--r--polly/test/ScopInfo/exit_phi_accesses.ll2
-rw-r--r--polly/test/ScopInfo/expensive-boundary-context.ll4
-rw-r--r--polly/test/ScopInfo/extract_constant_factor_introduces_new_parameter.ll4
-rw-r--r--polly/test/ScopInfo/full-function.ll4
-rw-r--r--polly/test/ScopInfo/granularity_same_name.ll8
-rw-r--r--polly/test/ScopInfo/granularity_scalar-indep.ll2
-rw-r--r--polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi1.ll2
-rw-r--r--polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi2.ll2
-rw-r--r--polly/test/ScopInfo/granularity_scalar-indep_epilogue.ll2
-rw-r--r--polly/test/ScopInfo/granularity_scalar-indep_epilogue_last.ll2
-rw-r--r--polly/test/ScopInfo/granularity_scalar-indep_noepilogue.ll2
-rw-r--r--polly/test/ScopInfo/granularity_scalar-indep_ordered-2.ll2
-rw-r--r--polly/test/ScopInfo/granularity_scalar-indep_ordered.ll2
-rw-r--r--polly/test/ScopInfo/i1_params.ll2
-rw-r--r--polly/test/ScopInfo/infeasible-rtc.ll4
-rw-r--r--polly/test/ScopInfo/infeasible_invalid_context.ll4
-rw-r--r--polly/test/ScopInfo/int2ptr_ptr2int.ll4
-rw-r--r--polly/test/ScopInfo/int2ptr_ptr2int_2.ll8
-rw-r--r--polly/test/ScopInfo/integers.ll2
-rw-r--r--polly/test/ScopInfo/inter-error-bb-dependence.ll2
-rw-r--r--polly/test/ScopInfo/inter_bb_scalar_dep.ll4
-rw-r--r--polly/test/ScopInfo/intra-non-affine-stmt-phi-node.ll4
-rw-r--r--polly/test/ScopInfo/intra_and_inter_bb_scalar_dep.ll4
-rw-r--r--polly/test/ScopInfo/intra_bb_scalar_dep.ll4
-rw-r--r--polly/test/ScopInfo/intrinsics.ll2
-rw-r--r--polly/test/ScopInfo/invalid_add_rec_after_invariant_load_remapping.ll2
-rw-r--r--polly/test/ScopInfo/invalidate_iterator_during_MA_removal.ll2
-rw-r--r--polly/test/ScopInfo/invariant-load-instlist.ll2
-rw-r--r--polly/test/ScopInfo/invariant-loads-leave-read-only-statements.ll4
-rw-r--r--polly/test/ScopInfo/invariant_load.ll2
-rw-r--r--polly/test/ScopInfo/invariant_load_access_classes_different_base_type.ll4
-rw-r--r--polly/test/ScopInfo/invariant_load_access_classes_different_base_type_escaping.ll4
-rw-r--r--polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer.ll4
-rw-r--r--polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer_escaping.ll4
-rw-r--r--polly/test/ScopInfo/invariant_load_addrec_sum.ll2
-rw-r--r--polly/test/ScopInfo/invariant_load_base_pointer.ll2
-rw-r--r--polly/test/ScopInfo/invariant_load_base_pointer_conditional.ll2
-rw-r--r--polly/test/ScopInfo/invariant_load_base_pointer_in_conditional.ll2
-rw-r--r--polly/test/ScopInfo/invariant_load_branch_condition.ll4
-rw-r--r--polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs.ll2
-rw-r--r--polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_2.ll2
-rw-r--r--polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_3.ll2
-rw-r--r--polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4.ll2
-rw-r--r--polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4b.ll2
-rw-r--r--polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4c.ll2
-rw-r--r--polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_5.ll2
-rw-r--r--polly/test/ScopInfo/invariant_load_complex_condition.ll4
-rw-r--r--polly/test/ScopInfo/invariant_load_condition.ll2
-rw-r--r--polly/test/ScopInfo/invariant_load_dereferenceable.ll4
-rw-r--r--polly/test/ScopInfo/invariant_load_distinct_parameter_valuations.ll2
-rw-r--r--polly/test/ScopInfo/invariant_load_in_non_affine.ll4
-rw-r--r--polly/test/ScopInfo/invariant_load_loop_ub.ll4
-rw-r--r--polly/test/ScopInfo/invariant_load_ptr_ptr_noalias.ll4
-rw-r--r--polly/test/ScopInfo/invariant_load_scalar_dep.ll2
-rw-r--r--polly/test/ScopInfo/invariant_load_stmt_domain.ll2
-rw-r--r--polly/test/ScopInfo/invariant_load_zext_parameter-2.ll4
-rw-r--r--polly/test/ScopInfo/invariant_load_zext_parameter.ll4
-rw-r--r--polly/test/ScopInfo/invariant_load_zextended_in_own_execution_context.ll4
-rw-r--r--polly/test/ScopInfo/invariant_loads_complicated_dependences.ll2
-rw-r--r--polly/test/ScopInfo/invariant_loads_cyclic_dependences.ll2
-rw-r--r--polly/test/ScopInfo/invariant_loop_bounds.ll2
-rw-r--r--polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll2
-rw-r--r--polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll2
-rw-r--r--polly/test/ScopInfo/isl_aff_out_of_bounds.ll2
-rw-r--r--polly/test/ScopInfo/isl_trip_count_01.ll2
-rw-r--r--polly/test/ScopInfo/isl_trip_count_02.ll2
-rw-r--r--polly/test/ScopInfo/isl_trip_count_03.ll2
-rw-r--r--polly/test/ScopInfo/isl_trip_count_multiple_exiting_blocks.ll2
-rw-r--r--polly/test/ScopInfo/licm_reduction_nested.ll4
-rw-r--r--polly/test/ScopInfo/long-compile-time-alias-analysis.ll2
-rw-r--r--polly/test/ScopInfo/long-sequence-of-error-blocks-2.ll2
-rw-r--r--polly/test/ScopInfo/long-sequence-of-error-blocks.ll4
-rw-r--r--polly/test/ScopInfo/loop-multiexit-succ-cond.ll4
-rw-r--r--polly/test/ScopInfo/loop_affine_bound_0.ll4
-rw-r--r--polly/test/ScopInfo/loop_affine_bound_1.ll4
-rw-r--r--polly/test/ScopInfo/loop_affine_bound_2.ll4
-rw-r--r--polly/test/ScopInfo/loop_carry.ll2
-rw-r--r--polly/test/ScopInfo/many-scalar-dependences.ll2
-rw-r--r--polly/test/ScopInfo/max-loop-depth.ll2
-rw-r--r--polly/test/ScopInfo/memcpy-raw-source.ll2
-rw-r--r--polly/test/ScopInfo/memcpy.ll4
-rw-r--r--polly/test/ScopInfo/memmove.ll4
-rw-r--r--polly/test/ScopInfo/memset.ll4
-rw-r--r--polly/test/ScopInfo/memset_null.ll4
-rw-r--r--polly/test/ScopInfo/mismatching-array-dimensions.ll2
-rw-r--r--polly/test/ScopInfo/mod_ref_access_pointee_arguments.ll6
-rw-r--r--polly/test/ScopInfo/mod_ref_read_pointee_arguments.ll6
-rw-r--r--polly/test/ScopInfo/mod_ref_read_pointer.ll4
-rw-r--r--polly/test/ScopInfo/mod_ref_read_pointers.ll6
-rw-r--r--polly/test/ScopInfo/modulo_zext_1.ll2
-rw-r--r--polly/test/ScopInfo/modulo_zext_2.ll2
-rw-r--r--polly/test/ScopInfo/modulo_zext_3.ll2
-rw-r--r--polly/test/ScopInfo/multi-scop.ll2
-rw-r--r--polly/test/ScopInfo/multidim_2d-diagonal-matrix.ll4
-rw-r--r--polly/test/ScopInfo/multidim_2d_outer_parametric_offset.ll2
-rw-r--r--polly/test/ScopInfo/multidim_2d_parametric_array_static_loop_bounds.ll2
-rw-r--r--polly/test/ScopInfo/multidim_2d_with_modref_call.ll8
-rw-r--r--polly/test/ScopInfo/multidim_2d_with_modref_call_2.ll8
-rw-r--r--polly/test/ScopInfo/multidim_3d_parametric_array_static_loop_bounds.ll2
-rw-r--r--polly/test/ScopInfo/multidim_fixedsize_different_dimensionality.ll2
-rw-r--r--polly/test/ScopInfo/multidim_fixedsize_multi_offset.ll2
-rw-r--r--polly/test/ScopInfo/multidim_fold_constant_dim.ll2
-rw-r--r--polly/test/ScopInfo/multidim_fold_constant_dim_zero.ll2
-rw-r--r--polly/test/ScopInfo/multidim_fortran_2d.ll4
-rw-r--r--polly/test/ScopInfo/multidim_fortran_2d_params.ll4
-rw-r--r--polly/test/ScopInfo/multidim_fortran_2d_with_modref_call.ll8
-rw-r--r--polly/test/ScopInfo/multidim_fortran_srem.ll2
-rw-r--r--polly/test/ScopInfo/multidim_gep_pointercast.ll2
-rw-r--r--polly/test/ScopInfo/multidim_gep_pointercast2.ll2
-rw-r--r--polly/test/ScopInfo/multidim_ivs_and_integer_offsets_3d.ll2
-rw-r--r--polly/test/ScopInfo/multidim_ivs_and_parameteric_offsets_3d.ll2
-rw-r--r--polly/test/ScopInfo/multidim_many_references.ll4
-rw-r--r--polly/test/ScopInfo/multidim_nested_start_integer.ll4
-rw-r--r--polly/test/ScopInfo/multidim_nested_start_share_parameter.ll2
-rw-r--r--polly/test/ScopInfo/multidim_only_ivs_2d.ll2
-rw-r--r--polly/test/ScopInfo/multidim_only_ivs_3d.ll2
-rw-r--r--polly/test/ScopInfo/multidim_only_ivs_3d_cast.ll2
-rw-r--r--polly/test/ScopInfo/multidim_only_ivs_3d_reverse.ll2
-rw-r--r--polly/test/ScopInfo/multidim_param_in_subscript-2.ll2
-rw-r--r--polly/test/ScopInfo/multidim_param_in_subscript.ll2
-rw-r--r--polly/test/ScopInfo/multidim_parameter_addrec_product.ll2
-rw-r--r--polly/test/ScopInfo/multidim_single_and_multidim_array.ll16
-rw-r--r--polly/test/ScopInfo/multidim_srem.ll2
-rw-r--r--polly/test/ScopInfo/multidim_with_bitcast.ll2
-rw-r--r--polly/test/ScopInfo/multiple-binary-or-conditions.ll4
-rw-r--r--polly/test/ScopInfo/multiple-types-access-offset-not-dividable-by-element-size.ll2
-rw-r--r--polly/test/ScopInfo/multiple-types-non-affine-2.ll4
-rw-r--r--polly/test/ScopInfo/multiple-types-non-affine.ll4
-rw-r--r--polly/test/ScopInfo/multiple-types-non-power-of-two-2.ll2
-rw-r--r--polly/test/ScopInfo/multiple-types-non-power-of-two.ll2
-rw-r--r--polly/test/ScopInfo/multiple-types-two-dimensional-2.ll2
-rw-r--r--polly/test/ScopInfo/multiple-types-two-dimensional.ll2
-rw-r--r--polly/test/ScopInfo/multiple-types.ll4
-rw-r--r--polly/test/ScopInfo/multiple_exiting_blocks.ll2
-rw-r--r--polly/test/ScopInfo/multiple_exiting_blocks_two_loop.ll2
-rw-r--r--polly/test/ScopInfo/multiple_latch_blocks.ll2
-rw-r--r--polly/test/ScopInfo/nested-loops.ll2
-rw-r--r--polly/test/ScopInfo/no-scalar-deps-in-non-affine-subregion.ll2
-rw-r--r--polly/test/ScopInfo/non-affine-region-phi.ll4
-rw-r--r--polly/test/ScopInfo/non-affine-region-with-loop-2.ll2
-rw-r--r--polly/test/ScopInfo/non-affine-region-with-loop.ll4
-rw-r--r--polly/test/ScopInfo/non-precise-inv-load-1.ll2
-rw-r--r--polly/test/ScopInfo/non-precise-inv-load-2.ll2
-rw-r--r--polly/test/ScopInfo/non-precise-inv-load-3.ll2
-rw-r--r--polly/test/ScopInfo/non-precise-inv-load-4.ll2
-rw-r--r--polly/test/ScopInfo/non-precise-inv-load-5.ll2
-rw-r--r--polly/test/ScopInfo/non-precise-inv-load-6.ll2
-rw-r--r--polly/test/ScopInfo/non-pure-function-call.ll2
-rw-r--r--polly/test/ScopInfo/non-pure-function-calls-causes-dead-blocks.ll2
-rw-r--r--polly/test/ScopInfo/non-pure-function-calls.ll2
-rw-r--r--polly/test/ScopInfo/non_affine_access.ll4
-rw-r--r--polly/test/ScopInfo/non_affine_region_1.ll2
-rw-r--r--polly/test/ScopInfo/non_affine_region_2.ll2
-rw-r--r--polly/test/ScopInfo/non_affine_region_3.ll4
-rw-r--r--polly/test/ScopInfo/non_affine_region_4.ll2
-rw-r--r--polly/test/ScopInfo/nonaffine-buildMemoryAccess.ll2
-rw-r--r--polly/test/ScopInfo/not-a-reduction.ll2
-rw-r--r--polly/test/ScopInfo/opaque-struct.ll2
-rw-r--r--polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node-nonaffine-subregion.ll2
-rw-r--r--polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node.ll2
-rw-r--r--polly/test/ScopInfo/parameter-constant-division.ll4
-rw-r--r--polly/test/ScopInfo/parameter_in_dead_statement.ll8
-rw-r--r--polly/test/ScopInfo/parameter_product.ll2
-rw-r--r--polly/test/ScopInfo/parameter_with_constant_factor_in_add.ll2
-rw-r--r--polly/test/ScopInfo/partially_invariant_load_1.ll4
-rw-r--r--polly/test/ScopInfo/partially_invariant_load_2.ll2
-rw-r--r--polly/test/ScopInfo/phi-in-non-affine-region.ll2
-rw-r--r--polly/test/ScopInfo/phi_after_error_block.ll2
-rw-r--r--polly/test/ScopInfo/phi_condition_modeling_1.ll2
-rw-r--r--polly/test/ScopInfo/phi_condition_modeling_2.ll2
-rw-r--r--polly/test/ScopInfo/phi_conditional_simple_1.ll2
-rw-r--r--polly/test/ScopInfo/phi_loop_carried_float.ll2
-rw-r--r--polly/test/ScopInfo/phi_not_grouped_at_top.ll2
-rw-r--r--polly/test/ScopInfo/phi_scalar_simple_1.ll2
-rw-r--r--polly/test/ScopInfo/phi_scalar_simple_2.ll2
-rw-r--r--polly/test/ScopInfo/phi_with_invoke_edge.ll2
-rw-r--r--polly/test/ScopInfo/pointer-comparison-no-nsw.ll2
-rw-r--r--polly/test/ScopInfo/pointer-comparison.ll2
-rw-r--r--polly/test/ScopInfo/pointer-type-expressions.ll2
-rw-r--r--polly/test/ScopInfo/pointer-used-as-base-pointer-and-scalar-read.ll2
-rw-r--r--polly/test/ScopInfo/polly-timeout-parameter-bounds.ll2
-rw-r--r--polly/test/ScopInfo/preserve-equiv-class-order-in-basic_block.ll2
-rw-r--r--polly/test/ScopInfo/process_added_dimensions.ll2
-rw-r--r--polly/test/ScopInfo/pwaff-complexity-bailout.ll2
-rw-r--r--polly/test/ScopInfo/ranged_parameter.ll2
-rw-r--r--polly/test/ScopInfo/ranged_parameter_2.ll2
-rw-r--r--polly/test/ScopInfo/ranged_parameter_wrap.ll2
-rw-r--r--polly/test/ScopInfo/ranged_parameter_wrap_2.ll2
-rw-r--r--polly/test/ScopInfo/read-only-scalar-used-in-phi-2.ll2
-rw-r--r--polly/test/ScopInfo/read-only-scalar-used-in-phi.ll2
-rw-r--r--polly/test/ScopInfo/read-only-scalars.ll4
-rw-r--r--polly/test/ScopInfo/read-only-statements.ll2
-rw-r--r--polly/test/ScopInfo/reduction_alternating_base.ll2
-rw-r--r--polly/test/ScopInfo/reduction_chain_partially_outside_the_scop.ll2
-rw-r--r--polly/test/ScopInfo/reduction_different_index.ll2
-rw-r--r--polly/test/ScopInfo/reduction_different_index1.ll2
-rw-r--r--polly/test/ScopInfo/reduction_disabled_multiplicative.ll2
-rw-r--r--polly/test/ScopInfo/reduction_escaping_intermediate.ll2
-rw-r--r--polly/test/ScopInfo/reduction_escaping_intermediate_2.ll2
-rw-r--r--polly/test/ScopInfo/reduction_invalid_different_operators.ll2
-rw-r--r--polly/test/ScopInfo/reduction_invalid_overlapping_accesses.ll2
-rw-r--r--polly/test/ScopInfo/reduction_multiple_loops_array_sum.ll2
-rw-r--r--polly/test/ScopInfo/reduction_multiple_loops_array_sum_1.ll2
-rw-r--r--polly/test/ScopInfo/reduction_multiple_simple_binary.ll2
-rw-r--r--polly/test/ScopInfo/reduction_non_overlapping_chains.ll2
-rw-r--r--polly/test/ScopInfo/reduction_only_reduction_like_access.ll2
-rw-r--r--polly/test/ScopInfo/reduction_simple_fp.ll2
-rw-r--r--polly/test/ScopInfo/reduction_simple_w_constant.ll2
-rw-r--r--polly/test/ScopInfo/reduction_simple_w_iv.ll2
-rw-r--r--polly/test/ScopInfo/reduction_two_identical_reads.ll4
-rw-r--r--polly/test/ScopInfo/redundant_parameter_constraint.ll2
-rw-r--r--polly/test/ScopInfo/region-with-instructions.ll2
-rw-r--r--polly/test/ScopInfo/remarks.ll2
-rw-r--r--polly/test/ScopInfo/required-invariant-loop-bounds.ll4
-rw-r--r--polly/test/ScopInfo/restriction_in_dead_block.ll2
-rw-r--r--polly/test/ScopInfo/run-time-check-many-array-disjuncts.ll4
-rw-r--r--polly/test/ScopInfo/run-time-check-many-parameters.ll2
-rw-r--r--polly/test/ScopInfo/run-time-check-many-piecewise-aliasing.ll4
-rw-r--r--polly/test/ScopInfo/run-time-check-read-only-arrays.ll2
-rw-r--r--polly/test/ScopInfo/same-base-address-scalar-and-array.ll2
-rw-r--r--polly/test/ScopInfo/scalar.ll2
-rw-r--r--polly/test/ScopInfo/scalar_dependence_cond_br.ll2
-rw-r--r--polly/test/ScopInfo/scalar_to_array.ll4
-rw-r--r--polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll2
-rw-r--r--polly/test/ScopInfo/scev-invalidated.ll2
-rw-r--r--polly/test/ScopInfo/schedule-const-post-dominator-walk-2.ll2
-rw-r--r--polly/test/ScopInfo/schedule-const-post-dominator-walk.ll2
-rw-r--r--polly/test/ScopInfo/schedule-constuction-endless-loop1.ll2
-rw-r--r--polly/test/ScopInfo/schedule-constuction-endless-loop2.ll2
-rw-r--r--polly/test/ScopInfo/schedule-incorrectly-contructed-in-case-of-infinite-loop.ll2
-rw-r--r--polly/test/ScopInfo/scop-affine-parameter-ordering.ll2
-rw-r--r--polly/test/ScopInfo/sign_wrapped_set.ll2
-rw-r--r--polly/test/ScopInfo/simple_loop_1.ll2
-rw-r--r--polly/test/ScopInfo/simple_loop_2.ll2
-rw-r--r--polly/test/ScopInfo/simple_loop_unsigned.ll2
-rw-r--r--polly/test/ScopInfo/simple_loop_unsigned_2.ll2
-rw-r--r--polly/test/ScopInfo/simple_loop_unsigned_3.ll2
-rw-r--r--polly/test/ScopInfo/simple_nonaffine_loop_not.ll2
-rw-r--r--polly/test/ScopInfo/smax.ll2
-rw-r--r--polly/test/ScopInfo/statistics.ll2
-rw-r--r--polly/test/ScopInfo/stmt_split_exit_of_region_stmt.ll2
-rw-r--r--polly/test/ScopInfo/stmt_split_no_after_split.ll2
-rw-r--r--polly/test/ScopInfo/stmt_split_no_dependence.ll2
-rw-r--r--polly/test/ScopInfo/stmt_split_on_store.ll2
-rw-r--r--polly/test/ScopInfo/stmt_split_on_synthesizable.ll2
-rw-r--r--polly/test/ScopInfo/stmt_split_phi_in_beginning_bb.ll2
-rw-r--r--polly/test/ScopInfo/stmt_split_phi_in_stmt.ll2
-rw-r--r--polly/test/ScopInfo/stmt_split_scalar_dependence.ll2
-rw-r--r--polly/test/ScopInfo/stmt_split_within_loop.ll2
-rw-r--r--polly/test/ScopInfo/stmt_with_read_but_without_sideffect.ll2
-rw-r--r--polly/test/ScopInfo/switch-1.ll4
-rw-r--r--polly/test/ScopInfo/switch-2.ll4
-rw-r--r--polly/test/ScopInfo/switch-3.ll4
-rw-r--r--polly/test/ScopInfo/switch-4.ll4
-rw-r--r--polly/test/ScopInfo/switch-5.ll4
-rw-r--r--polly/test/ScopInfo/switch-6.ll4
-rw-r--r--polly/test/ScopInfo/switch-7.ll5
-rw-r--r--polly/test/ScopInfo/tempscop-printing.ll2
-rw-r--r--polly/test/ScopInfo/test-wrapping-in-condition.ll4
-rw-r--r--polly/test/ScopInfo/truncate-1.ll2
-rw-r--r--polly/test/ScopInfo/truncate-2.ll2
-rw-r--r--polly/test/ScopInfo/truncate-3.ll2
-rw-r--r--polly/test/ScopInfo/two-loops-one-infinite.ll2
-rw-r--r--polly/test/ScopInfo/two-loops-right-after-each-other.ll2
-rw-r--r--polly/test/ScopInfo/undef_in_cond.ll2
-rw-r--r--polly/test/ScopInfo/unnamed_nonaffine.ll4
-rw-r--r--polly/test/ScopInfo/unnamed_stmts.ll2
-rw-r--r--polly/test/ScopInfo/unpredictable_nonscop_loop.ll2
-rw-r--r--polly/test/ScopInfo/unprofitable_scalar-accs.ll4
-rw-r--r--polly/test/ScopInfo/unsigned-condition.ll2
-rw-r--r--polly/test/ScopInfo/unsigned-division-1.ll2
-rw-r--r--polly/test/ScopInfo/unsigned-division-2.ll2
-rw-r--r--polly/test/ScopInfo/unsigned-division-3.ll2
-rw-r--r--polly/test/ScopInfo/unsigned-division-4.ll2
-rw-r--r--polly/test/ScopInfo/unsigned-division-5.ll2
-rw-r--r--polly/test/ScopInfo/unsigned_wrap_uge.ll2
-rw-r--r--polly/test/ScopInfo/unsigned_wrap_ugt.ll2
-rw-r--r--polly/test/ScopInfo/unsigned_wrap_ule.ll2
-rw-r--r--polly/test/ScopInfo/unsigned_wrap_ult.ll2
-rw-r--r--polly/test/ScopInfo/user_context.ll8
-rw-r--r--polly/test/ScopInfo/user_provided_assumptions-in-bb-signed-conditional.ll4
-rw-r--r--polly/test/ScopInfo/user_provided_assumptions-in-bb-signed.ll2
-rw-r--r--polly/test/ScopInfo/user_provided_assumptions-in-bb-unsigned.ll4
-rw-r--r--polly/test/ScopInfo/user_provided_assumptions.ll4
-rw-r--r--polly/test/ScopInfo/user_provided_assumptions_2.ll4
-rw-r--r--polly/test/ScopInfo/user_provided_assumptions_3.ll4
-rw-r--r--polly/test/ScopInfo/user_provided_non_dominating_assumptions.ll4
-rw-r--r--polly/test/ScopInfo/variant_base_pointer.ll4
-rw-r--r--polly/test/ScopInfo/variant_load_empty_domain.ll2
-rw-r--r--polly/test/ScopInfo/wraping_signed_expr_0.ll2
-rw-r--r--polly/test/ScopInfo/wraping_signed_expr_1.ll2
-rw-r--r--polly/test/ScopInfo/wraping_signed_expr_2.ll2
-rw-r--r--polly/test/ScopInfo/wraping_signed_expr_3.ll2
-rw-r--r--polly/test/ScopInfo/wraping_signed_expr_4.ll2
-rw-r--r--polly/test/ScopInfo/wraping_signed_expr_5.ll2
-rw-r--r--polly/test/ScopInfo/wraping_signed_expr_6.ll2
-rw-r--r--polly/test/ScopInfo/wraping_signed_expr_7.ll2
-rw-r--r--polly/test/ScopInfo/wraping_signed_expr_slow_1.ll2
-rw-r--r--polly/test/ScopInfo/wraping_signed_expr_slow_2.ll2
-rw-r--r--polly/test/ScopInfo/zero_ext_of_truncate.ll2
-rw-r--r--polly/test/ScopInfo/zero_ext_of_truncate_2.ll2
-rw-r--r--polly/test/ScopInfo/zero_ext_space_mismatch.ll2
-rw-r--r--polly/test/ScopInliner/invariant-load-func.ll4
-rw-r--r--polly/test/Simplify/coalesce_3partials.ll2
-rw-r--r--polly/test/Simplify/coalesce_disjointelements.ll2
-rw-r--r--polly/test/Simplify/coalesce_overlapping.ll2
-rw-r--r--polly/test/Simplify/coalesce_partial.ll2
-rw-r--r--polly/test/Simplify/dead_access_load.ll1
-rw-r--r--polly/test/Simplify/dead_access_phi.ll1
-rw-r--r--polly/test/Simplify/dead_access_value.ll1
-rw-r--r--polly/test/Simplify/dead_instruction.ll1
-rw-r--r--polly/test/Simplify/emptyaccessdomain.ll2
-rw-r--r--polly/test/Simplify/exit_phi_accesses-2.ll2
-rw-r--r--polly/test/Simplify/func-b320a7.ll2
-rw-r--r--polly/test/Simplify/gemm.ll2
-rw-r--r--polly/test/Simplify/nocoalesce_differentvalues.ll2
-rw-r--r--polly/test/Simplify/nocoalesce_elementmismatch.ll2
-rw-r--r--polly/test/Simplify/nocoalesce_readbetween.ll2
-rw-r--r--polly/test/Simplify/nocoalesce_writebetween.ll2
-rw-r--r--polly/test/Simplify/notdead_region_exitphi.ll1
-rw-r--r--polly/test/Simplify/notdead_region_innerphi.ll1
-rw-r--r--polly/test/Simplify/notredundant_region_loop.ll2
-rw-r--r--polly/test/Simplify/notredundant_region_middle.ll1
-rw-r--r--polly/test/Simplify/notredundant_synthesizable_unknownit.ll1
-rw-r--r--polly/test/Simplify/out-of-scop-use-in-region-entry-phi-node.ll2
-rw-r--r--polly/test/Simplify/overwritten.ll1
-rw-r--r--polly/test/Simplify/overwritten_3phi.ll2
-rw-r--r--polly/test/Simplify/overwritten_3store.ll1
-rw-r--r--polly/test/Simplify/overwritten_implicit_and_explicit.ll2
-rw-r--r--polly/test/Simplify/overwritten_loadbetween.ll1
-rw-r--r--polly/test/Simplify/overwritten_scalar.ll2
-rw-r--r--polly/test/Simplify/pass_existence.ll1
-rw-r--r--polly/test/Simplify/phi_in_regionstmt.ll1
-rw-r--r--polly/test/Simplify/pr33323.ll2
-rw-r--r--polly/test/Simplify/redundant.ll1
-rw-r--r--polly/test/Simplify/redundant_differentindex.ll1
-rw-r--r--polly/test/Simplify/redundant_region.ll2
-rw-r--r--polly/test/Simplify/redundant_region_scalar.ll2
-rw-r--r--polly/test/Simplify/redundant_scalarwrite.ll2
-rw-r--r--polly/test/Simplify/redundant_storebetween.ll1
-rw-r--r--polly/test/Simplify/scalability1.ll2
-rw-r--r--polly/test/Simplify/scalability2.ll2
-rw-r--r--polly/test/Simplify/sweep_mapped_phi.ll2
-rw-r--r--polly/test/Simplify/sweep_mapped_value.ll2
-rw-r--r--polly/test/Simplify/ununsed_read_in_region_entry.ll4
-rw-r--r--polly/test/Support/Plugins.ll2
-rw-r--r--polly/test/Support/isl-args.ll8
-rw-r--r--polly/test/lit.site.cfg.in1
-rw-r--r--polly/test/polly.ll2
-rw-r--r--utils/bazel/.bazelrc3
-rw-r--r--utils/bazel/llvm-project-overlay/bolt/BUILD.bazel5
-rw-r--r--utils/bazel/llvm-project-overlay/llvm/BUILD.bazel1
-rw-r--r--utils/bazel/llvm-project-overlay/llvm/driver.bzl1
-rw-r--r--utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h3
-rw-r--r--utils/bazel/llvm-project-overlay/mlir/BUILD.bazel13
-rw-r--r--utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel7
-rw-r--r--utils/bazel/llvm_configs/config.h.cmake3
2554 files changed, 52298 insertions, 22649 deletions
diff --git a/.ci/generate-buildkite-pipeline-premerge b/.ci/generate-buildkite-pipeline-premerge
index 78a9cb7..e1c66ac 100755
--- a/.ci/generate-buildkite-pipeline-premerge
+++ b/.ci/generate-buildkite-pipeline-premerge
@@ -68,7 +68,7 @@ function compute-projects-to-test() {
done
;;
clang)
- for p in clang-tools-extra compiler-rt flang lldb cross-project-tests; do
+ for p in clang-tools-extra compiler-rt lldb cross-project-tests; do
echo $p
done
;;
diff --git a/bolt/docs/BAT.md b/bolt/docs/BAT.md
index 7ffb5d7..817ad28 100644
--- a/bolt/docs/BAT.md
+++ b/bolt/docs/BAT.md
@@ -106,9 +106,14 @@ equals output offset.
`BRANCHENTRY` bit denotes whether a given offset pair is a control flow source
(branch or call instruction). If not set, it signifies a control flow target
(basic block offset).
+
`InputAddr` is omitted for equal offsets in input and output function. In this
case, `BRANCHENTRY` bits are encoded separately in a `BranchEntries` bitvector.
+Deleted basic blocks are emitted as having `OutputOffset` equal to the size of
+the function. They don't affect address translation and only participate in
+input basic block mapping.
+
### Secondary Entry Points table
The table is emitted for hot fragments only. It contains `NumSecEntryPoints`
offsets denoting secondary entry points, delta encoded, implicitly starting at zero.
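
Editor's note: the BAT.md text above describes the secondary entry point offsets as delta encoded and implicitly starting at zero. The following is an illustrative sketch only (not BOLT's actual reader); the function name and the use of a plain vector of decoded deltas are assumptions made for clarity.

    // Decode a delta-encoded list of NumSecEntryPoints offsets that
    // implicitly starts at zero, as described in the documentation above.
    #include <cstdint>
    #include <vector>

    std::vector<uint64_t>
    decodeSecondaryEntryPoints(const std::vector<uint64_t> &Deltas) {
      std::vector<uint64_t> Offsets;
      uint64_t Current = 0;          // implicitly starts at zero
      for (uint64_t Delta : Deltas) {
        Current += Delta;            // each entry is relative to the previous one
        Offsets.push_back(Current);  // absolute offset of a secondary entry point
      }
      return Offsets;
    }
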
diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h
index 7576581..4ec3de3 100644
--- a/bolt/include/bolt/Core/BinaryContext.h
+++ b/bolt/include/bolt/Core/BinaryContext.h
@@ -17,6 +17,7 @@
#include "bolt/Core/BinaryData.h"
#include "bolt/Core/BinarySection.h"
#include "bolt/Core/DebugData.h"
+#include "bolt/Core/DynoStats.h"
#include "bolt/Core/JumpTable.h"
#include "bolt/Core/MCPlusBuilder.h"
#include "bolt/RuntimeLibs/RuntimeLibrary.h"
@@ -359,7 +360,7 @@ public:
void setFileBuildID(StringRef ID) { FileBuildID = std::string(ID); }
bool hasSymbolsWithFileName() const { return HasSymbolsWithFileName; }
- void setHasSymbolsWithFileName(bool Value) { HasSymbolsWithFileName = true; }
+ void setHasSymbolsWithFileName(bool Value) { HasSymbolsWithFileName = Value; }
/// Return true if relocations against symbol with a given name
/// must be created.
@@ -677,6 +678,9 @@ public:
/// have an origin file name available.
bool HasSymbolsWithFileName{false};
+ /// Does the binary have BAT section.
+ bool HasBATSection{false};
+
/// Sum of execution count of all functions
uint64_t SumExecutionCount{0};
@@ -714,6 +718,9 @@ public:
uint64_t NumStaleBlocksWithEqualIcount{0};
} Stats;
+ // Original binary execution count stats.
+ DynoStats InitialDynoStats;
+
// Address of the first allocated segment.
uint64_t FirstAllocAddress{std::numeric_limits<uint64_t>::max()};
@@ -1217,8 +1224,7 @@ public:
/// Return a signed value of \p Size stored at \p Address. The address has
/// to be a valid statically allocated address for the binary.
- ErrorOr<uint64_t> getSignedValueAtAddress(uint64_t Address,
- size_t Size) const;
+ ErrorOr<int64_t> getSignedValueAtAddress(uint64_t Address, size_t Size) const;
/// Special case of getUnsignedValueAtAddress() that uses a pointer size.
ErrorOr<uint64_t> getPointerAtAddress(uint64_t Address) const {
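
Editor's note: the hunk above narrows getSignedValueAtAddress to return ErrorOr<int64_t>. A minimal sketch of why the signed return type matters, assuming the raw bytes are first read as an unsigned value of the requested size; this is not the BOLT implementation.

    // Sign-extend a Size-byte value read from the binary. Reading the 4-byte
    // pattern 0xFFFFFFFC should come back as -4, which is only expressible
    // once the result type is signed.
    #include <cstdint>

    int64_t signExtend(uint64_t Raw, size_t Size) {
      const unsigned Bits = Size * 8;
      if (Bits >= 64)
        return static_cast<int64_t>(Raw);
      const uint64_t SignBit = 1ULL << (Bits - 1);
      // Standard sign-extension trick: flip the sign bit, then subtract it.
      return static_cast<int64_t>((Raw ^ SignBit) - SignBit);
    }

    // signExtend(0xFFFFFFFC, 4) == -4; with a uint64_t return value the caller
    // would instead see 0x00000000FFFFFFFC and have to re-extend it manually.
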
diff --git a/bolt/include/bolt/Passes/BinaryPasses.h b/bolt/include/bolt/Passes/BinaryPasses.h
index 5d76925..ad8473c 100644
--- a/bolt/include/bolt/Passes/BinaryPasses.h
+++ b/bolt/include/bolt/Passes/BinaryPasses.h
@@ -16,6 +16,7 @@
#include "bolt/Core/BinaryContext.h"
#include "bolt/Core/BinaryFunction.h"
#include "bolt/Core/DynoStats.h"
+#include "bolt/Profile/BoltAddressTranslation.h"
#include "llvm/Support/CommandLine.h"
#include <atomic>
#include <set>
@@ -52,15 +53,31 @@ public:
virtual Error runOnFunctions(BinaryContext &BC) = 0;
};
+/// A pass to set initial program-wide dynostats.
+class DynoStatsSetPass : public BinaryFunctionPass {
+public:
+ DynoStatsSetPass() : BinaryFunctionPass(false) {}
+
+ const char *getName() const override {
+ return "set dyno-stats before optimizations";
+ }
+
+ bool shouldPrint(const BinaryFunction &BF) const override { return false; }
+
+ Error runOnFunctions(BinaryContext &BC) override {
+ BC.InitialDynoStats = getDynoStats(BC.getBinaryFunctions(), BC.isAArch64());
+ return Error::success();
+ }
+};
+
/// A pass to print program-wide dynostats.
class DynoStatsPrintPass : public BinaryFunctionPass {
protected:
- DynoStats PrevDynoStats;
std::string Title;
public:
- DynoStatsPrintPass(const DynoStats &PrevDynoStats, const char *Title)
- : BinaryFunctionPass(false), PrevDynoStats(PrevDynoStats), Title(Title) {}
+ DynoStatsPrintPass(const char *Title)
+ : BinaryFunctionPass(false), Title(Title) {}
const char *getName() const override {
return "print dyno-stats after optimizations";
@@ -69,6 +86,7 @@ public:
bool shouldPrint(const BinaryFunction &BF) const override { return false; }
Error runOnFunctions(BinaryContext &BC) override {
+ const DynoStats PrevDynoStats = BC.InitialDynoStats;
const DynoStats NewDynoStats =
getDynoStats(BC.getBinaryFunctions(), BC.isAArch64());
const bool Changed = (NewDynoStats != PrevDynoStats);
@@ -399,8 +417,11 @@ public:
/// Prints a list of the top 100 functions sorted by a set of
/// dyno stats categories.
class PrintProgramStats : public BinaryFunctionPass {
+ BoltAddressTranslation *BAT = nullptr;
+
public:
- explicit PrintProgramStats() : BinaryFunctionPass(false) {}
+ explicit PrintProgramStats(BoltAddressTranslation *BAT = nullptr)
+ : BinaryFunctionPass(false), BAT(BAT) {}
const char *getName() const override { return "print-stats"; }
bool shouldPrint(const BinaryFunction &) const override { return false; }
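
Editor's note: the refactor above moves the "before" snapshot out of DynoStatsPrintPass's constructor and into BinaryContext::InitialDynoStats, filled by the new DynoStatsSetPass. A rough sketch of the intended ordering, assuming a BinaryPassManager-style registerPass() API; only the ordering matters here: snapshot first, optimizations in between, comparison last.

    Manager.registerPass(std::make_unique<DynoStatsSetPass>());   // records BC.InitialDynoStats
    // ... register the optimization passes here ...
    Manager.registerPass(
        std::make_unique<DynoStatsPrintPass>("after all optimizations"));
    // DynoStatsPrintPass now reads BC.InitialDynoStats instead of a value
    // captured at construction time, so the stats no longer have to exist
    // when the pass is created.
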
diff --git a/bolt/include/bolt/Passes/MCF.h b/bolt/include/bolt/Passes/MCF.h
index feac7f8..3fe6744 100644
--- a/bolt/include/bolt/Passes/MCF.h
+++ b/bolt/include/bolt/Passes/MCF.h
@@ -9,20 +9,14 @@
#ifndef BOLT_PASSES_MCF_H
#define BOLT_PASSES_MCF_H
+#include "bolt/Passes/BinaryPasses.h"
+#include "llvm/Support/CommandLine.h"
+
namespace llvm {
namespace bolt {
-class BinaryFunction;
class DataflowInfoManager;
-enum MCFCostFunction : char {
- MCF_DISABLE = 0,
- MCF_LINEAR,
- MCF_QUADRATIC,
- MCF_LOG,
- MCF_BLAMEFTS
-};
-
/// Implement the idea in "SamplePGO - The Power of Profile Guided Optimizations
/// without the Usability Burden" by Diego Novillo to make basic block counts
/// equal if we show that A dominates B, B post-dominates A and they are in the
@@ -31,23 +25,18 @@ void equalizeBBCounts(DataflowInfoManager &Info, BinaryFunction &BF);
/// Fill edge counts based on the basic block count. Used in nonLBR mode when
/// we only have bb count.
-void estimateEdgeCounts(BinaryFunction &BF);
-
-/// Entry point for computing a min-cost flow for the CFG with the goal
-/// of fixing the flow of the CFG edges, that is, making sure it obeys the
-/// flow-conservation equation SumInEdges = SumOutEdges.
-///
-/// To do this, we create an instance of the min-cost flow problem in a
-/// similar way as the one discussed in the work of Roy Levin "Completing
-/// Incomplete Edge Profile by Applying Minimum Cost Circulation Algorithms".
-/// We do a few things differently, though. We don't populate edge counts using
-/// weights coming from a static branch prediction technique and we don't
-/// use the same cost function.
-///
-/// If cost function BlameFTs is used, assign all remaining flow to
-/// fall-throughs. This is used when the sampling is based on taken branches
-/// that do not account for them.
-void solveMCF(BinaryFunction &BF, MCFCostFunction CostFunction);
+class EstimateEdgeCounts : public BinaryFunctionPass {
+ void runOnFunction(BinaryFunction &BF);
+
+public:
+ explicit EstimateEdgeCounts(const cl::opt<bool> &PrintPass)
+ : BinaryFunctionPass(PrintPass) {}
+
+ const char *getName() const override { return "estimate-edge-counts"; }
+
+ /// Pass entry point
+ Error runOnFunctions(BinaryContext &BC) override;
+};
} // end namespace bolt
} // end namespace llvm
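
Editor's note: the MCF.h hunk converts the estimateEdgeCounts() free function into an EstimateEdgeCounts pass. A sketch of the shape such a runOnFunctions driver typically takes; the real body may differ, and the hasValidProfile() filter is an assumption for illustration.

    Error EstimateEdgeCounts::runOnFunctions(BinaryContext &BC) {
      for (auto &It : BC.getBinaryFunctions()) {
        BinaryFunction &BF = It.second;
        if (!BF.hasValidProfile())  // assumed filter: only profiled functions
          continue;
        runOnFunction(BF);          // fill edge counts from basic block counts
      }
      return Error::success();
    }
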
diff --git a/bolt/include/bolt/Passes/StokeInfo.h b/bolt/include/bolt/Passes/StokeInfo.h
index 76417e6..a18c2a0 100644
--- a/bolt/include/bolt/Passes/StokeInfo.h
+++ b/bolt/include/bolt/Passes/StokeInfo.h
@@ -87,10 +87,10 @@ struct StokeFuncInfo {
<< "," << NumBlocks << "," << IsLoopFree << "," << NumLoops << ","
<< MaxLoopDepth << "," << HotSize << "," << TotalSize << ","
<< Score << "," << HasCall << ",\"{ ";
- for (std::string S : DefIn)
+ for (const std::string &S : DefIn)
Outfile << "%" << S << " ";
Outfile << "}\",\"{ ";
- for (std::string S : LiveOut)
+ for (const std::string &S : LiveOut)
Outfile << "%" << S << " ";
Outfile << "}\"," << HeapOut << "," << StackOut << "," << HasRipAddr
<< "," << Omitted << "\n";
diff --git a/bolt/include/bolt/Profile/BoltAddressTranslation.h b/bolt/include/bolt/Profile/BoltAddressTranslation.h
index 68b993e..65b9ba8 100644
--- a/bolt/include/bolt/Profile/BoltAddressTranslation.h
+++ b/bolt/include/bolt/Profile/BoltAddressTranslation.h
@@ -70,7 +70,7 @@ class BinaryFunction;
class BoltAddressTranslation {
public:
// In-memory representation of the address translation table
- using MapTy = std::map<uint32_t, uint32_t>;
+ using MapTy = std::multimap<uint32_t, uint32_t>;
// List of taken fall-throughs
using FallthroughListTy = SmallVector<std::pair<uint64_t, uint64_t>, 16>;
@@ -90,7 +90,7 @@ public:
std::error_code parse(raw_ostream &OS, StringRef Buf);
/// Dump the parsed address translation tables
- void dump(raw_ostream &OS);
+ void dump(raw_ostream &OS) const;
/// If the maps are loaded in memory, perform the lookup to translate LBR
/// addresses in function located at \p FuncAddress.
@@ -107,7 +107,12 @@ public:
/// If available, fetch the address of the hot part linked to the cold part
/// at \p Address. Return 0 otherwise.
- uint64_t fetchParentAddress(uint64_t Address) const;
+ uint64_t fetchParentAddress(uint64_t Address) const {
+ auto Iter = ColdPartSource.find(Address);
+ if (Iter == ColdPartSource.end())
+ return 0;
+ return Iter->second;
+ }
/// True if the input binary has a translation table we can use to convert
/// addresses when aggregating profile
@@ -132,7 +137,8 @@ private:
/// emitted for the start of the BB. More entries may be emitted to cover
/// the location of calls or any instruction that may change control flow.
void writeEntriesForBB(MapTy &Map, const BinaryBasicBlock &BB,
- uint64_t FuncInputAddress, uint64_t FuncOutputAddress);
+ uint64_t FuncInputAddress,
+ uint64_t FuncOutputAddress) const;
/// Write the serialized address translation table for a function.
template <bool Cold>
@@ -147,7 +153,7 @@ private:
/// Returns the bitmask with set bits corresponding to indices of BRANCHENTRY
/// entries in function address translation map.
- APInt calculateBranchEntriesBitMask(MapTy &Map, size_t EqualElems);
+ APInt calculateBranchEntriesBitMask(MapTy &Map, size_t EqualElems) const;
/// Calculate the number of equal offsets (output = input - skew) in the
/// beginning of the function.
@@ -178,14 +184,9 @@ private:
public:
/// Map basic block input offset to a basic block index and hash pair.
class BBHashMapTy {
- class EntryTy {
+ struct EntryTy {
unsigned Index;
size_t Hash;
-
- public:
- unsigned getBBIndex() const { return Index; }
- size_t getBBHash() const { return Hash; }
- EntryTy(unsigned Index, size_t Hash) : Index(Index), Hash(Hash) {}
};
std::map<uint32_t, EntryTy> Map;
@@ -201,15 +202,15 @@ public:
}
unsigned getBBIndex(uint32_t BBInputOffset) const {
- return getEntry(BBInputOffset).getBBIndex();
+ return getEntry(BBInputOffset).Index;
}
size_t getBBHash(uint32_t BBInputOffset) const {
- return getEntry(BBInputOffset).getBBHash();
+ return getEntry(BBInputOffset).Hash;
}
void addEntry(uint32_t BBInputOffset, unsigned BBIndex, size_t BBHash) {
- Map.emplace(BBInputOffset, EntryTy(BBIndex, BBHash));
+ Map.emplace(BBInputOffset, EntryTy{BBIndex, BBHash});
}
size_t getNumBasicBlocks() const { return Map.size(); }
@@ -217,18 +218,14 @@ public:
auto begin() const { return Map.begin(); }
auto end() const { return Map.end(); }
auto upper_bound(uint32_t Offset) const { return Map.upper_bound(Offset); }
+ auto size() const { return Map.size(); }
};
/// Map function output address to its hash and basic blocks hash map.
class FuncHashesTy {
- class EntryTy {
+ struct EntryTy {
size_t Hash;
BBHashMapTy BBHashMap;
-
- public:
- size_t getBFHash() const { return Hash; }
- const BBHashMapTy &getBBHashMap() const { return BBHashMap; }
- EntryTy(size_t Hash) : Hash(Hash) {}
};
std::unordered_map<uint64_t, EntryTy> Map;
@@ -240,15 +237,15 @@ public:
public:
size_t getBFHash(uint64_t FuncOutputAddress) const {
- return getEntry(FuncOutputAddress).getBFHash();
+ return getEntry(FuncOutputAddress).Hash;
}
const BBHashMapTy &getBBHashMap(uint64_t FuncOutputAddress) const {
- return getEntry(FuncOutputAddress).getBBHashMap();
+ return getEntry(FuncOutputAddress).BBHashMap;
}
void addEntry(uint64_t FuncOutputAddress, size_t BFHash) {
- Map.emplace(FuncOutputAddress, EntryTy(BFHash));
+ Map.emplace(FuncOutputAddress, EntryTy{BFHash, BBHashMapTy()});
}
size_t getNumFunctions() const { return Map.size(); };
@@ -256,7 +253,7 @@ public:
size_t getNumBasicBlocks() const {
size_t NumBasicBlocks{0};
for (auto &I : Map)
- NumBasicBlocks += I.second.getBBHashMap().getNumBasicBlocks();
+ NumBasicBlocks += I.second.BBHashMap.getNumBasicBlocks();
return NumBasicBlocks;
}
};
@@ -278,7 +275,9 @@ public:
/// Returns the number of basic blocks in a function.
size_t getNumBasicBlocks(uint64_t OutputAddress) const {
- return NumBasicBlocksMap.at(OutputAddress);
+ auto It = NumBasicBlocksMap.find(OutputAddress);
+ assert(It != NumBasicBlocksMap.end());
+ return It->second;
}
private:
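
Editor's note: MapTy above changes from std::map to std::multimap, so one key can now carry several translation records instead of silently dropping later insertions. A sketch (not BOLT code) of an equal_range lookup over such a table; the assumption that the key is an output offset and the value an input offset follows the surrounding translation-table comments.

    #include <cstdint>
    #include <map>
    #include <vector>

    using MapTy = std::multimap<uint32_t, uint32_t>;

    std::vector<uint32_t> inputOffsetsFor(const MapTy &Map, uint32_t OutputOffset) {
      std::vector<uint32_t> Inputs;
      for (auto [It, End] = Map.equal_range(OutputOffset); It != End; ++It)
        Inputs.push_back(It->second);  // every record for this key is kept, unlike std::map
      return Inputs;
    }
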
diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h
index c158a9b..6453b30 100644
--- a/bolt/include/bolt/Profile/DataAggregator.h
+++ b/bolt/include/bolt/Profile/DataAggregator.h
@@ -15,6 +15,7 @@
#define BOLT_PROFILE_DATA_AGGREGATOR_H
#include "bolt/Profile/DataReader.h"
+#include "bolt/Profile/YAMLProfileWriter.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/Program.h"
@@ -248,7 +249,7 @@ private:
BinaryFunction *getBATParentFunction(const BinaryFunction &Func) const;
/// Retrieve the location name to be used for samples recorded in \p Func.
- StringRef getLocationName(const BinaryFunction &Func) const;
+ static StringRef getLocationName(const BinaryFunction &Func, bool BAT);
/// Semantic actions - parser hooks to interpret parsed perf samples
/// Register a sample (non-LBR mode), i.e. a new hit at \p Address
@@ -490,6 +491,8 @@ public:
/// Parse the output generated by "perf buildid-list" to extract build-ids
/// and return a file name matching a given \p FileBuildID.
std::optional<StringRef> getFileNameForBuildID(StringRef FileBuildID);
+
+ friend class YAMLProfileWriter;
};
} // namespace bolt
} // namespace llvm
diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp
index ad2eb18..db02dc0 100644
--- a/bolt/lib/Core/BinaryContext.cpp
+++ b/bolt/lib/Core/BinaryContext.cpp
@@ -142,7 +142,7 @@ BinaryContext::BinaryContext(std::unique_ptr<MCContext> Ctx,
AsmInfo(std::move(AsmInfo)), MII(std::move(MII)), STI(std::move(STI)),
InstPrinter(std::move(InstPrinter)), MIA(std::move(MIA)),
MIB(std::move(MIB)), MRI(std::move(MRI)), DisAsm(std::move(DisAsm)),
- Logger(Logger) {
+ Logger(Logger), InitialDynoStats(isAArch64()) {
Relocation::Arch = this->TheTriple->getArch();
RegularPageSize = isAArch64() ? RegularPageSizeAArch64 : RegularPageSizeX86;
PageAlign = opts::NoHugePages ? RegularPageSize : HugePageSize;
@@ -934,10 +934,13 @@ std::string BinaryContext::generateJumpTableName(const BinaryFunction &BF,
uint64_t Offset = 0;
if (const JumpTable *JT = BF.getJumpTableContainingAddress(Address)) {
Offset = Address - JT->getAddress();
- auto Itr = JT->Labels.find(Offset);
- if (Itr != JT->Labels.end())
- return std::string(Itr->second->getName());
- Id = JumpTableIds.at(JT->getAddress());
+ auto JTLabelsIt = JT->Labels.find(Offset);
+ if (JTLabelsIt != JT->Labels.end())
+ return std::string(JTLabelsIt->second->getName());
+
+ auto JTIdsIt = JumpTableIds.find(JT->getAddress());
+ assert(JTIdsIt != JumpTableIds.end());
+ Id = JTIdsIt->second;
} else {
Id = JumpTableIds[Address] = BF.JumpTables.size();
}
@@ -1322,7 +1325,9 @@ void BinaryContext::processInterproceduralReferences() {
InterproceduralReferences) {
BinaryFunction &Function = *It.first;
uint64_t Address = It.second;
- if (!Address || Function.isIgnored())
+ // Process interprocedural references from ignored functions in BAT mode
+ // (non-simple in non-relocation mode) to properly register entry points.
+ if (!Address || (Function.isIgnored() && !HasBATSection))
continue;
BinaryFunction *TargetFunction =
@@ -2212,8 +2217,8 @@ ErrorOr<uint64_t> BinaryContext::getUnsignedValueAtAddress(uint64_t Address,
return DE.getUnsigned(&ValueOffset, Size);
}
-ErrorOr<uint64_t> BinaryContext::getSignedValueAtAddress(uint64_t Address,
- size_t Size) const {
+ErrorOr<int64_t> BinaryContext::getSignedValueAtAddress(uint64_t Address,
+ size_t Size) const {
const ErrorOr<const BinarySection &> Section = getSectionForAddress(Address);
if (!Section)
return std::make_error_code(std::errc::bad_address);
diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp
index 6f86ddc..0b44acb 100644
--- a/bolt/lib/Core/BinaryEmitter.cpp
+++ b/bolt/lib/Core/BinaryEmitter.cpp
@@ -813,7 +813,9 @@ void BinaryEmitter::emitJumpTable(const JumpTable &JT, MCSection *HotSection,
// determining its destination.
std::map<MCSymbol *, uint64_t> LabelCounts;
if (opts::JumpTables > JTS_SPLIT && !JT.Counts.empty()) {
- MCSymbol *CurrentLabel = JT.Labels.at(0);
+ auto It = JT.Labels.find(0);
+ assert(It != JT.Labels.end());
+ MCSymbol *CurrentLabel = It->second;
uint64_t CurrentLabelCount = 0;
for (unsigned Index = 0; Index < JT.Entries.size(); ++Index) {
auto LI = JT.Labels.find(Index * JT.EntrySize);
diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp
index 10b93e7..c897392 100644
--- a/bolt/lib/Core/BinaryFunction.cpp
+++ b/bolt/lib/Core/BinaryFunction.cpp
@@ -851,15 +851,19 @@ BinaryFunction::processIndirectBranch(MCInst &Instruction, unsigned Size,
return IndirectBranchType::UNKNOWN;
}
- // RIP-relative addressing should be converted to symbol form by now
- // in processed instructions (but not in jump).
- if (DispExpr) {
+ auto getExprValue = [&](const MCExpr *Expr) {
const MCSymbol *TargetSym;
uint64_t TargetOffset;
- std::tie(TargetSym, TargetOffset) = BC.MIB->getTargetSymbolInfo(DispExpr);
+ std::tie(TargetSym, TargetOffset) = BC.MIB->getTargetSymbolInfo(Expr);
ErrorOr<uint64_t> SymValueOrError = BC.getSymbolValue(*TargetSym);
- assert(SymValueOrError && "global symbol needs a value");
- ArrayStart = *SymValueOrError + TargetOffset;
+ assert(SymValueOrError && "Global symbol needs a value");
+ return *SymValueOrError + TargetOffset;
+ };
+
+ // RIP-relative addressing should be converted to symbol form by now
+ // in processed instructions (but not in jump).
+ if (DispExpr) {
+ ArrayStart = getExprValue(DispExpr);
BaseRegNum = BC.MIB->getNoRegister();
if (BC.isAArch64()) {
ArrayStart &= ~0xFFFULL;
@@ -1666,7 +1670,8 @@ void BinaryFunction::postProcessEntryPoints() {
// In non-relocation mode there's potentially an external undetectable
// reference to the entry point and hence we cannot move this entry
// point. Optimizing without moving could be difficult.
- if (!BC.HasRelocations)
+ // In BAT mode, register any known entry points for CFG construction.
+ if (!BC.HasRelocations && !BC.HasBATSection)
setSimple(false);
const uint32_t Offset = KV.first;
@@ -3697,6 +3702,13 @@ BinaryFunction::BasicBlockListType BinaryFunction::dfs() const {
size_t BinaryFunction::computeHash(bool UseDFS, HashFunction HashFunction,
OperandHashFuncTy OperandHashFunc) const {
+ LLVM_DEBUG({
+ dbgs() << "BOLT-DEBUG: computeHash " << getPrintName() << ' '
+ << (UseDFS ? "dfs" : "bin") << " order "
+ << (HashFunction == HashFunction::StdHash ? "std::hash" : "xxh3")
+ << '\n';
+ });
+
if (size() == 0)
return 0;
diff --git a/bolt/lib/Core/DebugNames.cpp b/bolt/lib/Core/DebugNames.cpp
index 049244c..791cbc6 100644
--- a/bolt/lib/Core/DebugNames.cpp
+++ b/bolt/lib/Core/DebugNames.cpp
@@ -112,8 +112,6 @@ void DWARF5AcceleratorTable::addUnit(DWARFUnit &Unit,
// Returns true if DW_TAG_variable should be included in .debug-names based on
// section 6.1.1.1 for DWARF5 spec.
static bool shouldIncludeVariable(const DWARFUnit &Unit, const DIE &Die) {
- if (Die.findAttribute(dwarf::Attribute::DW_AT_declaration))
- return false;
const DIEValue LocAttrInfo =
Die.findAttribute(dwarf::Attribute::DW_AT_location);
if (!LocAttrInfo)
@@ -148,6 +146,8 @@ static bool shouldIncludeVariable(const DWARFUnit &Unit, const DIE &Die) {
bool static canProcess(const DWARFUnit &Unit, const DIE &Die,
std::string &NameToUse, const bool TagsOnly) {
+ if (Die.findAttribute(dwarf::Attribute::DW_AT_declaration))
+ return false;
switch (Die.getTag()) {
case dwarf::DW_TAG_base_type:
case dwarf::DW_TAG_class_type:
diff --git a/bolt/lib/Core/DynoStats.cpp b/bolt/lib/Core/DynoStats.cpp
index 5de0f9e..1d98187 100644
--- a/bolt/lib/Core/DynoStats.cpp
+++ b/bolt/lib/Core/DynoStats.cpp
@@ -114,8 +114,9 @@ void DynoStats::print(raw_ostream &OS, const DynoStats *Other,
for (auto &Stat : llvm::reverse(SortedHistogram)) {
OS << format("%20s,%'18lld", Printer->getOpcodeName(Stat.second).data(),
Stat.first * opts::DynoStatsScale);
-
- MaxOpcodeHistogramTy MaxMultiMap = OpcodeHistogram.at(Stat.second).second;
+ auto It = OpcodeHistogram.find(Stat.second);
+ assert(It != OpcodeHistogram.end());
+ MaxOpcodeHistogramTy MaxMultiMap = It->second.second;
// Start with function name:BB offset with highest execution count.
for (auto &Max : llvm::reverse(MaxMultiMap)) {
OS << format(", %'18lld, ", Max.first * opts::DynoStatsScale)
diff --git a/bolt/lib/Passes/BinaryFunctionCallGraph.cpp b/bolt/lib/Passes/BinaryFunctionCallGraph.cpp
index 2373710..bbcc975 100644
--- a/bolt/lib/Passes/BinaryFunctionCallGraph.cpp
+++ b/bolt/lib/Passes/BinaryFunctionCallGraph.cpp
@@ -56,7 +56,9 @@ std::deque<BinaryFunction *> BinaryFunctionCallGraph::buildTraversalOrder() {
std::stack<NodeId> Worklist;
for (BinaryFunction *Func : Funcs) {
- const NodeId Id = FuncToNodeId.at(Func);
+ auto It = FuncToNodeId.find(Func);
+ assert(It != FuncToNodeId.end());
+ const NodeId Id = It->second;
Worklist.push(Id);
NodeStatus[Id] = NEW;
}
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index 298ba29..2810f72 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1390,9 +1390,19 @@ Error PrintProgramStats::runOnFunctions(BinaryContext &BC) {
if (Function.isPLTFunction())
continue;
+ // Adjustment for BAT mode: the profile for BOLT split fragments is combined,
+ // so only count the hot fragment.
+ const uint64_t Address = Function.getAddress();
+ bool IsHotParentOfBOLTSplitFunction = !Function.getFragments().empty() &&
+ BAT && BAT->isBATFunction(Address) &&
+ !BAT->fetchParentAddress(Address);
+
++NumRegularFunctions;
- if (!Function.isSimple()) {
+ // In BOLTed binaries, split functions are non-simple (due to non-relocation
+ // mode), but the original function is known to be simple and we have a
+ // valid profile for it.
+ if (!Function.isSimple() && !IsHotParentOfBOLTSplitFunction) {
if (Function.hasProfile())
++NumNonSimpleProfiledFunctions;
continue;
@@ -1553,23 +1563,28 @@ Error PrintProgramStats::runOnFunctions(BinaryContext &BC) {
const bool Ascending =
opts::DynoStatsSortOrderOpt == opts::DynoStatsSortOrder::Ascending;
- if (SortAll) {
- llvm::stable_sort(Functions,
- [Ascending, &Stats](const BinaryFunction *A,
- const BinaryFunction *B) {
- return Ascending ? Stats.at(A) < Stats.at(B)
- : Stats.at(B) < Stats.at(A);
- });
- } else {
- llvm::stable_sort(
- Functions, [Ascending, &Stats](const BinaryFunction *A,
- const BinaryFunction *B) {
- const DynoStats &StatsA = Stats.at(A);
- const DynoStats &StatsB = Stats.at(B);
- return Ascending ? StatsA.lessThan(StatsB, opts::PrintSortedBy)
- : StatsB.lessThan(StatsA, opts::PrintSortedBy);
- });
- }
+ std::function<bool(const DynoStats &, const DynoStats &)>
+ DynoStatsComparator =
+ SortAll ? [](const DynoStats &StatsA,
+ const DynoStats &StatsB) { return StatsA < StatsB; }
+ : [](const DynoStats &StatsA, const DynoStats &StatsB) {
+ return StatsA.lessThan(StatsB, opts::PrintSortedBy);
+ };
+
+ llvm::stable_sort(Functions,
+ [Ascending, &Stats, DynoStatsComparator](
+ const BinaryFunction *A, const BinaryFunction *B) {
+ auto StatsItr = Stats.find(A);
+ assert(StatsItr != Stats.end());
+ const DynoStats &StatsA = StatsItr->second;
+
+ StatsItr = Stats.find(B);
+ assert(StatsItr != Stats.end());
+ const DynoStats &StatsB = StatsItr->second;
+
+ return Ascending ? DynoStatsComparator(StatsA, StatsB)
+ : DynoStatsComparator(StatsB, StatsA);
+ });
BC.outs() << "BOLT-INFO: top functions sorted by ";
if (SortAll) {
diff --git a/bolt/lib/Passes/CacheMetrics.cpp b/bolt/lib/Passes/CacheMetrics.cpp
index b02d430..21b420a 100644
--- a/bolt/lib/Passes/CacheMetrics.cpp
+++ b/bolt/lib/Passes/CacheMetrics.cpp
@@ -67,7 +67,20 @@ calcTSPScore(const std::vector<BinaryFunction *> &BinaryFunctions,
for (BinaryBasicBlock *DstBB : SrcBB->successors()) {
if (SrcBB != DstBB && BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE) {
JumpCount += BI->Count;
- if (BBAddr.at(SrcBB) + BBSize.at(SrcBB) == BBAddr.at(DstBB))
+
+ auto BBAddrIt = BBAddr.find(SrcBB);
+ assert(BBAddrIt != BBAddr.end());
+ uint64_t SrcBBAddr = BBAddrIt->second;
+
+ auto BBSizeIt = BBSize.find(SrcBB);
+ assert(BBSizeIt != BBSize.end());
+ uint64_t SrcBBSize = BBSizeIt->second;
+
+ BBAddrIt = BBAddr.find(DstBB);
+ assert(BBAddrIt != BBAddr.end());
+ uint64_t DstBBAddr = BBAddrIt->second;
+
+ if (SrcBBAddr + SrcBBSize == DstBBAddr)
Score += BI->Count;
}
++BI;
@@ -149,20 +162,28 @@ double expectedCacheHitRatio(
for (BinaryFunction *BF : BinaryFunctions) {
if (BF->getLayout().block_empty())
continue;
- const uint64_t Page =
- BBAddr.at(BF->getLayout().block_front()) / ITLBPageSize;
- PageSamples[Page] += FunctionSamples.at(BF);
+ auto BBAddrIt = BBAddr.find(BF->getLayout().block_front());
+ assert(BBAddrIt != BBAddr.end());
+ const uint64_t Page = BBAddrIt->second / ITLBPageSize;
+
+ auto FunctionSamplesIt = FunctionSamples.find(BF);
+ assert(FunctionSamplesIt != FunctionSamples.end());
+ PageSamples[Page] += FunctionSamplesIt->second;
}
// Computing the expected number of misses for every function
double Misses = 0;
for (BinaryFunction *BF : BinaryFunctions) {
// Skip the function if it has no samples
- if (BF->getLayout().block_empty() || FunctionSamples.at(BF) == 0.0)
+ auto FunctionSamplesIt = FunctionSamples.find(BF);
+ assert(FunctionSamplesIt != FunctionSamples.end());
+ double Samples = FunctionSamplesIt->second;
+ if (BF->getLayout().block_empty() || Samples == 0.0)
continue;
- double Samples = FunctionSamples.at(BF);
- const uint64_t Page =
- BBAddr.at(BF->getLayout().block_front()) / ITLBPageSize;
+
+ auto BBAddrIt = BBAddr.find(BF->getLayout().block_front());
+ assert(BBAddrIt != BBAddr.end());
+ const uint64_t Page = BBAddrIt->second / ITLBPageSize;
// The probability that the page is not present in the cache
const double MissProb =
pow(1.0 - PageSamples[Page] / TotalSamples, ITLBEntries);
@@ -170,8 +191,10 @@ double expectedCacheHitRatio(
// Processing all callers of the function
for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF]) {
BinaryFunction *SrcFunction = Pair.first;
- const uint64_t SrcPage =
- BBAddr.at(SrcFunction->getLayout().block_front()) / ITLBPageSize;
+
+ BBAddrIt = BBAddr.find(SrcFunction->getLayout().block_front());
+ assert(BBAddrIt != BBAddr.end());
+ const uint64_t SrcPage = BBAddrIt->second / ITLBPageSize;
// Is this a 'long' or a 'short' call?
if (Page != SrcPage) {
// This is a miss
diff --git a/bolt/lib/Passes/Inliner.cpp b/bolt/lib/Passes/Inliner.cpp
index 84e7d97..f004a8ee 100644
--- a/bolt/lib/Passes/Inliner.cpp
+++ b/bolt/lib/Passes/Inliner.cpp
@@ -355,7 +355,9 @@ Inliner::inlineCall(BinaryBasicBlock &CallerBB,
std::vector<BinaryBasicBlock *> Successors(BB.succ_size());
llvm::transform(BB.successors(), Successors.begin(),
[&InlinedBBMap](const BinaryBasicBlock *BB) {
- return InlinedBBMap.at(BB);
+ auto It = InlinedBBMap.find(BB);
+ assert(It != InlinedBBMap.end());
+ return It->second;
});
if (CallerFunction.hasValidProfile() && Callee.hasValidProfile())
diff --git a/bolt/lib/Passes/MCF.cpp b/bolt/lib/Passes/MCF.cpp
index c3898d2..77dea73 100644
--- a/bolt/lib/Passes/MCF.cpp
+++ b/bolt/lib/Passes/MCF.cpp
@@ -12,9 +12,11 @@
#include "bolt/Passes/MCF.h"
#include "bolt/Core/BinaryFunction.h"
+#include "bolt/Core/ParallelUtilities.h"
#include "bolt/Passes/DataflowInfoManager.h"
#include "bolt/Utils/CommandLineOpts.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/CommandLine.h"
#include <algorithm>
#include <vector>
@@ -29,19 +31,10 @@ namespace opts {
extern cl::OptionCategory BoltOptCategory;
-extern cl::opt<bool> TimeOpts;
-
static cl::opt<bool> IterativeGuess(
"iterative-guess",
cl::desc("in non-LBR mode, guess edge counts using iterative technique"),
cl::Hidden, cl::cat(BoltOptCategory));
-
-static cl::opt<bool> UseRArcs(
- "mcf-use-rarcs",
- cl::desc("in MCF, consider the possibility of cancelling flow to balance "
- "edges"),
- cl::Hidden, cl::cat(BoltOptCategory));
-
} // namespace opts
namespace llvm {
@@ -441,7 +434,7 @@ void equalizeBBCounts(DataflowInfoManager &Info, BinaryFunction &BF) {
}
}
-void estimateEdgeCounts(BinaryFunction &BF) {
+void EstimateEdgeCounts::runOnFunction(BinaryFunction &BF) {
EdgeWeightMap PredEdgeWeights;
EdgeWeightMap SuccEdgeWeights;
if (!opts::IterativeGuess) {
@@ -462,8 +455,24 @@ void estimateEdgeCounts(BinaryFunction &BF) {
recalculateBBCounts(BF, /*AllEdges=*/false);
}
-void solveMCF(BinaryFunction &BF, MCFCostFunction CostFunction) {
- llvm_unreachable("not implemented");
+Error EstimateEdgeCounts::runOnFunctions(BinaryContext &BC) {
+ if (llvm::none_of(llvm::make_second_range(BC.getBinaryFunctions()),
+ [](const BinaryFunction &BF) {
+ return BF.getProfileFlags() == BinaryFunction::PF_SAMPLE;
+ }))
+ return Error::success();
+
+ ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
+ runOnFunction(BF);
+ };
+ ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
+ return BF.getProfileFlags() != BinaryFunction::PF_SAMPLE;
+ };
+
+ ParallelUtilities::runOnEachFunction(
+ BC, ParallelUtilities::SchedulingPolicy::SP_BB_QUADRATIC, WorkFun,
+ SkipFunc, "EstimateEdgeCounts");
+ return Error::success();
}
} // namespace bolt
diff --git a/bolt/lib/Profile/BoltAddressTranslation.cpp b/bolt/lib/Profile/BoltAddressTranslation.cpp
index 7cfb9c1..cdfca2b 100644
--- a/bolt/lib/Profile/BoltAddressTranslation.cpp
+++ b/bolt/lib/Profile/BoltAddressTranslation.cpp
@@ -20,10 +20,9 @@ namespace bolt {
const char *BoltAddressTranslation::SECTION_NAME = ".note.bolt_bat";
-void BoltAddressTranslation::writeEntriesForBB(MapTy &Map,
- const BinaryBasicBlock &BB,
- uint64_t FuncInputAddress,
- uint64_t FuncOutputAddress) {
+void BoltAddressTranslation::writeEntriesForBB(
+ MapTy &Map, const BinaryBasicBlock &BB, uint64_t FuncInputAddress,
+ uint64_t FuncOutputAddress) const {
const uint64_t BBOutputOffset =
BB.getOutputAddressRange().first - FuncOutputAddress;
const uint32_t BBInputOffset = BB.getInputOffset();
@@ -55,7 +54,7 @@ void BoltAddressTranslation::writeEntriesForBB(MapTy &Map,
// and this deleted block will both share the same output address (the same
// key), and we need to map back. We choose here to privilege the successor by
// allowing it to overwrite the previously inserted key in the map.
- Map[BBOutputOffset] = BBInputOffset << 1;
+ Map.emplace(BBOutputOffset, BBInputOffset << 1);
const auto &IOAddressMap =
BB.getFunction()->getBinaryContext().getIOAddressMap();
@@ -72,8 +71,7 @@ void BoltAddressTranslation::writeEntriesForBB(MapTy &Map,
LLVM_DEBUG(dbgs() << " Key: " << Twine::utohexstr(OutputOffset) << " Val: "
<< Twine::utohexstr(InputOffset) << " (branch)\n");
- Map.insert(std::pair<uint32_t, uint32_t>(OutputOffset,
- (InputOffset << 1) | BRANCHENTRY));
+ Map.emplace(OutputOffset, (InputOffset << 1) | BRANCHENTRY);
}
}
@@ -108,6 +106,19 @@ void BoltAddressTranslation::write(const BinaryContext &BC, raw_ostream &OS) {
for (const BinaryBasicBlock *const BB :
Function.getLayout().getMainFragment())
writeEntriesForBB(Map, *BB, InputAddress, OutputAddress);
+ // Add entries for deleted blocks. They are still required for correct BB
+ // mapping of branches modified by SCTC. By convention, they would have the
+ // end of the function as output address.
+ const BBHashMapTy &BBHashMap = getBBHashMap(InputAddress);
+ if (BBHashMap.size() != Function.size()) {
+ const uint64_t EndOffset = Function.getOutputSize();
+ std::unordered_set<uint32_t> MappedInputOffsets;
+ for (const BinaryBasicBlock &BB : Function)
+ MappedInputOffsets.emplace(BB.getInputOffset());
+ for (const auto &[InputOffset, _] : BBHashMap)
+ if (!llvm::is_contained(MappedInputOffsets, InputOffset))
+ Map.emplace(EndOffset, InputOffset << 1);
+ }
Maps.emplace(Function.getOutputAddress(), std::move(Map));
ReverseMap.emplace(OutputAddress, InputAddress);
@@ -138,8 +149,8 @@ void BoltAddressTranslation::write(const BinaryContext &BC, raw_ostream &OS) {
<< " basic block hashes\n";
}
-APInt BoltAddressTranslation::calculateBranchEntriesBitMask(MapTy &Map,
- size_t EqualElems) {
+APInt BoltAddressTranslation::calculateBranchEntriesBitMask(
+ MapTy &Map, size_t EqualElems) const {
APInt BitMask(alignTo(EqualElems, 8), 0);
size_t Index = 0;
for (std::pair<const uint32_t, uint32_t> &KeyVal : Map) {
@@ -422,7 +433,7 @@ void BoltAddressTranslation::parseMaps(std::vector<uint64_t> &HotFuncs,
}
}
-void BoltAddressTranslation::dump(raw_ostream &OS) {
+void BoltAddressTranslation::dump(raw_ostream &OS) const {
const size_t NumTables = Maps.size();
OS << "BAT tables for " << NumTables << " functions:\n";
for (const auto &MapEntry : Maps) {
@@ -447,11 +458,15 @@ void BoltAddressTranslation::dump(raw_ostream &OS) {
OS << formatv(" hash: {0:x}", BBHashMap.getBBHash(Val));
OS << "\n";
}
- if (IsHotFunction)
- OS << "NumBlocks: " << NumBasicBlocksMap[Address] << '\n';
- if (SecondaryEntryPointsMap.count(Address)) {
+ if (IsHotFunction) {
+ auto NumBasicBlocksIt = NumBasicBlocksMap.find(Address);
+ assert(NumBasicBlocksIt != NumBasicBlocksMap.end());
+ OS << "NumBlocks: " << NumBasicBlocksIt->second << '\n';
+ }
+ auto SecondaryEntryPointsIt = SecondaryEntryPointsMap.find(Address);
+ if (SecondaryEntryPointsIt != SecondaryEntryPointsMap.end()) {
const std::vector<uint32_t> &SecondaryEntryPoints =
- SecondaryEntryPointsMap[Address];
+ SecondaryEntryPointsIt->second;
OS << SecondaryEntryPoints.size() << " secondary entry points:\n";
for (uint32_t EntryPointOffset : SecondaryEntryPoints)
OS << formatv("{0:x}\n", EntryPointOffset);
@@ -547,13 +562,6 @@ BoltAddressTranslation::getFallthroughsInTrace(uint64_t FuncAddress,
return Res;
}
-uint64_t BoltAddressTranslation::fetchParentAddress(uint64_t Address) const {
- auto Iter = ColdPartSource.find(Address);
- if (Iter == ColdPartSource.end())
- return 0;
- return Iter->second;
-}
-
bool BoltAddressTranslation::enabledFor(
llvm::object::ELFObjectFileBase *InputFile) const {
for (const SectionRef &Section : InputFile->sections()) {
diff --git a/bolt/lib/Profile/CMakeLists.txt b/bolt/lib/Profile/CMakeLists.txt
index 045ac47..ca8b9c3 100644
--- a/bolt/lib/Profile/CMakeLists.txt
+++ b/bolt/lib/Profile/CMakeLists.txt
@@ -17,6 +17,5 @@ add_llvm_library(LLVMBOLTProfile
target_link_libraries(LLVMBOLTProfile
PRIVATE
LLVMBOLTCore
- LLVMBOLTPasses
LLVMBOLTUtils
)
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index f55caa7..ce6ec0a 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -613,7 +613,6 @@ Error DataAggregator::readProfile(BinaryContext &BC) {
if (std::error_code EC = writeBATYAML(BC, opts::SaveProfile))
report_error("cannot create output data file", EC);
}
- BC.logBOLTErrorsAndQuitOnFatal(PrintProgramStats().runOnFunctions(BC));
}
return Error::success();
@@ -673,7 +672,8 @@ DataAggregator::getBATParentFunction(const BinaryFunction &Func) const {
return nullptr;
}
-StringRef DataAggregator::getLocationName(const BinaryFunction &Func) const {
+StringRef DataAggregator::getLocationName(const BinaryFunction &Func,
+ bool BAT) {
if (!BAT)
return Func.getOneName();
@@ -702,7 +702,7 @@ bool DataAggregator::doSample(BinaryFunction &OrigFunc, uint64_t Address,
auto I = NamesToSamples.find(Func.getOneName());
if (I == NamesToSamples.end()) {
bool Success;
- StringRef LocName = getLocationName(Func);
+ StringRef LocName = getLocationName(Func, BAT);
std::tie(I, Success) = NamesToSamples.insert(
std::make_pair(Func.getOneName(),
FuncSampleData(LocName, FuncSampleData::ContainerTy())));
@@ -722,7 +722,7 @@ bool DataAggregator::doIntraBranch(BinaryFunction &Func, uint64_t From,
FuncBranchData *AggrData = getBranchData(Func);
if (!AggrData) {
AggrData = &NamesToBranches[Func.getOneName()];
- AggrData->Name = getLocationName(Func);
+ AggrData->Name = getLocationName(Func, BAT);
setBranchData(Func, AggrData);
}
@@ -741,7 +741,7 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc,
StringRef SrcFunc;
StringRef DstFunc;
if (FromFunc) {
- SrcFunc = getLocationName(*FromFunc);
+ SrcFunc = getLocationName(*FromFunc, BAT);
FromAggrData = getBranchData(*FromFunc);
if (!FromAggrData) {
FromAggrData = &NamesToBranches[FromFunc->getOneName()];
@@ -752,7 +752,7 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc,
recordExit(*FromFunc, From, Mispreds, Count);
}
if (ToFunc) {
- DstFunc = getLocationName(*ToFunc);
+ DstFunc = getLocationName(*ToFunc, BAT);
ToAggrData = getBranchData(*ToFunc);
if (!ToAggrData) {
ToAggrData = &NamesToBranches[ToFunc->getOneName()];
@@ -2340,7 +2340,7 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC,
continue;
BinaryFunction *BF = BC.getBinaryFunctionAtAddress(FuncAddress);
assert(BF);
- YamlBF.Name = getLocationName(*BF);
+ YamlBF.Name = getLocationName(*BF, BAT);
YamlBF.Id = BF->getFunctionNumber();
YamlBF.Hash = BAT->getBFHash(FuncAddress);
YamlBF.ExecCount = BF->getKnownExecutionCount();
@@ -2349,11 +2349,11 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC,
BAT->getBBHashMap(FuncAddress);
YamlBF.Blocks.resize(YamlBF.NumBasicBlocks);
- for (auto &&[Idx, YamlBB] : llvm::enumerate(YamlBF.Blocks))
- YamlBB.Index = Idx;
-
- for (auto BI = BlockMap.begin(), BE = BlockMap.end(); BI != BE; ++BI)
- YamlBF.Blocks[BI->second.getBBIndex()].Hash = BI->second.getBBHash();
+ for (auto &&[Entry, YamlBB] : llvm::zip(BlockMap, YamlBF.Blocks)) {
+ const auto &Block = Entry.second;
+ YamlBB.Hash = Block.Hash;
+ YamlBB.Index = Block.Index;
+ }
// Lookup containing basic block offset and index
auto getBlock = [&BlockMap](uint32_t Offset) {
@@ -2363,7 +2363,7 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC,
exit(1);
}
--BlockIt;
- return std::pair(BlockIt->first, BlockIt->second.getBBIndex());
+ return std::pair(BlockIt->first, BlockIt->second.Index);
};
for (const BranchInfo &BI : Branches.Data) {
diff --git a/bolt/lib/Profile/DataReader.cpp b/bolt/lib/Profile/DataReader.cpp
index 06c5e96..f2e999b 100644
--- a/bolt/lib/Profile/DataReader.cpp
+++ b/bolt/lib/Profile/DataReader.cpp
@@ -598,8 +598,6 @@ void DataReader::readSampleData(BinaryFunction &BF) {
}
BF.ExecutionCount = TotalEntryCount;
-
- estimateEdgeCounts(BF);
}
void DataReader::convertBranchData(BinaryFunction &BF) const {
diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
index 016962f..365bc53 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -30,6 +30,7 @@
#include "llvm/ADT/Bitfields.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Timer.h"
#include "llvm/Support/xxhash.h"
#include "llvm/Transforms/Utils/SampleProfileInference.h"
@@ -42,6 +43,7 @@ using namespace llvm;
namespace opts {
+extern cl::opt<bool> TimeRewrite;
extern cl::OptionCategory BoltOptCategory;
cl::opt<bool>
@@ -372,8 +374,10 @@ createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) {
// Create necessary metadata for the flow function
for (FlowJump &Jump : Func.Jumps) {
- Func.Blocks.at(Jump.Source).SuccJumps.push_back(&Jump);
- Func.Blocks.at(Jump.Target).PredJumps.push_back(&Jump);
+ assert(Jump.Source < Func.Blocks.size());
+ Func.Blocks[Jump.Source].SuccJumps.push_back(&Jump);
+ assert(Jump.Target < Func.Blocks.size());
+ Func.Blocks[Jump.Target].PredJumps.push_back(&Jump);
}
return Func;
}
@@ -705,6 +709,10 @@ void assignProfile(BinaryFunction &BF,
bool YAMLProfileReader::inferStaleProfile(
BinaryFunction &BF, const yaml::bolt::BinaryFunctionProfile &YamlBF) {
+
+ NamedRegionTimer T("inferStaleProfile", "stale profile inference", "rewrite",
+ "Rewrite passes", opts::TimeRewrite);
+
if (!BF.hasCFG())
return false;
diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp b/bolt/lib/Profile/YAMLProfileReader.cpp
index 29d9406..f25f5920 100644
--- a/bolt/lib/Profile/YAMLProfileReader.cpp
+++ b/bolt/lib/Profile/YAMLProfileReader.cpp
@@ -102,11 +102,14 @@ bool YAMLProfileReader::parseFunctionProfile(
if (BF.empty())
return true;
- if (!opts::IgnoreHash &&
- YamlBF.Hash != BF.computeHash(IsDFSOrder, HashFunction)) {
- if (opts::Verbosity >= 1)
- errs() << "BOLT-WARNING: function hash mismatch\n";
- ProfileMatched = false;
+ if (!opts::IgnoreHash) {
+ if (!BF.getHash())
+ BF.computeHash(IsDFSOrder, HashFunction);
+ if (YamlBF.Hash != BF.getHash()) {
+ if (opts::Verbosity >= 1)
+ errs() << "BOLT-WARNING: function hash mismatch\n";
+ ProfileMatched = false;
+ }
}
if (YamlBF.NumBasicBlocks != BF.size()) {
@@ -253,10 +256,8 @@ bool YAMLProfileReader::parseFunctionProfile(
if (BB.getExecutionCount() == BinaryBasicBlock::COUNT_NO_PROFILE)
BB.setExecutionCount(0);
- if (YamlBP.Header.Flags & BinaryFunction::PF_SAMPLE) {
+ if (YamlBP.Header.Flags & BinaryFunction::PF_SAMPLE)
BF.setExecutionCount(FunctionExecutionCount);
- estimateEdgeCounts(BF);
- }
ProfileMatched &= !MismatchedBlocks && !MismatchedCalls && !MismatchedEdges;
diff --git a/bolt/lib/Profile/YAMLProfileWriter.cpp b/bolt/lib/Profile/YAMLProfileWriter.cpp
index ef04ba0..cf6b61d 100644
--- a/bolt/lib/Profile/YAMLProfileWriter.cpp
+++ b/bolt/lib/Profile/YAMLProfileWriter.cpp
@@ -10,6 +10,7 @@
#include "bolt/Core/BinaryBasicBlock.h"
#include "bolt/Core/BinaryFunction.h"
#include "bolt/Profile/BoltAddressTranslation.h"
+#include "bolt/Profile/DataAggregator.h"
#include "bolt/Profile/ProfileReaderBase.h"
#include "bolt/Rewrite/RewriteInstance.h"
#include "llvm/Support/CommandLine.h"
@@ -39,6 +40,10 @@ const BinaryFunction *YAMLProfileWriter::setCSIDestination(
BC.getFunctionForSymbol(Symbol, &EntryID)) {
if (BAT && BAT->isBATFunction(Callee->getAddress()))
std::tie(Callee, EntryID) = BAT->translateSymbol(BC, *Symbol, Offset);
+ else if (const BinaryBasicBlock *BB =
+ Callee->getBasicBlockContainingOffset(Offset))
+ BC.getFunctionForSymbol(Callee->getSecondaryEntryPointSymbol(*BB),
+ &EntryID);
CSI.DestId = Callee->getFunctionNumber();
CSI.EntryDiscriminator = EntryID;
return Callee;
@@ -59,7 +64,7 @@ YAMLProfileWriter::convert(const BinaryFunction &BF, bool UseDFS,
BF.computeHash(UseDFS);
BF.computeBlockHashes();
- YamlBF.Name = BF.getPrintName();
+ YamlBF.Name = DataAggregator::getLocationName(BF, BAT);
YamlBF.Id = BF.getFunctionNumber();
YamlBF.Hash = BF.getHash();
YamlBF.NumBasicBlocks = BF.size();
diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp
index cbb7199..aaa0e1f 100644
--- a/bolt/lib/Rewrite/BinaryPassManager.cpp
+++ b/bolt/lib/Rewrite/BinaryPassManager.cpp
@@ -23,6 +23,7 @@
#include "bolt/Passes/JTFootprintReduction.h"
#include "bolt/Passes/LongJmp.h"
#include "bolt/Passes/LoopInversionPass.h"
+#include "bolt/Passes/MCF.h"
#include "bolt/Passes/PLTCall.h"
#include "bolt/Passes/PatchEntries.h"
#include "bolt/Passes/RegReAssign.h"
@@ -90,6 +91,11 @@ PrintAfterLowering("print-after-lowering",
cl::desc("print function after instruction lowering"),
cl::Hidden, cl::cat(BoltOptCategory));
+static cl::opt<bool> PrintEstimateEdgeCounts(
+ "print-estimate-edge-counts",
+ cl::desc("print function after edge counts are set for no-LBR profile"),
+ cl::Hidden, cl::cat(BoltOptCategory));
+
cl::opt<bool>
PrintFinalized("print-finalized",
cl::desc("print function after CFG is finalized"),
@@ -334,8 +340,10 @@ Error BinaryFunctionPassManager::runPasses() {
Error BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
BinaryFunctionPassManager Manager(BC);
- const DynoStats InitialDynoStats =
- getDynoStats(BC.getBinaryFunctions(), BC.isAArch64());
+ Manager.registerPass(
+ std::make_unique<EstimateEdgeCounts>(PrintEstimateEdgeCounts));
+
+ Manager.registerPass(std::make_unique<DynoStatsSetPass>());
Manager.registerPass(std::make_unique<AsmDumpPass>(),
opts::AsmDump.getNumOccurrences());
@@ -447,10 +455,9 @@ Error BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
Manager.registerPass(std::make_unique<SplitFunctions>(PrintSplit));
// Print final dyno stats right while CFG and instruction analysis are intact.
- Manager.registerPass(
- std::make_unique<DynoStatsPrintPass>(
- InitialDynoStats, "after all optimizations before SCTC and FOP"),
- opts::PrintDynoStats || opts::DynoStatsAll);
+ Manager.registerPass(std::make_unique<DynoStatsPrintPass>(
+ "after all optimizations before SCTC and FOP"),
+ opts::PrintDynoStats || opts::DynoStatsAll);
// Add the StokeInfo pass, which extract functions for stoke optimization and
// get the liveness information for them
diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp
index d582ce7..ab46503 100644
--- a/bolt/lib/Rewrite/DWARFRewriter.cpp
+++ b/bolt/lib/Rewrite/DWARFRewriter.cpp
@@ -73,8 +73,7 @@ static void printDie(DWARFUnit &DU, uint64_t DIEOffset) {
DWARFDataExtractor DebugInfoData = DU.getDebugInfoExtractor();
DWARFDebugInfoEntry DIEEntry;
if (DIEEntry.extractFast(DU, &DIEOffset, DebugInfoData, NextCUOffset, 0)) {
- if (const DWARFAbbreviationDeclaration *AbbrDecl =
- DIEEntry.getAbbreviationDeclarationPtr()) {
+ if (DIEEntry.getAbbreviationDeclarationPtr()) {
DWARFDie DDie(&DU, &DIEEntry);
printDie(DDie);
} else {
diff --git a/bolt/lib/Rewrite/LinuxKernelRewriter.cpp b/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
index 99775cc..b2c8b24 100644
--- a/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
+++ b/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
@@ -393,7 +393,7 @@ void LinuxKernelRewriter::processLKKSymtab(bool IsGPL) {
for (uint64_t I = 0; I < SectionSize; I += 4) {
const uint64_t EntryAddress = SectionAddress + I;
- ErrorOr<uint64_t> Offset = BC.getSignedValueAtAddress(EntryAddress, 4);
+ ErrorOr<int64_t> Offset = BC.getSignedValueAtAddress(EntryAddress, 4);
assert(Offset && "Reading valid PC-relative offset for a ksymtab entry");
const int32_t SignedOffset = *Offset;
const uint64_t RefAddress = EntryAddress + SignedOffset;
diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
index 6e1021a..4b4913d 100644
--- a/bolt/lib/Rewrite/RewriteInstance.cpp
+++ b/bolt/lib/Rewrite/RewriteInstance.cpp
@@ -17,6 +17,7 @@
#include "bolt/Core/MCPlusBuilder.h"
#include "bolt/Core/ParallelUtilities.h"
#include "bolt/Core/Relocation.h"
+#include "bolt/Passes/BinaryPasses.h"
#include "bolt/Passes/CacheMetrics.h"
#include "bolt/Passes/ReorderFunctions.h"
#include "bolt/Profile/BoltAddressTranslation.h"
@@ -86,6 +87,7 @@ extern cl::list<std::string> ReorderData;
extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions;
extern cl::opt<bool> TerminalTrap;
extern cl::opt<bool> TimeBuild;
+extern cl::opt<bool> TimeRewrite;
cl::opt<bool> AllowStripped("allow-stripped",
cl::desc("allow processing of stripped binaries"),
@@ -236,11 +238,6 @@ UseGnuStack("use-gnu-stack",
cl::cat(BoltCategory));
static cl::opt<bool>
- TimeRewrite("time-rewrite",
- cl::desc("print time spent in rewriting passes"), cl::Hidden,
- cl::cat(BoltCategory));
-
-static cl::opt<bool>
SequentialDisassembly("sequential-disassembly",
cl::desc("performs disassembly sequentially"),
cl::init(false),
@@ -1500,7 +1497,7 @@ void RewriteInstance::registerFragments() {
if (!BC->hasSymbolsWithFileName()) {
BC->errs() << "BOLT-ERROR: input file has split functions but does not "
"have FILE symbols. If the binary was stripped, preserve "
- "FILE symbols with --keep-file-symbols strip option";
+ "FILE symbols with --keep-file-symbols strip option\n";
exit(1);
}
@@ -1988,6 +1985,7 @@ Error RewriteInstance::readSpecialSections() {
if (ErrorOr<BinarySection &> BATSec =
BC->getUniqueSectionByName(BoltAddressTranslation::SECTION_NAME)) {
+ BC->HasBATSection = true;
// Do not read BAT when plotting a heatmap
if (!opts::HeatmapMode) {
if (std::error_code EC = BAT->parse(BC->outs(), BATSec->getContents())) {
@@ -3208,12 +3206,14 @@ void RewriteInstance::preprocessProfileData() {
if (Error E = ProfileReader->preprocessProfile(*BC.get()))
report_error("cannot pre-process profile", std::move(E));
- if (!BC->hasSymbolsWithFileName() && ProfileReader->hasLocalsWithFileName()) {
+ if (!BC->hasSymbolsWithFileName() && ProfileReader->hasLocalsWithFileName() &&
+ !opts::AllowStripped) {
BC->errs()
<< "BOLT-ERROR: input binary does not have local file symbols "
"but profile data includes function names with embedded file "
"names. It appears that the input binary was stripped while a "
- "profiled binary was not\n";
+ "profiled binary was not. If you know what you are doing and "
+ "wish to proceed, use -allow-stripped option.\n";
exit(1);
}
}
@@ -3284,8 +3284,11 @@ void RewriteInstance::processProfileData() {
// Release memory used by profile reader.
ProfileReader.reset();
- if (opts::AggregateOnly)
+ if (opts::AggregateOnly) {
+ PrintProgramStats PPS(&*BAT);
+ BC->logBOLTErrorsAndQuitOnFatal(PPS.runOnFunctions(*BC));
exit(0);
+ }
}
void RewriteInstance::disassembleFunctions() {
diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
index 8fdacff..a33a9dc 100644
--- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
+++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
@@ -1932,6 +1932,19 @@ public:
// = R_X86_64_PC32(Ln) + En - JT
// = R_X86_64_PC32(Ln + offsetof(En))
//
+ auto isRIPRel = [&](X86MemOperand &MO) {
+ // NB: DispExpr should be set
+ return MO.DispExpr != nullptr &&
+ MO.BaseRegNum == RegInfo->getProgramCounter() &&
+ MO.IndexRegNum == X86::NoRegister &&
+ MO.SegRegNum == X86::NoRegister;
+ };
+ auto isIndexed = [](X86MemOperand &MO, MCPhysReg R) {
+ // NB: IndexRegNum should be set.
+ return MO.IndexRegNum != X86::NoRegister && MO.BaseRegNum == R &&
+ MO.ScaleImm == 4 && MO.DispImm == 0 &&
+ MO.SegRegNum == X86::NoRegister;
+ };
LLVM_DEBUG(dbgs() << "Checking for PIC jump table\n");
MCInst *MemLocInstr = nullptr;
const MCInst *MovInstr = nullptr;
@@ -1965,9 +1978,8 @@ public:
std::optional<X86MemOperand> MO = evaluateX86MemoryOperand(Instr);
if (!MO)
break;
- if (MO->BaseRegNum != R1 || MO->ScaleImm != 4 ||
- MO->IndexRegNum == X86::NoRegister || MO->DispImm != 0 ||
- MO->SegRegNum != X86::NoRegister)
+ if (!isIndexed(*MO, R1))
+ // POSSIBLE_PIC_JUMP_TABLE
break;
MovInstr = &Instr;
} else {
@@ -1986,9 +1998,7 @@ public:
std::optional<X86MemOperand> MO = evaluateX86MemoryOperand(Instr);
if (!MO)
break;
- if (MO->BaseRegNum != RegInfo->getProgramCounter() ||
- MO->IndexRegNum != X86::NoRegister ||
- MO->SegRegNum != X86::NoRegister || MO->DispExpr == nullptr)
+ if (!isRIPRel(*MO))
break;
MemLocInstr = &Instr;
break;
@@ -2105,13 +2115,15 @@ public:
return IndirectBranchType::POSSIBLE_FIXED_BRANCH;
}
- if (Type == IndirectBranchType::POSSIBLE_PIC_JUMP_TABLE &&
- (MO->ScaleImm != 1 || MO->BaseRegNum != RIPRegister))
- return IndirectBranchType::UNKNOWN;
-
- if (Type != IndirectBranchType::POSSIBLE_PIC_JUMP_TABLE &&
- MO->ScaleImm != PtrSize)
- return IndirectBranchType::UNKNOWN;
+ switch (Type) {
+ case IndirectBranchType::POSSIBLE_PIC_JUMP_TABLE:
+ if (MO->ScaleImm != 1 || MO->BaseRegNum != RIPRegister)
+ return IndirectBranchType::UNKNOWN;
+ break;
+ default:
+ if (MO->ScaleImm != PtrSize)
+ return IndirectBranchType::UNKNOWN;
+ }
MemLocInstrOut = MemLocInstr;
diff --git a/bolt/lib/Utils/CommandLineOpts.cpp b/bolt/lib/Utils/CommandLineOpts.cpp
index ba296c1..41c89bc 100644
--- a/bolt/lib/Utils/CommandLineOpts.cpp
+++ b/bolt/lib/Utils/CommandLineOpts.cpp
@@ -179,6 +179,10 @@ cl::opt<bool> TimeOpts("time-opts",
cl::desc("print time spent in each optimization"),
cl::cat(BoltOptCategory));
+cl::opt<bool> TimeRewrite("time-rewrite",
+ cl::desc("print time spent in rewriting passes"),
+ cl::Hidden, cl::cat(BoltCategory));
+
cl::opt<bool> UseOldText(
"use-old-text",
cl::desc("re-use space in old .text if possible (relocation mode)"),
diff --git a/bolt/runtime/instr.cpp b/bolt/runtime/instr.cpp
index 16e0bbd..d1f8a21 100644
--- a/bolt/runtime/instr.cpp
+++ b/bolt/runtime/instr.cpp
@@ -1245,7 +1245,6 @@ void Graph::computeEdgeFrequencies(const uint64_t *Counters,
continue;
assert(SpanningTreeNodes[Cur].NumInEdges == 1, "must have 1 parent");
- const uint32_t Parent = SpanningTreeNodes[Cur].InEdges[0].Node;
const uint32_t ParentEdge = SpanningTreeNodes[Cur].InEdges[0].ID;
// Calculate parent edge freq.
@@ -1464,9 +1463,8 @@ void visitCallFlowEntry(CallFlowHashTable::MapEntry &Entry, int FD,
int openProfile() {
// Build the profile name string by appending our PID
char Buf[BufSize];
- char *Ptr = Buf;
uint64_t PID = __getpid();
- Ptr = strCopy(Buf, __bolt_instr_filename, BufSize);
+ char *Ptr = strCopy(Buf, __bolt_instr_filename, BufSize);
if (__bolt_instr_use_pid) {
Ptr = strCopy(Ptr, ".", BufSize - (Ptr - Buf + 1));
Ptr = intToStr(Ptr, PID, 10);
diff --git a/bolt/test/X86/bb-with-two-tail-calls.s b/bolt/test/X86/bb-with-two-tail-calls.s
index b6703e3..8bbecc4 100644
--- a/bolt/test/X86/bb-with-two-tail-calls.s
+++ b/bolt/test/X86/bb-with-two-tail-calls.s
@@ -8,11 +8,21 @@
# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -nostdlib
# RUN: llvm-bolt %t.exe -o %t.out --data %t.fdata --lite=0 --dyno-stats \
# RUN: --print-sctc --print-only=_start -enable-bat 2>&1 | FileCheck %s
+# RUN: llvm-objdump --syms %t.out > %t.log
+# RUN: llvm-bat-dump %t.out --dump-all >> %t.log
+# RUN: FileCheck %s --input-file %t.log --check-prefix=CHECK-BAT
+
# CHECK-NOT: Assertion `BranchInfo.size() == 2 && "could only be called for blocks with 2 successors"' failed.
# Two tail calls in the same basic block after SCTC:
# CHECK: {{.*}}: ja {{.*}} # TAILCALL # Offset: 7 # CTCTakenCount: 4
# CHECK-NEXT: {{.*}}: jmp {{.*}} # TAILCALL # Offset: 13
+# Confirm that a deleted basic block is emitted at function end offset (0xe)
+# CHECK-BAT: [[#%x,ADDR:]] g .text [[#%x,SIZE:]] _start
+# CHECK-BAT: Function Address: 0x[[#%x,ADDR]]
+# CHECK-BAT: 0x[[#%x,SIZE]]
+# CHECK-BAT: NumBlocks: 5
+
.globl _start
_start:
je x
diff --git a/bolt/test/X86/bolt-address-translation-yaml.test b/bolt/test/X86/bolt-address-translation-yaml.test
index e21513b..8f65eab 100644
--- a/bolt/test/X86/bolt-address-translation-yaml.test
+++ b/bolt/test/X86/bolt-address-translation-yaml.test
@@ -31,7 +31,8 @@ RUN: perf2bolt %t.out --pa -p %p/Inputs/blarge_new_bat.preagg.txt -w %t.yaml -o
RUN: 2>&1 | FileCheck --check-prefix READ-BAT-CHECK %s
RUN: FileCheck --input-file %t.yaml --check-prefix YAML-BAT-CHECK %s
# Check that YAML converted from fdata matches YAML created directly with BAT.
-RUN: llvm-bolt %t.exe -data %t.fdata -w %t.yaml-fdata -o /dev/null
+RUN: llvm-bolt %t.exe -data %t.fdata -w %t.yaml-fdata -o /dev/null \
+RUN: 2>&1 | FileCheck --check-prefix READ-BAT-FDATA-CHECK %s
RUN: FileCheck --input-file %t.yaml-fdata --check-prefix YAML-BAT-CHECK %s
# Test resulting YAML profile with the original binary (no-stale mode)
@@ -40,11 +41,13 @@ RUN: | FileCheck --check-prefix CHECK-BOLT-YAML %s
WRITE-BAT-CHECK: BOLT-INFO: Wrote 5 BAT maps
WRITE-BAT-CHECK: BOLT-INFO: Wrote 4 function and 22 basic block hashes
-WRITE-BAT-CHECK: BOLT-INFO: BAT section size (bytes): 384
+WRITE-BAT-CHECK: BOLT-INFO: BAT section size (bytes): 404
READ-BAT-CHECK-NOT: BOLT-ERROR: unable to save profile in YAML format for input file processed by BOLT
READ-BAT-CHECK: BOLT-INFO: Parsed 5 BAT entries
READ-BAT-CHECK: PERF2BOLT: read 79 aggregated LBR entries
+READ-BAT-CHECK: BOLT-INFO: 5 out of 21 functions in the binary (23.8%) have non-empty execution profile
+READ-BAT-FDATA-CHECK: BOLT-INFO: 5 out of 16 functions in the binary (31.2%) have non-empty execution profile
YAML-BAT-CHECK: functions:
# Function not covered by BAT - has insns in basic block
diff --git a/bolt/test/X86/bolt-address-translation.test b/bolt/test/X86/bolt-address-translation.test
index e6b21c1..dfdd1ee 100644
--- a/bolt/test/X86/bolt-address-translation.test
+++ b/bolt/test/X86/bolt-address-translation.test
@@ -37,7 +37,7 @@
# CHECK: BOLT: 3 out of 7 functions were overwritten.
# CHECK: BOLT-INFO: Wrote 6 BAT maps
# CHECK: BOLT-INFO: Wrote 3 function and 58 basic block hashes
-# CHECK: BOLT-INFO: BAT section size (bytes): 928
+# CHECK: BOLT-INFO: BAT section size (bytes): 940
#
# usqrt mappings (hot part). We match against any key (left side containing
# the bolted binary offsets) because BOLT may change where it puts instructions
diff --git a/bolt/test/X86/dwarf5-debug-names-class-type-decl.s b/bolt/test/X86/dwarf5-debug-names-class-type-decl.s
new file mode 100644
index 0000000..587eaaf
--- /dev/null
+++ b/bolt/test/X86/dwarf5-debug-names-class-type-decl.s
@@ -0,0 +1,670 @@
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %s -o %t1.o
+# RUN: %clang %cflags -dwarf-5 %t1.o -o %t.exe -Wl,-q
+# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections
+# RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt > %t.txt
+# RUN: llvm-dwarfdump --show-form --verbose --debug-names %t.bolt >> %t.txt
+# RUN: cat %t.txt | FileCheck --check-prefix=POSTCHECK %s
+
+## This tests that BOLT doesn't generate an entry for a DW_TAG_class_type declaration with DW_AT_name.
+
+# POSTCHECK: DW_TAG_type_unit
+# POSTCHECK: DW_TAG_class_type [7]
+# POSTCHECK-NEXT: DW_AT_name [DW_FORM_strx1] (indexed (00000006) string = "InnerState")
+# POSTCHECK-NEXT: DW_AT_declaration [DW_FORM_flag_present] (true)
+# POSTCHECK: Name Index
+# POSTCHECK-NOT: "InnerState"
+
+## -g2 -O0 -fdebug-types-section -gpubnames
+## namespace A {
+## namespace B {
+## class State {
+## public:
+## class InnerState{
+## InnerState() {}
+## };
+## State(){}
+## State(InnerState S){}
+## };
+## }
+## }
+##
+## int main() {
+## A::B::State S;
+## return 0;
+## }
+
+ .text
+ .file "main.cpp"
+ .file 0 "/DW_TAG_class_type" "main.cpp" md5 0x80f261b124b76c481b8761c040ab4802
+ .section .debug_info,"G",@progbits,16664150534606561860,comdat
+.Ltu_begin0:
+ .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+ .short 5 # DWARF version number
+ .byte 2 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .quad -1782593539102989756 # Type Signature
+ .long 39 # Type DIE Offset
+ .byte 1 # Abbrev [1] 0x18:0x3b DW_TAG_type_unit
+ .short 33 # DW_AT_language
+ .long .Lline_table_start0 # DW_AT_stmt_list
+ .long .Lstr_offsets_base0 # DW_AT_str_offsets_base
+ .byte 2 # Abbrev [2] 0x23:0x2a DW_TAG_namespace
+ .byte 3 # DW_AT_name
+ .byte 2 # Abbrev [2] 0x25:0x27 DW_TAG_namespace
+ .byte 4 # DW_AT_name
+ .byte 3 # Abbrev [3] 0x27:0x24 DW_TAG_class_type
+ .byte 5 # DW_AT_calling_convention
+ .byte 5 # DW_AT_name
+ .byte 1 # DW_AT_byte_size
+ .byte 0 # DW_AT_decl_file
+ .byte 3 # DW_AT_decl_line
+ .byte 4 # Abbrev [4] 0x2d:0xb DW_TAG_subprogram
+ .byte 5 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 8 # DW_AT_decl_line
+ # DW_AT_declaration
+ # DW_AT_external
+ .byte 1 # DW_AT_accessibility
+ # DW_ACCESS_public
+ .byte 5 # Abbrev [5] 0x32:0x5 DW_TAG_formal_parameter
+ .long 77 # DW_AT_type
+ # DW_AT_artificial
+ .byte 0 # End Of Children Mark
+ .byte 4 # Abbrev [4] 0x38:0x10 DW_TAG_subprogram
+ .byte 5 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 9 # DW_AT_decl_line
+ # DW_AT_declaration
+ # DW_AT_external
+ .byte 1 # DW_AT_accessibility
+ # DW_ACCESS_public
+ .byte 5 # Abbrev [5] 0x3d:0x5 DW_TAG_formal_parameter
+ .long 77 # DW_AT_type
+ # DW_AT_artificial
+ .byte 6 # Abbrev [6] 0x42:0x5 DW_TAG_formal_parameter
+ .long 72 # DW_AT_type
+ .byte 0 # End Of Children Mark
+ .byte 7 # Abbrev [7] 0x48:0x2 DW_TAG_class_type
+ .byte 6 # DW_AT_name
+ # DW_AT_declaration
+ .byte 0 # End Of Children Mark
+ .byte 0 # End Of Children Mark
+ .byte 0 # End Of Children Mark
+ .byte 8 # Abbrev [8] 0x4d:0x5 DW_TAG_pointer_type
+ .long 39 # DW_AT_type
+ .byte 0 # End Of Children Mark
+.Ldebug_info_end0:
+ .text
+ .globl main # -- Begin function main
+ .p2align 4, 0x90
+ .type main,@function
+main: # @main
+.Lfunc_begin0:
+ .loc 0 14 0 # main.cpp:14:0
+ .cfi_startproc
+# %bb.0: # %entry
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+ subq $16, %rsp
+ movl $0, -4(%rbp)
+.Ltmp0:
+ .loc 0 15 15 prologue_end # main.cpp:15:15
+ leaq -5(%rbp), %rdi
+ callq _ZN1A1B5StateC2Ev
+ .loc 0 16 3 # main.cpp:16:3
+ xorl %eax, %eax
+ .loc 0 16 3 epilogue_begin is_stmt 0 # main.cpp:16:3
+ addq $16, %rsp
+ popq %rbp
+ .cfi_def_cfa %rsp, 8
+ retq
+.Ltmp1:
+.Lfunc_end0:
+ .size main, .Lfunc_end0-main
+ .cfi_endproc
+ # -- End function
+ .section .text._ZN1A1B5StateC2Ev,"axG",@progbits,_ZN1A1B5StateC2Ev,comdat
+ .weak _ZN1A1B5StateC2Ev # -- Begin function _ZN1A1B5StateC2Ev
+ .p2align 4, 0x90
+ .type _ZN1A1B5StateC2Ev,@function
+_ZN1A1B5StateC2Ev: # @_ZN1A1B5StateC2Ev
+.Lfunc_begin1:
+ .loc 0 8 0 is_stmt 1 # main.cpp:8:0
+ .cfi_startproc
+# %bb.0: # %entry
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+ movq %rdi, -8(%rbp)
+.Ltmp2:
+ .loc 0 8 15 prologue_end epilogue_begin # main.cpp:8:15
+ popq %rbp
+ .cfi_def_cfa %rsp, 8
+ retq
+.Ltmp3:
+.Lfunc_end1:
+ .size _ZN1A1B5StateC2Ev, .Lfunc_end1-_ZN1A1B5StateC2Ev
+ .cfi_endproc
+ # -- End function
+ .section .debug_abbrev,"",@progbits
+ .byte 1 # Abbreviation Code
+ .byte 65 # DW_TAG_type_unit
+ .byte 1 # DW_CHILDREN_yes
+ .byte 19 # DW_AT_language
+ .byte 5 # DW_FORM_data2
+ .byte 16 # DW_AT_stmt_list
+ .byte 23 # DW_FORM_sec_offset
+ .byte 114 # DW_AT_str_offsets_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 2 # Abbreviation Code
+ .byte 57 # DW_TAG_namespace
+ .byte 1 # DW_CHILDREN_yes
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 3 # Abbreviation Code
+ .byte 2 # DW_TAG_class_type
+ .byte 1 # DW_CHILDREN_yes
+ .byte 54 # DW_AT_calling_convention
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 4 # Abbreviation Code
+ .byte 46 # DW_TAG_subprogram
+ .byte 1 # DW_CHILDREN_yes
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 60 # DW_AT_declaration
+ .byte 25 # DW_FORM_flag_present
+ .byte 63 # DW_AT_external
+ .byte 25 # DW_FORM_flag_present
+ .byte 50 # DW_AT_accessibility
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 5 # Abbreviation Code
+ .byte 5 # DW_TAG_formal_parameter
+ .byte 0 # DW_CHILDREN_no
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 52 # DW_AT_artificial
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 6 # Abbreviation Code
+ .byte 5 # DW_TAG_formal_parameter
+ .byte 0 # DW_CHILDREN_no
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 7 # Abbreviation Code
+ .byte 2 # DW_TAG_class_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 60 # DW_AT_declaration
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 8 # Abbreviation Code
+ .byte 15 # DW_TAG_pointer_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 9 # Abbreviation Code
+ .byte 17 # DW_TAG_compile_unit
+ .byte 1 # DW_CHILDREN_yes
+ .byte 37 # DW_AT_producer
+ .byte 37 # DW_FORM_strx1
+ .byte 19 # DW_AT_language
+ .byte 5 # DW_FORM_data2
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 114 # DW_AT_str_offsets_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 16 # DW_AT_stmt_list
+ .byte 23 # DW_FORM_sec_offset
+ .byte 27 # DW_AT_comp_dir
+ .byte 37 # DW_FORM_strx1
+ .byte 17 # DW_AT_low_pc
+ .byte 1 # DW_FORM_addr
+ .byte 85 # DW_AT_ranges
+ .byte 35 # DW_FORM_rnglistx
+ .byte 115 # DW_AT_addr_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 116 # DW_AT_rnglists_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 10 # Abbreviation Code
+ .byte 2 # DW_TAG_class_type
+ .byte 1 # DW_CHILDREN_yes
+ .byte 60 # DW_AT_declaration
+ .byte 25 # DW_FORM_flag_present
+ .byte 105 # DW_AT_signature
+ .byte 32 # DW_FORM_ref_sig8
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 11 # Abbreviation Code
+ .byte 46 # DW_TAG_subprogram
+ .byte 1 # DW_CHILDREN_yes
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 64 # DW_AT_frame_base
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 63 # DW_AT_external
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 12 # Abbreviation Code
+ .byte 52 # DW_TAG_variable
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 13 # Abbreviation Code
+ .byte 46 # DW_TAG_subprogram
+ .byte 1 # DW_CHILDREN_yes
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 64 # DW_AT_frame_base
+ .byte 24 # DW_FORM_exprloc
+ .byte 100 # DW_AT_object_pointer
+ .byte 19 # DW_FORM_ref4
+ .byte 110 # DW_AT_linkage_name
+ .byte 37 # DW_FORM_strx1
+ .byte 71 # DW_AT_specification
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 14 # Abbreviation Code
+ .byte 5 # DW_TAG_formal_parameter
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 52 # DW_AT_artificial
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 15 # Abbreviation Code
+ .byte 36 # DW_TAG_base_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 62 # DW_AT_encoding
+ .byte 11 # DW_FORM_data1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+ .section .debug_info,"",@progbits
+.Lcu_begin0:
+ .long .Ldebug_info_end1-.Ldebug_info_start1 # Length of Unit
+.Ldebug_info_start1:
+ .short 5 # DWARF version number
+ .byte 1 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .byte 9 # Abbrev [9] 0xc:0x7f DW_TAG_compile_unit
+ .byte 0 # DW_AT_producer
+ .short 33 # DW_AT_language
+ .byte 1 # DW_AT_name
+ .long .Lstr_offsets_base0 # DW_AT_str_offsets_base
+ .long .Lline_table_start0 # DW_AT_stmt_list
+ .byte 2 # DW_AT_comp_dir
+ .quad 0 # DW_AT_low_pc
+ .byte 0 # DW_AT_ranges
+ .long .Laddr_table_base0 # DW_AT_addr_base
+ .long .Lrnglists_table_base0 # DW_AT_rnglists_base
+ .byte 2 # Abbrev [2] 0x2b:0x1b DW_TAG_namespace
+ .byte 3 # DW_AT_name
+ .byte 2 # Abbrev [2] 0x2d:0x18 DW_TAG_namespace
+ .byte 4 # DW_AT_name
+ .byte 10 # Abbrev [10] 0x2f:0x15 DW_TAG_class_type
+ # DW_AT_declaration
+ .quad -1782593539102989756 # DW_AT_signature
+ .byte 4 # Abbrev [4] 0x38:0xb DW_TAG_subprogram
+ .byte 5 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 8 # DW_AT_decl_line
+ # DW_AT_declaration
+ # DW_AT_external
+ .byte 1 # DW_AT_accessibility
+ # DW_ACCESS_public
+ .byte 5 # Abbrev [5] 0x3d:0x5 DW_TAG_formal_parameter
+ .long 97 # DW_AT_type
+ # DW_AT_artificial
+ .byte 0 # End Of Children Mark
+ .byte 0 # End Of Children Mark
+ .byte 0 # End Of Children Mark
+ .byte 0 # End Of Children Mark
+ .byte 11 # Abbrev [11] 0x46:0x1b DW_TAG_subprogram
+ .byte 0 # DW_AT_low_pc
+ .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+ .byte 1 # DW_AT_frame_base
+ .byte 86
+ .byte 7 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 14 # DW_AT_decl_line
+ .long 129 # DW_AT_type
+ # DW_AT_external
+ .byte 12 # Abbrev [12] 0x55:0xb DW_TAG_variable
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 123
+ .byte 10 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 15 # DW_AT_decl_line
+ .long 47 # DW_AT_type
+ .byte 0 # End Of Children Mark
+ .byte 8 # Abbrev [8] 0x61:0x5 DW_TAG_pointer_type
+ .long 47 # DW_AT_type
+ .byte 13 # Abbrev [13] 0x66:0x1b DW_TAG_subprogram
+ .byte 1 # DW_AT_low_pc
+ .long .Lfunc_end1-.Lfunc_begin1 # DW_AT_high_pc
+ .byte 1 # DW_AT_frame_base
+ .byte 86
+ .long 119 # DW_AT_object_pointer
+ .byte 9 # DW_AT_linkage_name
+ .long 56 # DW_AT_specification
+ .byte 14 # Abbrev [14] 0x77:0x9 DW_TAG_formal_parameter
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 120
+ .byte 11 # DW_AT_name
+ .long 133 # DW_AT_type
+ # DW_AT_artificial
+ .byte 0 # End Of Children Mark
+ .byte 15 # Abbrev [15] 0x81:0x4 DW_TAG_base_type
+ .byte 8 # DW_AT_name
+ .byte 5 # DW_AT_encoding
+ .byte 4 # DW_AT_byte_size
+ .byte 8 # Abbrev [8] 0x85:0x5 DW_TAG_pointer_type
+ .long 47 # DW_AT_type
+ .byte 0 # End Of Children Mark
+.Ldebug_info_end1:
+ .section .debug_rnglists,"",@progbits
+ .long .Ldebug_list_header_end0-.Ldebug_list_header_start0 # Length
+.Ldebug_list_header_start0:
+ .short 5 # Version
+ .byte 8 # Address size
+ .byte 0 # Segment selector size
+ .long 1 # Offset entry count
+.Lrnglists_table_base0:
+ .long .Ldebug_ranges0-.Lrnglists_table_base0
+.Ldebug_ranges0:
+ .byte 3 # DW_RLE_startx_length
+ .byte 0 # start index
+ .uleb128 .Lfunc_end0-.Lfunc_begin0 # length
+ .byte 3 # DW_RLE_startx_length
+ .byte 1 # start index
+ .uleb128 .Lfunc_end1-.Lfunc_begin1 # length
+ .byte 0 # DW_RLE_end_of_list
+.Ldebug_list_header_end0:
+ .section .debug_str_offsets,"",@progbits
+ .long 52 # Length of String Offsets Set
+ .short 5
+ .short 0
+.Lstr_offsets_base0:
+ .section .debug_str,"MS",@progbits,1
+.Linfo_string0:
+ .asciz "clang version 19.0.0git" # string offset=0
+.Linfo_string1:
+ .asciz "main.cpp" # string offset=24
+.Linfo_string2:
+ .asciz "/home/ayermolo/local/tasks/T190087639/DW_TAG_class_type" # string offset=33
+.Linfo_string3:
+ .asciz "A" # string offset=89
+.Linfo_string4:
+ .asciz "B" # string offset=91
+.Linfo_string5:
+ .asciz "State" # string offset=93
+.Linfo_string6:
+ .asciz "InnerState" # string offset=99
+.Linfo_string7:
+ .asciz "main" # string offset=110
+.Linfo_string8:
+ .asciz "_ZN1A1B5StateC2Ev" # string offset=115
+.Linfo_string9:
+ .asciz "int" # string offset=133
+.Linfo_string10:
+ .asciz "S" # string offset=137
+.Linfo_string11:
+ .asciz "this" # string offset=139
+ .section .debug_str_offsets,"",@progbits
+ .long .Linfo_string0
+ .long .Linfo_string1
+ .long .Linfo_string2
+ .long .Linfo_string3
+ .long .Linfo_string4
+ .long .Linfo_string5
+ .long .Linfo_string6
+ .long .Linfo_string7
+ .long .Linfo_string9
+ .long .Linfo_string8
+ .long .Linfo_string10
+ .long .Linfo_string11
+ .section .debug_addr,"",@progbits
+ .long .Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution
+.Ldebug_addr_start0:
+ .short 5 # DWARF version number
+ .byte 8 # Address size
+ .byte 0 # Segment selector size
+.Laddr_table_base0:
+ .quad .Lfunc_begin0
+ .quad .Lfunc_begin1
+.Ldebug_addr_end0:
+ .section .debug_names,"",@progbits
+ .long .Lnames_end0-.Lnames_start0 # Header: unit length
+.Lnames_start0:
+ .short 5 # Header: version
+ .short 0 # Header: padding
+ .long 1 # Header: compilation unit count
+ .long 1 # Header: local type unit count
+ .long 0 # Header: foreign type unit count
+ .long 6 # Header: bucket count
+ .long 6 # Header: name count
+ .long .Lnames_abbrev_end0-.Lnames_abbrev_start0 # Header: abbreviation table size
+ .long 8 # Header: augmentation string size
+ .ascii "LLVM0700" # Header: augmentation string
+ .long .Lcu_begin0 # Compilation unit 0
+ .long .Ltu_begin0 # Type unit 0
+ .long 0 # Bucket 0
+ .long 0 # Bucket 1
+ .long 1 # Bucket 2
+ .long 2 # Bucket 3
+ .long 3 # Bucket 4
+ .long 6 # Bucket 5
+ .long 193495088 # Hash in Bucket 2
+ .long 1059643959 # Hash in Bucket 3
+ .long 177670 # Hash in Bucket 4
+ .long 274811398 # Hash in Bucket 4
+ .long 2090499946 # Hash in Bucket 4
+ .long 177671 # Hash in Bucket 5
+ .long .Linfo_string9 # String in Bucket 2: int
+ .long .Linfo_string8 # String in Bucket 3: _ZN1A1B5StateC2Ev
+ .long .Linfo_string3 # String in Bucket 4: A
+ .long .Linfo_string5 # String in Bucket 4: State
+ .long .Linfo_string7 # String in Bucket 4: main
+ .long .Linfo_string4 # String in Bucket 5: B
+ .long .Lnames5-.Lnames_entries0 # Offset in Bucket 2
+ .long .Lnames4-.Lnames_entries0 # Offset in Bucket 3
+ .long .Lnames0-.Lnames_entries0 # Offset in Bucket 4
+ .long .Lnames2-.Lnames_entries0 # Offset in Bucket 4
+ .long .Lnames3-.Lnames_entries0 # Offset in Bucket 4
+ .long .Lnames1-.Lnames_entries0 # Offset in Bucket 5
+.Lnames_abbrev_start0:
+ .byte 1 # Abbrev code
+ .byte 36 # DW_TAG_base_type
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 2 # Abbrev code
+ .byte 46 # DW_TAG_subprogram
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 3 # Abbrev code
+ .byte 57 # DW_TAG_namespace
+ .byte 2 # DW_IDX_type_unit
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 4 # Abbrev code
+ .byte 57 # DW_TAG_namespace
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 5 # Abbrev code
+ .byte 2 # DW_TAG_class_type
+ .byte 2 # DW_IDX_type_unit
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 6 # Abbrev code
+ .byte 57 # DW_TAG_namespace
+ .byte 2 # DW_IDX_type_unit
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 7 # Abbrev code
+ .byte 57 # DW_TAG_namespace
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev list
+.Lnames_abbrev_end0:
+.Lnames_entries0:
+.Lnames5:
+.L2:
+ .byte 1 # Abbreviation code
+ .long 129 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: int
+.Lnames4:
+.L3:
+ .byte 2 # Abbreviation code
+ .long 102 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: _ZN1A1B5StateC2Ev
+.Lnames0:
+.L4:
+ .byte 3 # Abbreviation code
+ .byte 0 # DW_IDX_type_unit
+ .long 35 # DW_IDX_die_offset
+.L7: # DW_IDX_parent
+ .byte 4 # Abbreviation code
+ .long 43 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: A
+.Lnames2:
+.L1:
+ .byte 5 # Abbreviation code
+ .byte 0 # DW_IDX_type_unit
+ .long 39 # DW_IDX_die_offset
+ .long .L5-.Lnames_entries0 # DW_IDX_parent
+ .byte 2 # Abbreviation code
+ .long 102 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: State
+.Lnames3:
+.L0:
+ .byte 2 # Abbreviation code
+ .long 70 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: main
+.Lnames1:
+.L5:
+ .byte 6 # Abbreviation code
+ .byte 0 # DW_IDX_type_unit
+ .long 37 # DW_IDX_die_offset
+ .long .L4-.Lnames_entries0 # DW_IDX_parent
+.L6:
+ .byte 7 # Abbreviation code
+ .long 45 # DW_IDX_die_offset
+ .long .L7-.Lnames_entries0 # DW_IDX_parent
+ .byte 0 # End of list: B
+ .p2align 2, 0x0
+.Lnames_end0:
+ .ident "clang version 19.0.0git"
+ .section ".note.GNU-stack","",@progbits
+ .addrsig
+ .section .debug_line,"",@progbits
+.Lline_table_start0:
diff --git a/bolt/test/X86/dwarf5-debug-names-enumeration-type-decl.s b/bolt/test/X86/dwarf5-debug-names-enumeration-type-decl.s
new file mode 100644
index 0000000..0311757
--- /dev/null
+++ b/bolt/test/X86/dwarf5-debug-names-enumeration-type-decl.s
@@ -0,0 +1,485 @@
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %s -o %t1.o
+# RUN: %clang %cflags -dwarf-5 %t1.o -o %t.exe -Wl,-q
+# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections
+# RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt > %t.txt
+# RUN: llvm-dwarfdump --show-form --verbose --debug-names %t.bolt >> %t.txt
+# RUN: cat %t.txt | FileCheck --check-prefix=POSTCHECK %s
+
+## This tests that BOLT doesn't generate a .debug_names entry for a DW_TAG_enumeration_type declaration with DW_AT_name.
+
+# POSTCHECK: DW_TAG_type_unit
+# POSTCHECK: DW_TAG_enumeration_type [6]
+# POSTCHECK-NEXT: DW_AT_name [DW_FORM_strx1] (indexed (00000009) string = "InnerState")
+# POSTCHECK-NEXT: DW_AT_byte_size [DW_FORM_data1] (0x04)
+# POSTCHECK-NEXT: DW_AT_declaration [DW_FORM_flag_present] (true)
+# POSTCHECK: Name Index
+# POSTCHECK-NOT: "InnerState"
+
+## -g2 -O0 -fdebug-types-section -gpubnames
+## namespace B {
+## template <typename Task>
+## class State {
+## public:
+## enum class InnerState { STATE0 };
+## InnerState St;
+## };
+## }
+##
+## int main() {
+## B::State<int> S;
+## return 0;
+## }
+
+ .text
+ .file "main.cpp"
+ .globl main # -- Begin function main
+ .p2align 4, 0x90
+ .type main,@function
+main: # @main
+.Lfunc_begin0:
+ .file 0 "/DW_TAG_enumeration_type" "main.cpp" md5 0x2e8962f8ef4bf6eb6f8bd92966c0848b
+ .loc 0 10 0 # main.cpp:10:0
+ .cfi_startproc
+# %bb.0: # %entry
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+ movl $0, -4(%rbp)
+.Ltmp0:
+ .loc 0 12 3 prologue_end # main.cpp:12:3
+ xorl %eax, %eax
+ .loc 0 12 3 epilogue_begin is_stmt 0 # main.cpp:12:3
+ popq %rbp
+ .cfi_def_cfa %rsp, 8
+ retq
+.Ltmp1:
+.Lfunc_end0:
+ .size main, .Lfunc_end0-main
+ .cfi_endproc
+ # -- End function
+ .section .debug_info,"G",@progbits,8822129917070965541,comdat
+.Ltu_begin0:
+ .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+ .short 5 # DWARF version number
+ .byte 2 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .quad 8822129917070965541 # Type Signature
+ .long 37 # Type DIE Offset
+ .byte 1 # Abbrev [1] 0x18:0x2d DW_TAG_type_unit
+ .short 33 # DW_AT_language
+ .long .Lline_table_start0 # DW_AT_stmt_list
+ .long .Lstr_offsets_base0 # DW_AT_str_offsets_base
+ .byte 2 # Abbrev [2] 0x23:0x1d DW_TAG_namespace
+ .byte 6 # DW_AT_name
+ .byte 3 # Abbrev [3] 0x25:0x1a DW_TAG_class_type
+ .byte 5 # DW_AT_calling_convention
+ .byte 10 # DW_AT_name
+ .byte 4 # DW_AT_byte_size
+ .byte 0 # DW_AT_decl_file
+ .byte 3 # DW_AT_decl_line
+ .byte 4 # Abbrev [4] 0x2b:0x6 DW_TAG_template_type_parameter
+ .long 64 # DW_AT_type
+ .byte 7 # DW_AT_name
+ .byte 5 # Abbrev [5] 0x31:0xa DW_TAG_member
+ .byte 8 # DW_AT_name
+ .long 59 # DW_AT_type
+ .byte 0 # DW_AT_decl_file
+ .byte 6 # DW_AT_decl_line
+ .byte 0 # DW_AT_data_member_location
+ .byte 1 # DW_AT_accessibility
+ # DW_ACCESS_public
+ .byte 6 # Abbrev [6] 0x3b:0x3 DW_TAG_enumeration_type
+ .byte 9 # DW_AT_name
+ .byte 4 # DW_AT_byte_size
+ # DW_AT_declaration
+ .byte 0 # End Of Children Mark
+ .byte 0 # End Of Children Mark
+ .byte 7 # Abbrev [7] 0x40:0x4 DW_TAG_base_type
+ .byte 4 # DW_AT_name
+ .byte 5 # DW_AT_encoding
+ .byte 4 # DW_AT_byte_size
+ .byte 0 # End Of Children Mark
+.Ldebug_info_end0:
+ .section .debug_abbrev,"",@progbits
+ .byte 1 # Abbreviation Code
+ .byte 65 # DW_TAG_type_unit
+ .byte 1 # DW_CHILDREN_yes
+ .byte 19 # DW_AT_language
+ .byte 5 # DW_FORM_data2
+ .byte 16 # DW_AT_stmt_list
+ .byte 23 # DW_FORM_sec_offset
+ .byte 114 # DW_AT_str_offsets_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 2 # Abbreviation Code
+ .byte 57 # DW_TAG_namespace
+ .byte 1 # DW_CHILDREN_yes
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 3 # Abbreviation Code
+ .byte 2 # DW_TAG_class_type
+ .byte 1 # DW_CHILDREN_yes
+ .byte 54 # DW_AT_calling_convention
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 4 # Abbreviation Code
+ .byte 47 # DW_TAG_template_type_parameter
+ .byte 0 # DW_CHILDREN_no
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 5 # Abbreviation Code
+ .byte 13 # DW_TAG_member
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 56 # DW_AT_data_member_location
+ .byte 11 # DW_FORM_data1
+ .byte 50 # DW_AT_accessibility
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 6 # Abbreviation Code
+ .byte 4 # DW_TAG_enumeration_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 60 # DW_AT_declaration
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 7 # Abbreviation Code
+ .byte 36 # DW_TAG_base_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 62 # DW_AT_encoding
+ .byte 11 # DW_FORM_data1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 8 # Abbreviation Code
+ .byte 17 # DW_TAG_compile_unit
+ .byte 1 # DW_CHILDREN_yes
+ .byte 37 # DW_AT_producer
+ .byte 37 # DW_FORM_strx1
+ .byte 19 # DW_AT_language
+ .byte 5 # DW_FORM_data2
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 114 # DW_AT_str_offsets_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 16 # DW_AT_stmt_list
+ .byte 23 # DW_FORM_sec_offset
+ .byte 27 # DW_AT_comp_dir
+ .byte 37 # DW_FORM_strx1
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 115 # DW_AT_addr_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 9 # Abbreviation Code
+ .byte 46 # DW_TAG_subprogram
+ .byte 1 # DW_CHILDREN_yes
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 64 # DW_AT_frame_base
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 63 # DW_AT_external
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 10 # Abbreviation Code
+ .byte 52 # DW_TAG_variable
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 11 # Abbreviation Code
+ .byte 2 # DW_TAG_class_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 60 # DW_AT_declaration
+ .byte 25 # DW_FORM_flag_present
+ .byte 105 # DW_AT_signature
+ .byte 32 # DW_FORM_ref_sig8
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+ .section .debug_info,"",@progbits
+.Lcu_begin0:
+ .long .Ldebug_info_end1-.Ldebug_info_start1 # Length of Unit
+.Ldebug_info_start1:
+ .short 5 # DWARF version number
+ .byte 1 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .byte 8 # Abbrev [8] 0xc:0x43 DW_TAG_compile_unit
+ .byte 0 # DW_AT_producer
+ .short 33 # DW_AT_language
+ .byte 1 # DW_AT_name
+ .long .Lstr_offsets_base0 # DW_AT_str_offsets_base
+ .long .Lline_table_start0 # DW_AT_stmt_list
+ .byte 2 # DW_AT_comp_dir
+ .byte 0 # DW_AT_low_pc
+ .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+ .long .Laddr_table_base0 # DW_AT_addr_base
+ .byte 9 # Abbrev [9] 0x23:0x1b DW_TAG_subprogram
+ .byte 0 # DW_AT_low_pc
+ .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+ .byte 1 # DW_AT_frame_base
+ .byte 86
+ .byte 3 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 10 # DW_AT_decl_line
+ .long 62 # DW_AT_type
+ # DW_AT_external
+ .byte 10 # Abbrev [10] 0x32:0xb DW_TAG_variable
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 120
+ .byte 5 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 11 # DW_AT_decl_line
+ .long 68 # DW_AT_type
+ .byte 0 # End Of Children Mark
+ .byte 7 # Abbrev [7] 0x3e:0x4 DW_TAG_base_type
+ .byte 4 # DW_AT_name
+ .byte 5 # DW_AT_encoding
+ .byte 4 # DW_AT_byte_size
+ .byte 2 # Abbrev [2] 0x42:0xc DW_TAG_namespace
+ .byte 6 # DW_AT_name
+ .byte 11 # Abbrev [11] 0x44:0x9 DW_TAG_class_type
+ # DW_AT_declaration
+ .quad 8822129917070965541 # DW_AT_signature
+ .byte 0 # End Of Children Mark
+ .byte 0 # End Of Children Mark
+.Ldebug_info_end1:
+ .section .debug_str_offsets,"",@progbits
+ .long 48 # Length of String Offsets Set
+ .short 5
+ .short 0
+.Lstr_offsets_base0:
+ .section .debug_str,"MS",@progbits,1
+.Linfo_string0:
+ .asciz "clang version 19.0.0git" # string offset=0
+.Linfo_string1:
+ .asciz "main.cpp" # string offset=24
+.Linfo_string2:
+ .asciz "/home/ayermolo/local/tasks/T190087639/DW_TAG_enumeration_type" # string offset=33
+.Linfo_string3:
+ .asciz "main" # string offset=95
+.Linfo_string4:
+ .asciz "int" # string offset=100
+.Linfo_string5:
+ .asciz "S" # string offset=104
+.Linfo_string6:
+ .asciz "B" # string offset=106
+.Linfo_string7:
+ .asciz "Task" # string offset=108
+.Linfo_string8:
+ .asciz "St" # string offset=113
+.Linfo_string9:
+ .asciz "InnerState" # string offset=116
+.Linfo_string10:
+ .asciz "State<int>" # string offset=127
+ .section .debug_str_offsets,"",@progbits
+ .long .Linfo_string0
+ .long .Linfo_string1
+ .long .Linfo_string2
+ .long .Linfo_string3
+ .long .Linfo_string4
+ .long .Linfo_string5
+ .long .Linfo_string6
+ .long .Linfo_string7
+ .long .Linfo_string8
+ .long .Linfo_string9
+ .long .Linfo_string10
+ .section .debug_addr,"",@progbits
+ .long .Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution
+.Ldebug_addr_start0:
+ .short 5 # DWARF version number
+ .byte 8 # Address size
+ .byte 0 # Segment selector size
+.Laddr_table_base0:
+ .quad .Lfunc_begin0
+.Ldebug_addr_end0:
+ .section .debug_names,"",@progbits
+ .long .Lnames_end0-.Lnames_start0 # Header: unit length
+.Lnames_start0:
+ .short 5 # Header: version
+ .short 0 # Header: padding
+ .long 1 # Header: compilation unit count
+ .long 1 # Header: local type unit count
+ .long 0 # Header: foreign type unit count
+ .long 4 # Header: bucket count
+ .long 4 # Header: name count
+ .long .Lnames_abbrev_end0-.Lnames_abbrev_start0 # Header: abbreviation table size
+ .long 8 # Header: augmentation string size
+ .ascii "LLVM0700" # Header: augmentation string
+ .long .Lcu_begin0 # Compilation unit 0
+ .long .Ltu_begin0 # Type unit 0
+ .long 1 # Bucket 0
+ .long 0 # Bucket 1
+ .long 2 # Bucket 2
+ .long 3 # Bucket 3
+ .long 193495088 # Hash in Bucket 0
+ .long 2090499946 # Hash in Bucket 2
+ .long 177671 # Hash in Bucket 3
+ .long 624407275 # Hash in Bucket 3
+ .long .Linfo_string4 # String in Bucket 0: int
+ .long .Linfo_string3 # String in Bucket 2: main
+ .long .Linfo_string6 # String in Bucket 3: B
+ .long .Linfo_string10 # String in Bucket 3: State<int>
+ .long .Lnames1-.Lnames_entries0 # Offset in Bucket 0
+ .long .Lnames0-.Lnames_entries0 # Offset in Bucket 2
+ .long .Lnames2-.Lnames_entries0 # Offset in Bucket 3
+ .long .Lnames3-.Lnames_entries0 # Offset in Bucket 3
+.Lnames_abbrev_start0:
+ .byte 1 # Abbrev code
+ .byte 36 # DW_TAG_base_type
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 2 # Abbrev code
+ .byte 36 # DW_TAG_base_type
+ .byte 2 # DW_IDX_type_unit
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 3 # Abbrev code
+ .byte 46 # DW_TAG_subprogram
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 4 # Abbrev code
+ .byte 57 # DW_TAG_namespace
+ .byte 2 # DW_IDX_type_unit
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 5 # Abbrev code
+ .byte 57 # DW_TAG_namespace
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 6 # Abbrev code
+ .byte 2 # DW_TAG_class_type
+ .byte 2 # DW_IDX_type_unit
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev list
+.Lnames_abbrev_end0:
+.Lnames_entries0:
+.Lnames1:
+.L0:
+ .byte 1 # Abbreviation code
+ .long 62 # DW_IDX_die_offset
+.L2: # DW_IDX_parent
+ .byte 2 # Abbreviation code
+ .byte 0 # DW_IDX_type_unit
+ .long 64 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: int
+.Lnames0:
+.L3:
+ .byte 3 # Abbreviation code
+ .long 35 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: main
+.Lnames2:
+ .byte 4 # Abbreviation code
+ .byte 0 # DW_IDX_type_unit
+ .long 35 # DW_IDX_die_offset
+.L1: # DW_IDX_parent
+ .byte 5 # Abbreviation code
+ .long 66 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: B
+.Lnames3:
+.L4:
+ .byte 6 # Abbreviation code
+ .byte 0 # DW_IDX_type_unit
+ .long 37 # DW_IDX_die_offset
+ .long .L3-.Lnames_entries0 # DW_IDX_parent
+ .byte 0 # End of list: State<int>
+ .p2align 2, 0x0
+.Lnames_end0:
+ .ident "clang version 19.0.0git"
+ .section ".note.GNU-stack","",@progbits
+ .addrsig
+ .section .debug_line,"",@progbits
+.Lline_table_start0:
diff --git a/bolt/test/X86/dwarf5-debug-names-structure-type-decl.s b/bolt/test/X86/dwarf5-debug-names-structure-type-decl.s
new file mode 100644
index 0000000..6eb2852
--- /dev/null
+++ b/bolt/test/X86/dwarf5-debug-names-structure-type-decl.s
@@ -0,0 +1,671 @@
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %s -o %t1.o
+# RUN: %clang %cflags -dwarf-5 %t1.o -o %t.exe -Wl,-q
+# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections
+# RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt > %t.txt
+# RUN: llvm-dwarfdump --show-form --verbose --debug-names %t.bolt >> %t.txt
+# RUN: cat %t.txt | FileCheck --check-prefix=POSTCHECK %s
+
+## This tests that BOLT doesn't generate a .debug_names entry for a DW_TAG_structure_type declaration with DW_AT_name.
+
+# POSTCHECK: DW_TAG_type_unit
+# POSTCHECK: DW_TAG_structure_type [7]
+# POSTCHECK-NEXT: DW_AT_name [DW_FORM_strx1] (indexed (00000006) string = "InnerState")
+# POSTCHECK-NEXT: DW_AT_declaration [DW_FORM_flag_present] (true)
+# POSTCHECK: Name Index
+# POSTCHECK-NOT: "InnerState"
+
+## -g2 -O0 -fdebug-types-section -gpubnames
+## namespace A {
+## namespace B {
+## class State {
+## public:
+## struct InnerState{
+## InnerState() {}
+## };
+## State(){}
+## State(InnerState S){}
+## };
+## }
+## }
+##
+## int main() {
+## A::B::State S;
+## return 0;
+## }
+
+
+ .text
+ .file "main.cpp"
+ .file 0 "/DW_TAG_structure_type" "main.cpp" md5 0xd43ba503b70d00353c195087e1fe16e2
+ .section .debug_info,"G",@progbits,16664150534606561860,comdat
+.Ltu_begin0:
+ .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+ .short 5 # DWARF version number
+ .byte 2 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .quad -1782593539102989756 # Type Signature
+ .long 39 # Type DIE Offset
+ .byte 1 # Abbrev [1] 0x18:0x3b DW_TAG_type_unit
+ .short 33 # DW_AT_language
+ .long .Lline_table_start0 # DW_AT_stmt_list
+ .long .Lstr_offsets_base0 # DW_AT_str_offsets_base
+ .byte 2 # Abbrev [2] 0x23:0x2a DW_TAG_namespace
+ .byte 3 # DW_AT_name
+ .byte 2 # Abbrev [2] 0x25:0x27 DW_TAG_namespace
+ .byte 4 # DW_AT_name
+ .byte 3 # Abbrev [3] 0x27:0x24 DW_TAG_class_type
+ .byte 5 # DW_AT_calling_convention
+ .byte 5 # DW_AT_name
+ .byte 1 # DW_AT_byte_size
+ .byte 0 # DW_AT_decl_file
+ .byte 3 # DW_AT_decl_line
+ .byte 4 # Abbrev [4] 0x2d:0xb DW_TAG_subprogram
+ .byte 5 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 8 # DW_AT_decl_line
+ # DW_AT_declaration
+ # DW_AT_external
+ .byte 1 # DW_AT_accessibility
+ # DW_ACCESS_public
+ .byte 5 # Abbrev [5] 0x32:0x5 DW_TAG_formal_parameter
+ .long 77 # DW_AT_type
+ # DW_AT_artificial
+ .byte 0 # End Of Children Mark
+ .byte 4 # Abbrev [4] 0x38:0x10 DW_TAG_subprogram
+ .byte 5 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 9 # DW_AT_decl_line
+ # DW_AT_declaration
+ # DW_AT_external
+ .byte 1 # DW_AT_accessibility
+ # DW_ACCESS_public
+ .byte 5 # Abbrev [5] 0x3d:0x5 DW_TAG_formal_parameter
+ .long 77 # DW_AT_type
+ # DW_AT_artificial
+ .byte 6 # Abbrev [6] 0x42:0x5 DW_TAG_formal_parameter
+ .long 72 # DW_AT_type
+ .byte 0 # End Of Children Mark
+ .byte 7 # Abbrev [7] 0x48:0x2 DW_TAG_structure_type
+ .byte 6 # DW_AT_name
+ # DW_AT_declaration
+ .byte 0 # End Of Children Mark
+ .byte 0 # End Of Children Mark
+ .byte 0 # End Of Children Mark
+ .byte 8 # Abbrev [8] 0x4d:0x5 DW_TAG_pointer_type
+ .long 39 # DW_AT_type
+ .byte 0 # End Of Children Mark
+.Ldebug_info_end0:
+ .text
+ .globl main # -- Begin function main
+ .p2align 4, 0x90
+ .type main,@function
+main: # @main
+.Lfunc_begin0:
+ .loc 0 14 0 # main.cpp:14:0
+ .cfi_startproc
+# %bb.0: # %entry
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+ subq $16, %rsp
+ movl $0, -4(%rbp)
+.Ltmp0:
+ .loc 0 15 15 prologue_end # main.cpp:15:15
+ leaq -5(%rbp), %rdi
+ callq _ZN1A1B5StateC2Ev
+ .loc 0 16 3 # main.cpp:16:3
+ xorl %eax, %eax
+ .loc 0 16 3 epilogue_begin is_stmt 0 # main.cpp:16:3
+ addq $16, %rsp
+ popq %rbp
+ .cfi_def_cfa %rsp, 8
+ retq
+.Ltmp1:
+.Lfunc_end0:
+ .size main, .Lfunc_end0-main
+ .cfi_endproc
+ # -- End function
+ .section .text._ZN1A1B5StateC2Ev,"axG",@progbits,_ZN1A1B5StateC2Ev,comdat
+ .weak _ZN1A1B5StateC2Ev # -- Begin function _ZN1A1B5StateC2Ev
+ .p2align 4, 0x90
+ .type _ZN1A1B5StateC2Ev,@function
+_ZN1A1B5StateC2Ev: # @_ZN1A1B5StateC2Ev
+.Lfunc_begin1:
+ .loc 0 8 0 is_stmt 1 # main.cpp:8:0
+ .cfi_startproc
+# %bb.0: # %entry
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+ movq %rdi, -8(%rbp)
+.Ltmp2:
+ .loc 0 8 15 prologue_end epilogue_begin # main.cpp:8:15
+ popq %rbp
+ .cfi_def_cfa %rsp, 8
+ retq
+.Ltmp3:
+.Lfunc_end1:
+ .size _ZN1A1B5StateC2Ev, .Lfunc_end1-_ZN1A1B5StateC2Ev
+ .cfi_endproc
+ # -- End function
+ .section .debug_abbrev,"",@progbits
+ .byte 1 # Abbreviation Code
+ .byte 65 # DW_TAG_type_unit
+ .byte 1 # DW_CHILDREN_yes
+ .byte 19 # DW_AT_language
+ .byte 5 # DW_FORM_data2
+ .byte 16 # DW_AT_stmt_list
+ .byte 23 # DW_FORM_sec_offset
+ .byte 114 # DW_AT_str_offsets_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 2 # Abbreviation Code
+ .byte 57 # DW_TAG_namespace
+ .byte 1 # DW_CHILDREN_yes
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 3 # Abbreviation Code
+ .byte 2 # DW_TAG_class_type
+ .byte 1 # DW_CHILDREN_yes
+ .byte 54 # DW_AT_calling_convention
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 4 # Abbreviation Code
+ .byte 46 # DW_TAG_subprogram
+ .byte 1 # DW_CHILDREN_yes
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 60 # DW_AT_declaration
+ .byte 25 # DW_FORM_flag_present
+ .byte 63 # DW_AT_external
+ .byte 25 # DW_FORM_flag_present
+ .byte 50 # DW_AT_accessibility
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 5 # Abbreviation Code
+ .byte 5 # DW_TAG_formal_parameter
+ .byte 0 # DW_CHILDREN_no
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 52 # DW_AT_artificial
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 6 # Abbreviation Code
+ .byte 5 # DW_TAG_formal_parameter
+ .byte 0 # DW_CHILDREN_no
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 7 # Abbreviation Code
+ .byte 19 # DW_TAG_structure_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 60 # DW_AT_declaration
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 8 # Abbreviation Code
+ .byte 15 # DW_TAG_pointer_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 9 # Abbreviation Code
+ .byte 17 # DW_TAG_compile_unit
+ .byte 1 # DW_CHILDREN_yes
+ .byte 37 # DW_AT_producer
+ .byte 37 # DW_FORM_strx1
+ .byte 19 # DW_AT_language
+ .byte 5 # DW_FORM_data2
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 114 # DW_AT_str_offsets_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 16 # DW_AT_stmt_list
+ .byte 23 # DW_FORM_sec_offset
+ .byte 27 # DW_AT_comp_dir
+ .byte 37 # DW_FORM_strx1
+ .byte 17 # DW_AT_low_pc
+ .byte 1 # DW_FORM_addr
+ .byte 85 # DW_AT_ranges
+ .byte 35 # DW_FORM_rnglistx
+ .byte 115 # DW_AT_addr_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 116 # DW_AT_rnglists_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 10 # Abbreviation Code
+ .byte 2 # DW_TAG_class_type
+ .byte 1 # DW_CHILDREN_yes
+ .byte 60 # DW_AT_declaration
+ .byte 25 # DW_FORM_flag_present
+ .byte 105 # DW_AT_signature
+ .byte 32 # DW_FORM_ref_sig8
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 11 # Abbreviation Code
+ .byte 46 # DW_TAG_subprogram
+ .byte 1 # DW_CHILDREN_yes
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 64 # DW_AT_frame_base
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 63 # DW_AT_external
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 12 # Abbreviation Code
+ .byte 52 # DW_TAG_variable
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 13 # Abbreviation Code
+ .byte 46 # DW_TAG_subprogram
+ .byte 1 # DW_CHILDREN_yes
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 64 # DW_AT_frame_base
+ .byte 24 # DW_FORM_exprloc
+ .byte 100 # DW_AT_object_pointer
+ .byte 19 # DW_FORM_ref4
+ .byte 110 # DW_AT_linkage_name
+ .byte 37 # DW_FORM_strx1
+ .byte 71 # DW_AT_specification
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 14 # Abbreviation Code
+ .byte 5 # DW_TAG_formal_parameter
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 52 # DW_AT_artificial
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 15 # Abbreviation Code
+ .byte 36 # DW_TAG_base_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 62 # DW_AT_encoding
+ .byte 11 # DW_FORM_data1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+ .section .debug_info,"",@progbits
+.Lcu_begin0:
+ .long .Ldebug_info_end1-.Ldebug_info_start1 # Length of Unit
+.Ldebug_info_start1:
+ .short 5 # DWARF version number
+ .byte 1 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .byte 9 # Abbrev [9] 0xc:0x7f DW_TAG_compile_unit
+ .byte 0 # DW_AT_producer
+ .short 33 # DW_AT_language
+ .byte 1 # DW_AT_name
+ .long .Lstr_offsets_base0 # DW_AT_str_offsets_base
+ .long .Lline_table_start0 # DW_AT_stmt_list
+ .byte 2 # DW_AT_comp_dir
+ .quad 0 # DW_AT_low_pc
+ .byte 0 # DW_AT_ranges
+ .long .Laddr_table_base0 # DW_AT_addr_base
+ .long .Lrnglists_table_base0 # DW_AT_rnglists_base
+ .byte 2 # Abbrev [2] 0x2b:0x1b DW_TAG_namespace
+ .byte 3 # DW_AT_name
+ .byte 2 # Abbrev [2] 0x2d:0x18 DW_TAG_namespace
+ .byte 4 # DW_AT_name
+ .byte 10 # Abbrev [10] 0x2f:0x15 DW_TAG_class_type
+ # DW_AT_declaration
+ .quad -1782593539102989756 # DW_AT_signature
+ .byte 4 # Abbrev [4] 0x38:0xb DW_TAG_subprogram
+ .byte 5 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 8 # DW_AT_decl_line
+ # DW_AT_declaration
+ # DW_AT_external
+ .byte 1 # DW_AT_accessibility
+ # DW_ACCESS_public
+ .byte 5 # Abbrev [5] 0x3d:0x5 DW_TAG_formal_parameter
+ .long 97 # DW_AT_type
+ # DW_AT_artificial
+ .byte 0 # End Of Children Mark
+ .byte 0 # End Of Children Mark
+ .byte 0 # End Of Children Mark
+ .byte 0 # End Of Children Mark
+ .byte 11 # Abbrev [11] 0x46:0x1b DW_TAG_subprogram
+ .byte 0 # DW_AT_low_pc
+ .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+ .byte 1 # DW_AT_frame_base
+ .byte 86
+ .byte 7 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 14 # DW_AT_decl_line
+ .long 129 # DW_AT_type
+ # DW_AT_external
+ .byte 12 # Abbrev [12] 0x55:0xb DW_TAG_variable
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 123
+ .byte 10 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 15 # DW_AT_decl_line
+ .long 47 # DW_AT_type
+ .byte 0 # End Of Children Mark
+ .byte 8 # Abbrev [8] 0x61:0x5 DW_TAG_pointer_type
+ .long 47 # DW_AT_type
+ .byte 13 # Abbrev [13] 0x66:0x1b DW_TAG_subprogram
+ .byte 1 # DW_AT_low_pc
+ .long .Lfunc_end1-.Lfunc_begin1 # DW_AT_high_pc
+ .byte 1 # DW_AT_frame_base
+ .byte 86
+ .long 119 # DW_AT_object_pointer
+ .byte 9 # DW_AT_linkage_name
+ .long 56 # DW_AT_specification
+ .byte 14 # Abbrev [14] 0x77:0x9 DW_TAG_formal_parameter
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 120
+ .byte 11 # DW_AT_name
+ .long 133 # DW_AT_type
+ # DW_AT_artificial
+ .byte 0 # End Of Children Mark
+ .byte 15 # Abbrev [15] 0x81:0x4 DW_TAG_base_type
+ .byte 8 # DW_AT_name
+ .byte 5 # DW_AT_encoding
+ .byte 4 # DW_AT_byte_size
+ .byte 8 # Abbrev [8] 0x85:0x5 DW_TAG_pointer_type
+ .long 47 # DW_AT_type
+ .byte 0 # End Of Children Mark
+.Ldebug_info_end1:
+ .section .debug_rnglists,"",@progbits
+ .long .Ldebug_list_header_end0-.Ldebug_list_header_start0 # Length
+.Ldebug_list_header_start0:
+ .short 5 # Version
+ .byte 8 # Address size
+ .byte 0 # Segment selector size
+ .long 1 # Offset entry count
+.Lrnglists_table_base0:
+ .long .Ldebug_ranges0-.Lrnglists_table_base0
+.Ldebug_ranges0:
+ .byte 3 # DW_RLE_startx_length
+ .byte 0 # start index
+ .uleb128 .Lfunc_end0-.Lfunc_begin0 # length
+ .byte 3 # DW_RLE_startx_length
+ .byte 1 # start index
+ .uleb128 .Lfunc_end1-.Lfunc_begin1 # length
+ .byte 0 # DW_RLE_end_of_list
+.Ldebug_list_header_end0:
+ .section .debug_str_offsets,"",@progbits
+ .long 52 # Length of String Offsets Set
+ .short 5
+ .short 0
+.Lstr_offsets_base0:
+ .section .debug_str,"MS",@progbits,1
+.Linfo_string0:
+ .asciz "clang version 19.0.0git" # string offset=0
+.Linfo_string1:
+ .asciz "main.cpp" # string offset=24
+.Linfo_string2:
+ .asciz "/home/ayermolo/local/tasks/T190087639/DW_TAG_structure_type" # string offset=33
+.Linfo_string3:
+ .asciz "A" # string offset=93
+.Linfo_string4:
+ .asciz "B" # string offset=95
+.Linfo_string5:
+ .asciz "State" # string offset=97
+.Linfo_string6:
+ .asciz "InnerState" # string offset=103
+.Linfo_string7:
+ .asciz "main" # string offset=114
+.Linfo_string8:
+ .asciz "_ZN1A1B5StateC2Ev" # string offset=119
+.Linfo_string9:
+ .asciz "int" # string offset=137
+.Linfo_string10:
+ .asciz "S" # string offset=141
+.Linfo_string11:
+ .asciz "this" # string offset=143
+ .section .debug_str_offsets,"",@progbits
+ .long .Linfo_string0
+ .long .Linfo_string1
+ .long .Linfo_string2
+ .long .Linfo_string3
+ .long .Linfo_string4
+ .long .Linfo_string5
+ .long .Linfo_string6
+ .long .Linfo_string7
+ .long .Linfo_string9
+ .long .Linfo_string8
+ .long .Linfo_string10
+ .long .Linfo_string11
+ .section .debug_addr,"",@progbits
+ .long .Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution
+.Ldebug_addr_start0:
+ .short 5 # DWARF version number
+ .byte 8 # Address size
+ .byte 0 # Segment selector size
+.Laddr_table_base0:
+ .quad .Lfunc_begin0
+ .quad .Lfunc_begin1
+.Ldebug_addr_end0:
+ .section .debug_names,"",@progbits
+ .long .Lnames_end0-.Lnames_start0 # Header: unit length
+.Lnames_start0:
+ .short 5 # Header: version
+ .short 0 # Header: padding
+ .long 1 # Header: compilation unit count
+ .long 1 # Header: local type unit count
+ .long 0 # Header: foreign type unit count
+ .long 6 # Header: bucket count
+ .long 6 # Header: name count
+ .long .Lnames_abbrev_end0-.Lnames_abbrev_start0 # Header: abbreviation table size
+ .long 8 # Header: augmentation string size
+ .ascii "LLVM0700" # Header: augmentation string
+ .long .Lcu_begin0 # Compilation unit 0
+ .long .Ltu_begin0 # Type unit 0
+ .long 0 # Bucket 0
+ .long 0 # Bucket 1
+ .long 1 # Bucket 2
+ .long 2 # Bucket 3
+ .long 3 # Bucket 4
+ .long 6 # Bucket 5
+ .long 193495088 # Hash in Bucket 2
+ .long 1059643959 # Hash in Bucket 3
+ .long 177670 # Hash in Bucket 4
+ .long 274811398 # Hash in Bucket 4
+ .long 2090499946 # Hash in Bucket 4
+ .long 177671 # Hash in Bucket 5
+ .long .Linfo_string9 # String in Bucket 2: int
+ .long .Linfo_string8 # String in Bucket 3: _ZN1A1B5StateC2Ev
+ .long .Linfo_string3 # String in Bucket 4: A
+ .long .Linfo_string5 # String in Bucket 4: State
+ .long .Linfo_string7 # String in Bucket 4: main
+ .long .Linfo_string4 # String in Bucket 5: B
+ .long .Lnames5-.Lnames_entries0 # Offset in Bucket 2
+ .long .Lnames4-.Lnames_entries0 # Offset in Bucket 3
+ .long .Lnames0-.Lnames_entries0 # Offset in Bucket 4
+ .long .Lnames2-.Lnames_entries0 # Offset in Bucket 4
+ .long .Lnames3-.Lnames_entries0 # Offset in Bucket 4
+ .long .Lnames1-.Lnames_entries0 # Offset in Bucket 5
+.Lnames_abbrev_start0:
+ .byte 1 # Abbrev code
+ .byte 36 # DW_TAG_base_type
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 2 # Abbrev code
+ .byte 46 # DW_TAG_subprogram
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 3 # Abbrev code
+ .byte 57 # DW_TAG_namespace
+ .byte 2 # DW_IDX_type_unit
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 4 # Abbrev code
+ .byte 57 # DW_TAG_namespace
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 5 # Abbrev code
+ .byte 2 # DW_TAG_class_type
+ .byte 2 # DW_IDX_type_unit
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 6 # Abbrev code
+ .byte 57 # DW_TAG_namespace
+ .byte 2 # DW_IDX_type_unit
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 7 # Abbrev code
+ .byte 57 # DW_TAG_namespace
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev list
+.Lnames_abbrev_end0:
+.Lnames_entries0:
+.Lnames5:
+.L2:
+ .byte 1 # Abbreviation code
+ .long 129 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: int
+.Lnames4:
+.L3:
+ .byte 2 # Abbreviation code
+ .long 102 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: _ZN1A1B5StateC2Ev
+.Lnames0:
+.L4:
+ .byte 3 # Abbreviation code
+ .byte 0 # DW_IDX_type_unit
+ .long 35 # DW_IDX_die_offset
+.L7: # DW_IDX_parent
+ .byte 4 # Abbreviation code
+ .long 43 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: A
+.Lnames2:
+.L1:
+ .byte 5 # Abbreviation code
+ .byte 0 # DW_IDX_type_unit
+ .long 39 # DW_IDX_die_offset
+ .long .L5-.Lnames_entries0 # DW_IDX_parent
+ .byte 2 # Abbreviation code
+ .long 102 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: State
+.Lnames3:
+.L0:
+ .byte 2 # Abbreviation code
+ .long 70 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: main
+.Lnames1:
+.L5:
+ .byte 6 # Abbreviation code
+ .byte 0 # DW_IDX_type_unit
+ .long 37 # DW_IDX_die_offset
+ .long .L4-.Lnames_entries0 # DW_IDX_parent
+.L6:
+ .byte 7 # Abbreviation code
+ .long 45 # DW_IDX_die_offset
+ .long .L7-.Lnames_entries0 # DW_IDX_parent
+ .byte 0 # End of list: B
+ .p2align 2, 0x0
+.Lnames_end0:
+ .ident "clang version 19.0.0git"
+ .section ".note.GNU-stack","",@progbits
+ .addrsig
+ .section .debug_line,"",@progbits
+.Lline_table_start0:
diff --git a/bolt/test/X86/ignored-interprocedural-reference.s b/bolt/test/X86/ignored-interprocedural-reference.s
new file mode 100644
index 0000000..12e4fb9
--- /dev/null
+++ b/bolt/test/X86/ignored-interprocedural-reference.s
@@ -0,0 +1,49 @@
+# This reproduces a bug where interprocedural references from ignored
+# functions were not processed.
+
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o
+# RUN: %clang %cflags %t.o -o %t.exe -nostdlib -Wl,-q
+# RUN: llvm-bolt %t.exe -o %t.out --enable-bat -funcs=main
+# RUN: link_fdata %s %t.out %t.preagg PREAGG
+# RUN: perf2bolt %t.out -p %t.preagg --pa -o %t.fdata -w %t.yaml
+# RUN: FileCheck %s --input-file=%t.fdata --check-prefix=CHECK-FDATA
+# RUN: FileCheck %s --input-file=%t.yaml --check-prefix=CHECK-YAML
+
+# CHECK-FDATA: 1 main 0 1 foo a 1 1
+# CHECK-YAML: name: main
+# CHECK-YAML: calls: {{.*}} disc: 1
+
+# PREAGG: B #main# #foo_secondary# 1 1
+# main calls foo at a valid instruction offset past nops that are to be stripped.
+ .globl main
+main:
+ .cfi_startproc
+ call foo_secondary
+ ret
+ .cfi_endproc
+.size main,.-main
+
+# Placeholder cold fragment to force main to be ignored in non-relocation mode.
+ .globl main.cold
+main.cold:
+ .cfi_startproc
+ ud2
+ .cfi_endproc
+.size main.cold,.-main.cold
+
+# foo is set up to contain a valid instruction at the called offset, and
+# trapping instructions past it.
+ .globl foo
+foo:
+ .cfi_startproc
+ .nops 10
+ .globl foo_secondary
+foo_secondary:
+ ret
+ .rept 20
+ int3
+ .endr
+ .cfi_endproc
+.size foo,.-foo
diff --git a/bolt/test/X86/register-fragments-bolt-symbols.s b/bolt/test/X86/register-fragments-bolt-symbols.s
index 6478adf..90c402b 100644
--- a/bolt/test/X86/register-fragments-bolt-symbols.s
+++ b/bolt/test/X86/register-fragments-bolt-symbols.s
@@ -18,6 +18,11 @@
# RUN: FileCheck --input-file %t.bat.fdata --check-prefix=CHECK-FDATA %s
# RUN: FileCheck --input-file %t.bat.yaml --check-prefix=CHECK-YAML %s
+# RUN: link_fdata --no-redefine %s %t.bolt %t.preagg2 PREAGG2
+# PREAGG2: B X:0 #chain# 1 0
+# RUN: perf2bolt %t.bolt -p %t.preagg2 --pa -o %t.bat2.fdata -w %t.bat2.yaml
+# RUN: FileCheck %s --input-file %t.bat2.yaml --check-prefix=CHECK-YAML2
+
# CHECK-SYMS: l df *ABS* [[#]] chain.s
# CHECK-SYMS: l F .bolt.org.text [[#]] chain
# CHECK-SYMS: l F .text.cold [[#]] chain.cold.0
@@ -28,6 +33,9 @@
# CHECK-FDATA: 0 [unknown] 0 1 chain/chain.s/2 10 0 1
# CHECK-YAML: - name: 'chain/chain.s/2'
+# CHECK-YAML2: - name: 'chain/chain.s/1'
+## non-BAT function has non-zero insns:
+# CHECK-YAML2: insns: 1
.file "chain.s"
.text
diff --git a/bolt/test/link_fdata.py b/bolt/test/link_fdata.py
index 0232dd3..3837e39 100755
--- a/bolt/test/link_fdata.py
+++ b/bolt/test/link_fdata.py
@@ -19,6 +19,7 @@ parser.add_argument("output")
parser.add_argument("prefix", nargs="?", default="FDATA", help="Custom FDATA prefix")
parser.add_argument("--nmtool", default="nm", help="Path to nm tool")
parser.add_argument("--no-lbr", action="store_true")
+parser.add_argument("--no-redefine", action="store_true")
args = parser.parse_args()
@@ -90,6 +91,8 @@ nm_output = subprocess.run(
symbols = {}
for symline in nm_output.splitlines():
symval, _, symname = symline.split(maxsplit=2)
+ if symname in symbols and args.no_redefine:
+ continue
symbols[symname] = symval
diff --git a/clang-tools-extra/clang-tidy/bugprone/ForwardingReferenceOverloadCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ForwardingReferenceOverloadCheck.cpp
index 36687a8..c87b3ea 100644
--- a/clang-tools-extra/clang-tidy/bugprone/ForwardingReferenceOverloadCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/ForwardingReferenceOverloadCheck.cpp
@@ -54,7 +54,9 @@ AST_MATCHER(QualType, isEnableIf) {
AST_MATCHER_P(TemplateTypeParmDecl, hasDefaultArgument,
clang::ast_matchers::internal::Matcher<QualType>, TypeMatcher) {
return Node.hasDefaultArgument() &&
- TypeMatcher.matches(Node.getDefaultArgument(), Finder, Builder);
+ TypeMatcher.matches(
+ Node.getDefaultArgument().getArgument().getAsType(), Finder,
+ Builder);
}
AST_MATCHER(TemplateDecl, hasAssociatedConstraints) {
return Node.hasAssociatedConstraints();
diff --git a/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableIfCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableIfCheck.cpp
index 09aaf3e..75f1107 100644
--- a/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableIfCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableIfCheck.cpp
@@ -19,10 +19,11 @@ namespace {
AST_MATCHER_P(TemplateTypeParmDecl, hasUnnamedDefaultArgument,
ast_matchers::internal::Matcher<TypeLoc>, InnerMatcher) {
if (Node.getIdentifier() != nullptr || !Node.hasDefaultArgument() ||
- Node.getDefaultArgumentInfo() == nullptr)
+ Node.getDefaultArgument().getArgument().isNull())
return false;
- TypeLoc DefaultArgTypeLoc = Node.getDefaultArgumentInfo()->getTypeLoc();
+ TypeLoc DefaultArgTypeLoc =
+ Node.getDefaultArgument().getTypeSourceInfo()->getTypeLoc();
return InnerMatcher.matches(DefaultArgTypeLoc, Finder, Builder);
}
diff --git a/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp
index a1cffbc..5e64d23 100644
--- a/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp
@@ -144,16 +144,13 @@ void SizeofExpressionCheck::registerMatchers(MatchFinder *Finder) {
unaryOperator(hasUnaryOperand(ArrayExpr), unless(hasOperatorName("*"))),
binaryOperator(hasEitherOperand(ArrayExpr)),
castExpr(hasSourceExpression(ArrayExpr))));
- const auto PointerToArrayExpr = ignoringParenImpCasts(
- hasType(hasCanonicalType(pointerType(pointee(arrayType())))));
+ const auto PointerToArrayExpr =
+ hasType(hasCanonicalType(pointerType(pointee(arrayType()))));
- const auto StructAddrOfExpr = unaryOperator(
- hasOperatorName("&"), hasUnaryOperand(ignoringParenImpCasts(
- hasType(hasCanonicalType(recordType())))));
const auto PointerToStructType =
hasUnqualifiedDesugaredType(pointerType(pointee(recordType())));
- const auto PointerToStructExpr = ignoringParenImpCasts(expr(
- hasType(hasCanonicalType(PointerToStructType)), unless(cxxThisExpr())));
+ const auto PointerToStructExpr = expr(
+ hasType(hasCanonicalType(PointerToStructType)), unless(cxxThisExpr()));
const auto ArrayOfPointersExpr = ignoringParenImpCasts(
hasType(hasCanonicalType(arrayType(hasElementType(pointerType()))
@@ -166,18 +163,19 @@ void SizeofExpressionCheck::registerMatchers(MatchFinder *Finder) {
ignoringParenImpCasts(arraySubscriptExpr(
hasBase(ArrayOfSamePointersExpr), hasIndex(ZeroLiteral)));
const auto ArrayLengthExprDenom =
- expr(hasParent(expr(ignoringParenImpCasts(binaryOperator(
- hasOperatorName("/"), hasLHS(ignoringParenImpCasts(sizeOfExpr(
- has(ArrayOfPointersExpr)))))))),
+ expr(hasParent(binaryOperator(hasOperatorName("/"),
+ hasLHS(ignoringParenImpCasts(sizeOfExpr(
+ has(ArrayOfPointersExpr)))))),
sizeOfExpr(has(ArrayOfSamePointersZeroSubscriptExpr)));
- Finder->addMatcher(expr(anyOf(sizeOfExpr(has(ignoringParenImpCasts(anyOf(
- ArrayCastExpr, PointerToArrayExpr,
- StructAddrOfExpr, PointerToStructExpr)))),
- sizeOfExpr(has(PointerToStructType))),
- unless(ArrayLengthExprDenom))
- .bind("sizeof-pointer-to-aggregate"),
- this);
+ Finder->addMatcher(
+ expr(sizeOfExpr(anyOf(
+ has(ignoringParenImpCasts(anyOf(
+ ArrayCastExpr, PointerToArrayExpr, PointerToStructExpr))),
+ has(PointerToStructType))),
+ unless(ArrayLengthExprDenom))
+ .bind("sizeof-pointer-to-aggregate"),
+ this);
}
// Detect expression like: sizeof(expr) <= k for a suspicious constant 'k'.
diff --git a/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp
index 7a021fe..ea4d995 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp
@@ -177,9 +177,11 @@ matchTrailingTemplateParam(const FunctionTemplateDecl *FunctionTemplate) {
dyn_cast<TemplateTypeParmDecl>(LastParam)) {
if (LastTemplateParam->hasDefaultArgument() &&
LastTemplateParam->getIdentifier() == nullptr) {
- return {matchEnableIfSpecialization(
- LastTemplateParam->getDefaultArgumentInfo()->getTypeLoc()),
- LastTemplateParam};
+ return {
+ matchEnableIfSpecialization(LastTemplateParam->getDefaultArgument()
+ .getTypeSourceInfo()
+ ->getTypeLoc()),
+ LastTemplateParam};
}
}
return {};
diff --git a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
index 74152c60..28f5ead 100644
--- a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
@@ -50,7 +50,9 @@ StringRef getZeroLiteralToCompareWithForType(CastKind CastExprKind,
case CK_PointerToBoolean:
case CK_MemberPointerToBoolean: // Fall-through on purpose.
- return Context.getLangOpts().CPlusPlus11 ? "nullptr" : "0";
+ return (Context.getLangOpts().CPlusPlus11 || Context.getLangOpts().C23)
+ ? "nullptr"
+ : "0";
default:
llvm_unreachable("Unexpected cast kind");
@@ -165,6 +167,12 @@ bool needsSpacePrefix(SourceLocation Loc, ASTContext &Context) {
void fixGenericExprCastFromBool(DiagnosticBuilder &Diag,
const ImplicitCastExpr *Cast,
ASTContext &Context, StringRef OtherType) {
+ if (!Context.getLangOpts().CPlusPlus) {
+ Diag << FixItHint::CreateInsertion(Cast->getBeginLoc(),
+ (Twine("(") + OtherType + ")").str());
+ return;
+ }
+
const Expr *SubExpr = Cast->getSubExpr();
const bool NeedParens = !isa<ParenExpr>(SubExpr->IgnoreImplicit());
const bool NeedSpace = needsSpacePrefix(Cast->getBeginLoc(), Context);
@@ -267,6 +275,10 @@ void ImplicitBoolConversionCheck::registerMatchers(MatchFinder *Finder) {
auto BoolXor =
binaryOperator(hasOperatorName("^"), hasLHS(ImplicitCastFromBool),
hasRHS(ImplicitCastFromBool));
+ auto ComparisonInCall = allOf(
+ hasParent(callExpr()),
+ hasSourceExpression(binaryOperator(hasAnyOperatorName("==", "!="))));
+
Finder->addMatcher(
traverse(TK_AsIs,
implicitCastExpr(
@@ -281,6 +293,8 @@ void ImplicitBoolConversionCheck::registerMatchers(MatchFinder *Finder) {
stmt(anyOf(ifStmt(), whileStmt()), has(declStmt())))),
// Exclude cases common to implicit cast to and from bool.
unless(ExceptionCases), unless(has(BoolXor)),
+ // Exclude C23 cases common to implicit cast to bool.
+ unless(ComparisonInCall),
// Retrieve also parent statement, to check if we need
// additional parens in replacement.
optionally(hasParent(stmt().bind("parentStmt"))),
diff --git a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp
index e811f55..88e4886 100644
--- a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp
+++ b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp
@@ -123,6 +123,9 @@ static const NamedDecl *getFailureForNamedDecl(const NamedDecl *ND) {
if (const auto *Method = dyn_cast<CXXMethodDecl>(ND)) {
if (const CXXMethodDecl *Overridden = getOverrideMethod(Method))
Canonical = cast<NamedDecl>(Overridden->getCanonicalDecl());
+ else if (const FunctionTemplateDecl *Primary = Method->getPrimaryTemplate())
+ if (const FunctionDecl *TemplatedDecl = Primary->getTemplatedDecl())
+ Canonical = cast<NamedDecl>(TemplatedDecl->getCanonicalDecl());
if (Canonical != ND)
return Canonical;
diff --git a/clang-tools-extra/clangd/Hover.cpp b/clang-tools-extra/clangd/Hover.cpp
index 06b949b..de103e0 100644
--- a/clang-tools-extra/clangd/Hover.cpp
+++ b/clang-tools-extra/clangd/Hover.cpp
@@ -247,8 +247,12 @@ fetchTemplateParameters(const TemplateParameterList *Params,
if (!TTP->getName().empty())
P.Name = TTP->getNameAsString();
- if (TTP->hasDefaultArgument())
- P.Default = TTP->getDefaultArgument().getAsString(PP);
+ if (TTP->hasDefaultArgument()) {
+ P.Default.emplace();
+ llvm::raw_string_ostream Out(*P.Default);
+ TTP->getDefaultArgument().getArgument().print(PP, Out,
+ /*IncludeType=*/false);
+ }
} else if (const auto *NTTP = dyn_cast<NonTypeTemplateParmDecl>(Param)) {
P.Type = printType(NTTP, PP);
@@ -258,7 +262,8 @@ fetchTemplateParameters(const TemplateParameterList *Params,
if (NTTP->hasDefaultArgument()) {
P.Default.emplace();
llvm::raw_string_ostream Out(*P.Default);
- NTTP->getDefaultArgument()->printPretty(Out, nullptr, PP);
+ NTTP->getDefaultArgument().getArgument().print(PP, Out,
+ /*IncludeType=*/false);
}
} else if (const auto *TTPD = dyn_cast<TemplateTemplateParmDecl>(Param)) {
P.Type = printType(TTPD, PP);
diff --git a/clang-tools-extra/clangd/test/infinite-instantiation.test b/clang-tools-extra/clangd/test/infinite-instantiation.test
index d379a9c..a9c787c 100644
--- a/clang-tools-extra/clangd/test/infinite-instantiation.test
+++ b/clang-tools-extra/clangd/test/infinite-instantiation.test
@@ -1,5 +1,5 @@
// RUN: rm -rf %t.dir && mkdir -p %t.dir
-// RUN: echo '[{"directory": "%/t.dir", "command": "clang -ftemplate-depth=100 -x c++ %s", "file": "%/s"}]' > %t.dir/compile_commands.json
+// RUN: echo '[{"directory": "%/t.dir", "command": "clang -ftemplate-depth=100 -x c++ %/s", "file": "%/s"}]' > %t.dir/compile_commands.json
// RUN: not clangd --compile-commands-dir=%t.dir -check=%s 2>&1 | FileCheck -strict-whitespace %s
// CHECK: [template_recursion_depth_exceeded]
diff --git a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp
index 0b2273f..3220a5a 100644
--- a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp
+++ b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp
@@ -836,7 +836,9 @@ TEST_F(TargetDeclTest, OverloadExpr) {
[[delete]] x;
}
)cpp";
- EXPECT_DECLS("CXXDeleteExpr", "void operator delete(void *) noexcept");
+ // Sized deallocation is enabled by default in C++14 onwards.
+ EXPECT_DECLS("CXXDeleteExpr",
+ "void operator delete(void *, unsigned long) noexcept");
}
TEST_F(TargetDeclTest, DependentExprs) {
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 6a9892b..3e3195f 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -375,12 +375,15 @@ Changes in existing checks
<clang-tidy/checks/readability/identifier-naming>` check in `GetConfigPerFile`
mode by resolving symbolic links to header files. Fixed handling of Hungarian
Prefix when configured to `LowerCase`. Added support for renaming designated
- initializers. Added support for renaming macro arguments.
+ initializers. Added support for renaming macro arguments. Fixed renaming
+ conflicts arising from out-of-line member function template definitions.
- Improved :doc:`readability-implicit-bool-conversion
<clang-tidy/checks/readability/implicit-bool-conversion>` check to provide
valid fix suggestions for ``static_cast`` without a preceding space and
- fixed problem with duplicate parentheses in double implicit casts.
+ fixed problem with duplicate parentheses in double implicit casts. Corrected
+ the fix suggestions for C23 and later by using C-style casts instead of
+ ``static_cast``.
- Improved :doc:`readability-redundant-inline-specifier
<clang-tidy/checks/readability/redundant-inline-specifier>` check to properly
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/implicit-bool-conversion.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/implicit-bool-conversion.rst
index 1ea67a0..1ab21ff 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/readability/implicit-bool-conversion.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/readability/implicit-bool-conversion.rst
@@ -96,8 +96,8 @@ The rules for generating fix-it hints are:
- ``if (!pointer)`` is changed to ``if (pointer == nullptr)``,
- in case of conversions from bool to other built-in types, an explicit
- ``static_cast`` is proposed to make it clear that a conversion is taking
- place:
+ ``static_cast`` (or a C-style cast since C23) is proposed to make it clear
+ that a conversion is taking place:
- ``int integer = boolean;`` is changed to
``int integer = static_cast<int>(boolean);``,
diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/new-delete-overloads.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/new-delete-overloads.cpp
index 78f0211..f86fe8a 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/misc/new-delete-overloads.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/misc/new-delete-overloads.cpp
@@ -12,16 +12,6 @@ struct S {
// CHECK-MESSAGES: :[[@LINE+1]]:7: warning: declaration of 'operator new' has no matching declaration of 'operator delete' at the same scope
void *operator new(size_t size) noexcept(false);
-struct T {
- // Sized deallocations are not enabled by default, and so this new/delete pair
- // does not match. However, we expect only one warning, for the new, because
- // the operator delete is a placement delete and we do not warn on mismatching
- // placement operations.
- // CHECK-MESSAGES: :[[@LINE+1]]:9: warning: declaration of 'operator new' has no matching declaration of 'operator delete' at the same scope
- void *operator new(size_t size) noexcept;
- void operator delete(void *ptr, size_t) noexcept; // ok only if sized deallocation is enabled
-};
-
struct U {
void *operator new(size_t size) noexcept;
void operator delete(void *ptr) noexcept;
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming-outofline.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming-outofline.cpp
new file mode 100644
index 0000000..f807875
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming-outofline.cpp
@@ -0,0 +1,30 @@
+// RUN: %check_clang_tidy %s readability-identifier-naming %t -std=c++20 \
+// RUN: --config='{CheckOptions: { \
+// RUN: readability-identifier-naming.MethodCase: CamelCase, \
+// RUN: }}'
+
+namespace SomeNamespace {
+namespace Inner {
+
+class SomeClass {
+public:
+ template <typename T>
+ int someMethod();
+// CHECK-MESSAGES: :[[@LINE-1]]:9: warning: invalid case style for method 'someMethod' [readability-identifier-naming]
+// CHECK-FIXES: {{^}} int SomeMethod();
+};
+template <typename T>
+int SomeClass::someMethod() {
+// CHECK-FIXES: {{^}}int SomeClass::SomeMethod() {
+ return 5;
+}
+
+} // namespace Inner
+
+void someFunc() {
+ Inner::SomeClass S;
+ S.someMethod<int>();
+// CHECK-FIXES: {{^}} S.SomeMethod<int>();
+}
+
+} // namespace SomeNamespace
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion.c b/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion.c
new file mode 100644
index 0000000..a8c6985
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion.c
@@ -0,0 +1,354 @@
+// RUN: %check_clang_tidy %s readability-implicit-bool-conversion %t -- -- -std=c23
+
+#undef NULL
+#define NULL 0L
+
+void functionTakingBool(bool);
+void functionTakingInt(int);
+void functionTakingUnsignedLong(unsigned long);
+void functionTakingChar(char);
+void functionTakingFloat(float);
+void functionTakingDouble(double);
+void functionTakingSignedChar(signed char);
+
+
+////////// Implicit conversion from bool.
+
+void implicitConversionFromBoolSimpleCases() {
+ bool boolean = true;
+
+ functionTakingBool(boolean);
+
+ functionTakingInt(boolean);
+ // CHECK-MESSAGES: :[[@LINE-1]]:21: warning: implicit conversion 'bool' -> 'int' [readability-implicit-bool-conversion]
+ // CHECK-FIXES: functionTakingInt((int)boolean);
+
+ functionTakingUnsignedLong(boolean);
+ // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: implicit conversion 'bool' -> 'unsigned long'
+ // CHECK-FIXES: functionTakingUnsignedLong((unsigned long)boolean);
+
+ functionTakingChar(boolean);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'bool' -> 'char'
+ // CHECK-FIXES: functionTakingChar((char)boolean);
+
+ functionTakingFloat(boolean);
+ // CHECK-MESSAGES: :[[@LINE-1]]:23: warning: implicit conversion 'bool' -> 'float'
+ // CHECK-FIXES: functionTakingFloat((float)boolean);
+
+ functionTakingDouble(boolean);
+ // CHECK-MESSAGES: :[[@LINE-1]]:24: warning: implicit conversion 'bool' -> 'double'
+ // CHECK-FIXES: functionTakingDouble((double)boolean);
+}
+
+float implicitConversionFromBoolInReturnValue() {
+ bool boolean = false;
+ return boolean;
+ // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: implicit conversion 'bool' -> 'float'
+ // CHECK-FIXES: return (float)boolean;
+}
+
+void implicitConversionFromBoolInSingleBoolExpressions(bool b1, bool b2) {
+ bool boolean = true;
+ boolean = b1 ^ b2;
+ boolean |= !b1 || !b2;
+ boolean &= b1;
+
+ int integer = boolean - 3;
+ // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: implicit conversion 'bool' -> 'int'
+ // CHECK-FIXES: int integer = (int)boolean - 3;
+
+ float floating = boolean / 0.3f;
+ // CHECK-MESSAGES: :[[@LINE-1]]:20: warning: implicit conversion 'bool' -> 'float'
+ // CHECK-FIXES: float floating = (float)boolean / 0.3f;
+
+ char character = boolean;
+ // CHECK-MESSAGES: :[[@LINE-1]]:20: warning: implicit conversion 'bool' -> 'char'
+ // CHECK-FIXES: char character = (char)boolean;
+}
+
+void implicitConversionFromBoolInComplexBoolExpressions() {
+ bool boolean = true;
+ bool anotherBoolean = false;
+
+ int integer = boolean && anotherBoolean;
+ // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: implicit conversion 'bool' -> 'int'
+ // CHECK-MESSAGES: :[[@LINE-2]]:28: warning: implicit conversion 'bool' -> 'int'
+ // CHECK-FIXES: int integer = (int)boolean && (int)anotherBoolean;
+
+ float floating = (boolean || anotherBoolean) * 0.3f;
+ // CHECK-MESSAGES: :[[@LINE-1]]:21: warning: implicit conversion 'bool' -> 'int'
+ // CHECK-MESSAGES: :[[@LINE-2]]:32: warning: implicit conversion 'bool' -> 'int'
+ // CHECK-FIXES: float floating = ((int)boolean || (int)anotherBoolean) * 0.3f;
+
+ double doubleFloating = (boolean && (anotherBoolean || boolean)) * 0.3;
+ // CHECK-MESSAGES: :[[@LINE-1]]:28: warning: implicit conversion 'bool' -> 'int'
+ // CHECK-MESSAGES: :[[@LINE-2]]:40: warning: implicit conversion 'bool' -> 'int'
+ // CHECK-MESSAGES: :[[@LINE-3]]:58: warning: implicit conversion 'bool' -> 'int'
+ // CHECK-FIXES: double doubleFloating = ((int)boolean && ((int)anotherBoolean || (int)boolean)) * 0.3;
+}
+
+void implicitConversionFromBoolLiterals() {
+ functionTakingInt(true);
+ // CHECK-MESSAGES: :[[@LINE-1]]:21: warning: implicit conversion 'bool' -> 'int'
+ // CHECK-FIXES: functionTakingInt(1);
+
+ functionTakingUnsignedLong(false);
+ // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: implicit conversion 'bool' -> 'unsigned long'
+ // CHECK-FIXES: functionTakingUnsignedLong(0u);
+
+ functionTakingSignedChar(true);
+ // CHECK-MESSAGES: :[[@LINE-1]]:28: warning: implicit conversion 'bool' -> 'signed char'
+ // CHECK-FIXES: functionTakingSignedChar(1);
+
+ functionTakingFloat(false);
+ // CHECK-MESSAGES: :[[@LINE-1]]:23: warning: implicit conversion 'bool' -> 'float'
+ // CHECK-FIXES: functionTakingFloat(0.0f);
+
+ functionTakingDouble(true);
+ // CHECK-MESSAGES: :[[@LINE-1]]:24: warning: implicit conversion 'bool' -> 'double'
+ // CHECK-FIXES: functionTakingDouble(1.0);
+}
+
+void implicitConversionFromBoolInComparisons() {
+ bool boolean = true;
+ int integer = 0;
+
+ functionTakingBool(boolean == integer);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'bool' -> 'int'
+ // CHECK-FIXES: functionTakingBool((int)boolean == integer);
+
+ functionTakingBool(integer != boolean);
+ // CHECK-MESSAGES: :[[@LINE-1]]:33: warning: implicit conversion 'bool' -> 'int'
+ // CHECK-FIXES: functionTakingBool(integer != (int)boolean);
+}
+
+void ignoreBoolComparisons() {
+ bool boolean = true;
+ bool anotherBoolean = false;
+
+ functionTakingBool(boolean == anotherBoolean);
+ functionTakingBool(boolean != anotherBoolean);
+}
+
+void ignoreExplicitCastsFromBool() {
+ bool boolean = true;
+
+ int integer = (int)boolean + 3;
+ float floating = (float)boolean * 0.3f;
+ char character = (char)boolean;
+}
+
+void ignoreImplicitConversionFromBoolInMacroExpansions() {
+ bool boolean = true;
+
+ #define CAST_FROM_BOOL_IN_MACRO_BODY boolean + 3
+ int integerFromMacroBody = CAST_FROM_BOOL_IN_MACRO_BODY;
+
+ #define CAST_FROM_BOOL_IN_MACRO_ARGUMENT(x) x + 3
+ int integerFromMacroArgument = CAST_FROM_BOOL_IN_MACRO_ARGUMENT(boolean);
+}
+
+////////// Implicit conversions to bool.
+
+void implicitConversionToBoolSimpleCases() {
+ int integer = 10;
+ functionTakingBool(integer);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'int' -> 'bool'
+ // CHECK-FIXES: functionTakingBool(integer != 0);
+
+ unsigned long unsignedLong = 10;
+ functionTakingBool(unsignedLong);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'unsigned long' -> 'bool'
+ // CHECK-FIXES: functionTakingBool(unsignedLong != 0u);
+
+ float floating = 0.0f;
+ functionTakingBool(floating);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'float' -> 'bool'
+ // CHECK-FIXES: functionTakingBool(floating != 0.0f);
+
+ double doubleFloating = 1.0f;
+ functionTakingBool(doubleFloating);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'double' -> 'bool'
+ // CHECK-FIXES: functionTakingBool(doubleFloating != 0.0);
+
+ signed char character = 'a';
+ functionTakingBool(character);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'signed char' -> 'bool'
+ // CHECK-FIXES: functionTakingBool(character != 0);
+
+ int* pointer = nullptr;
+ functionTakingBool(pointer);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'int *' -> 'bool'
+ // CHECK-FIXES: functionTakingBool(pointer != nullptr);
+}
+
+void implicitConversionToBoolInSingleExpressions() {
+ int integer = 10;
+ bool boolComingFromInt;
+ boolComingFromInt = integer;
+ // CHECK-MESSAGES: :[[@LINE-1]]:23: warning: implicit conversion 'int' -> 'bool'
+ // CHECK-FIXES: boolComingFromInt = (integer != 0);
+
+ float floating = 10.0f;
+ bool boolComingFromFloat;
+ boolComingFromFloat = floating;
+ // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: implicit conversion 'float' -> 'bool'
+ // CHECK-FIXES: boolComingFromFloat = (floating != 0.0f);
+
+ signed char character = 'a';
+ bool boolComingFromChar;
+ boolComingFromChar = character;
+ // CHECK-MESSAGES: :[[@LINE-1]]:24: warning: implicit conversion 'signed char' -> 'bool'
+ // CHECK-FIXES: boolComingFromChar = (character != 0);
+
+ int* pointer = nullptr;
+ bool boolComingFromPointer;
+ boolComingFromPointer = pointer;
+ // CHECK-MESSAGES: :[[@LINE-1]]:27: warning: implicit conversion 'int *' -> 'bool'
+ // CHECK-FIXES: boolComingFromPointer = (pointer != nullptr);
+}
+
+void implicitConversionToBoolInComplexExpressions() {
+ bool boolean = true;
+
+ int integer = 10;
+ int anotherInteger = 20;
+ bool boolComingFromInteger;
+ boolComingFromInteger = integer + anotherInteger;
+ // CHECK-MESSAGES: :[[@LINE-1]]:27: warning: implicit conversion 'int' -> 'bool'
+ // CHECK-FIXES: boolComingFromInteger = ((integer + anotherInteger) != 0);
+}
+
+void implicitConversionInNegationExpressions() {
+ int integer = 10;
+ bool boolComingFromNegatedInt;
+ boolComingFromNegatedInt = !integer;
+ // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: implicit conversion 'int' -> 'bool'
+ // CHECK-FIXES: boolComingFromNegatedInt = ((!integer) != 0);
+}
+
+bool implicitConversionToBoolInReturnValue() {
+ float floating = 1.0f;
+ return floating;
+ // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: implicit conversion 'float' -> 'bool'
+ // CHECK-FIXES: return floating != 0.0f;
+}
+
+void implicitConversionToBoolFromLiterals() {
+ functionTakingBool(0);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'int' -> 'bool'
+ // CHECK-FIXES: functionTakingBool(false);
+
+ functionTakingBool(1);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'int' -> 'bool'
+ // CHECK-FIXES: functionTakingBool(true);
+
+ functionTakingBool(2ul);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'unsigned long' -> 'bool'
+ // CHECK-FIXES: functionTakingBool(true);
+
+ functionTakingBool(0.0f);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'float' -> 'bool'
+ // CHECK-FIXES: functionTakingBool(false);
+
+ functionTakingBool(1.0f);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'float' -> 'bool'
+ // CHECK-FIXES: functionTakingBool(true);
+
+ functionTakingBool(2.0);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'double' -> 'bool'
+ // CHECK-FIXES: functionTakingBool(true);
+
+ functionTakingBool('\0');
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'int' -> 'bool'
+ // CHECK-FIXES: functionTakingBool(false);
+
+ functionTakingBool('a');
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'int' -> 'bool'
+ // CHECK-FIXES: functionTakingBool(true);
+
+ functionTakingBool("");
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'char *' -> 'bool'
+ // CHECK-FIXES: functionTakingBool(true);
+
+ functionTakingBool("abc");
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'char *' -> 'bool'
+ // CHECK-FIXES: functionTakingBool(true);
+
+ functionTakingBool(NULL);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'long' -> 'bool'
+ // CHECK-FIXES: functionTakingBool(false);
+}
+
+void implicitConversionToBoolFromUnaryMinusAndZeroLiterals() {
+ functionTakingBool(-0);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'int' -> 'bool'
+ // CHECK-FIXES: functionTakingBool((-0) != 0);
+
+ functionTakingBool(-0.0f);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'float' -> 'bool'
+ // CHECK-FIXES: functionTakingBool((-0.0f) != 0.0f);
+
+ functionTakingBool(-0.0);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'double' -> 'bool'
+ // CHECK-FIXES: functionTakingBool((-0.0) != 0.0);
+}
+
+void ignoreExplicitCastsToBool() {
+ int integer = 10;
+ bool boolComingFromInt = (bool)integer;
+
+ float floating = 10.0f;
+ bool boolComingFromFloat = (bool)floating;
+
+ char character = 'a';
+ bool boolComingFromChar = (bool)character;
+
+ int* pointer = nullptr;
+ bool booleanComingFromPointer = (bool)pointer;
+}
+
+void ignoreImplicitConversionToBoolInMacroExpansions() {
+ int integer = 3;
+
+ #define CAST_TO_BOOL_IN_MACRO_BODY integer && false
+ bool boolFromMacroBody = CAST_TO_BOOL_IN_MACRO_BODY;
+
+ #define CAST_TO_BOOL_IN_MACRO_ARGUMENT(x) x || true
+ bool boolFromMacroArgument = CAST_TO_BOOL_IN_MACRO_ARGUMENT(integer);
+}
+
+int implicitConversionReturnInt()
+{
+ return true;
+ // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: implicit conversion 'bool' -> 'int'
+ // CHECK-FIXES: return 1
+}
+
+int implicitConversionReturnIntWithParens()
+{
+ return (true);
+ // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: implicit conversion 'bool' -> 'int'
+ // CHECK-FIXES: return 1
+}
+
+bool implicitConversionReturnBool()
+{
+ return 1;
+ // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: implicit conversion 'int' -> 'bool'
+ // CHECK-FIXES: return true
+}
+
+bool implicitConversionReturnBoolWithParens()
+{
+ return (1);
+ // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: implicit conversion 'int' -> 'bool'
+ // CHECK-FIXES: return true
+}
+
+int keepCompactReturnInC_PR71848() {
+ bool foo = false;
+ return( foo );
+// CHECK-MESSAGES: :[[@LINE-1]]:9: warning: implicit conversion 'bool' -> 'int' [readability-implicit-bool-conversion]
+// CHECK-FIXES: return(int)( foo );
+}
diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt
index c20ce47..a6bcb85 100644
--- a/clang/CMakeLists.txt
+++ b/clang/CMakeLists.txt
@@ -349,10 +349,7 @@ if (LLVM_COMPILER_IS_GCC_COMPATIBLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pedantic -Wno-long-long")
endif ()
- check_cxx_compiler_flag("-Werror -Wnested-anon-types" CXX_SUPPORTS_NO_NESTED_ANON_TYPES_FLAG)
- if( CXX_SUPPORTS_NO_NESTED_ANON_TYPES_FLAG )
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-nested-anon-types" )
- endif()
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-nested-anon-types" )
endif ()
# Determine HOST_LINK_VERSION on Darwin.
diff --git a/clang/cmake/caches/CrossWinToARMLinux.cmake b/clang/cmake/caches/CrossWinToARMLinux.cmake
index 736a54e..62e87c6 100644
--- a/clang/cmake/caches/CrossWinToARMLinux.cmake
+++ b/clang/cmake/caches/CrossWinToARMLinux.cmake
@@ -89,6 +89,13 @@ endif()
message(STATUS "Toolchain target to build: ${LLVM_TARGETS_TO_BUILD}")
+# Allow overriding the libc++ ABI version. Use 2 by default.
+if (NOT DEFINED LIBCXX_ABI_VERSION)
+ set(LIBCXX_ABI_VERSION 2)
+endif()
+
+message(STATUS "Toolchain's Libc++ ABI version: ${LIBCXX_ABI_VERSION}")
+
if (NOT DEFINED CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "Release" CACHE STRING "")
endif()
@@ -109,8 +116,15 @@ set(CLANG_DEFAULT_OBJCOPY "llvm-objcopy" CACHE STRING "")
set(CLANG_DEFAULT_RTLIB "compiler-rt" CACHE STRING "")
set(CLANG_DEFAULT_UNWINDLIB "libunwind" CACHE STRING "")
-if(WIN32)
- set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded" CACHE STRING "")
+if (NOT DEFINED CMAKE_MSVC_RUNTIME_LIBRARY AND WIN32)
+ # Note: Always specify the MT DLL runtime for LLDB build configurations on a Windows host.
+ if (CMAKE_BUILD_TYPE STREQUAL "Debug")
+ set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDebugDLL" CACHE STRING "")
+ else()
+ set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDLL" CACHE STRING "")
+ endif()
+ # Grab all ucrt/vcruntime related DLLs into the binary installation folder.
+ set(CMAKE_INSTALL_UCRT_LIBRARIES ON CACHE BOOL "")
endif()
# Set up RPATH for the target runtime/builtin libraries.
@@ -127,6 +141,15 @@ set(BUILTINS_${TOOLCHAIN_TARGET_TRIPLE}_CMAKE_INSTALL_RPATH
set(BUILTINS_${TOOLCHAIN_TARGET_TRIPLE}_CMAKE_BUILD_WITH_INSTALL_RPATH ON CACHE BOOL "")
set(BUILTINS_${TOOLCHAIN_TARGET_TRIPLE}_LLVM_CMAKE_DIR "${LLVM_PROJECT_DIR}/llvm/cmake/modules" CACHE PATH "")
+if (DEFINED TOOLCHAIN_TARGET_COMPILER_FLAGS)
+ foreach(lang C;CXX;ASM)
+ set(BUILTINS_${TOOLCHAIN_TARGET_TRIPLE}_CMAKE_${lang}_FLAGS "${TOOLCHAIN_TARGET_COMPILER_FLAGS}" CACHE STRING "")
+ endforeach()
+endif()
+foreach(type SHARED;MODULE;EXE)
+ set(BUILTINS_${TOOLCHAIN_TARGET_TRIPLE}_CMAKE_${type}_LINKER_FLAGS "-fuse-ld=lld" CACHE STRING "")
+endforeach()
+
set(LLVM_RUNTIME_TARGETS "${TOOLCHAIN_TARGET_TRIPLE}" CACHE STRING "")
set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "")
@@ -137,6 +160,15 @@ set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_CMAKE_SYSROOT
set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_CMAKE_INSTALL_RPATH "${RUNTIMES_INSTALL_RPATH}" CACHE STRING "")
set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_CMAKE_BUILD_WITH_INSTALL_RPATH ON CACHE BOOL "")
+if (DEFINED TOOLCHAIN_TARGET_COMPILER_FLAGS)
+ foreach(lang C;CXX;ASM)
+ set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_CMAKE_${lang}_FLAGS "${TOOLCHAIN_TARGET_COMPILER_FLAGS}" CACHE STRING "")
+ endforeach()
+endif()
+foreach(type SHARED;MODULE;EXE)
+ set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_CMAKE_${type}_LINKER_FLAGS "-fuse-ld=lld" CACHE STRING "")
+endforeach()
+
set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_COMPILER_RT_BUILD_BUILTINS ON CACHE BOOL "")
set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_COMPILER_RT_BUILD_SANITIZERS OFF CACHE BOOL "")
set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_COMPILER_RT_BUILD_XRAY OFF CACHE BOOL "")
@@ -164,7 +196,7 @@ set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXXABI_ENABLE_SHARED
set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_USE_COMPILER_RT ON CACHE BOOL "")
set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_ENABLE_SHARED OFF CACHE BOOL "")
-set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_ABI_VERSION 2 CACHE STRING "")
+set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_ABI_VERSION ${LIBCXX_ABI_VERSION} CACHE STRING "")
set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_CXX_ABI "libcxxabi" CACHE STRING "") #!!!
set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_ENABLE_NEW_DELETE_DEFINITIONS ON CACHE BOOL "")
diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake b/clang/cmake/caches/Fuchsia-stage2.cmake
index d5546e2..66e7649 100644
--- a/clang/cmake/caches/Fuchsia-stage2.cmake
+++ b/clang/cmake/caches/Fuchsia-stage2.cmake
@@ -19,7 +19,6 @@ set(LLVM_ENABLE_LLD ON CACHE BOOL "")
set(LLVM_ENABLE_LTO ON CACHE BOOL "")
set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "")
set(LLVM_ENABLE_PLUGINS OFF CACHE BOOL "")
-set(LLVM_ENABLE_TERMINFO OFF CACHE BOOL "")
set(LLVM_ENABLE_UNWIND_TABLES OFF CACHE BOOL "")
set(LLVM_ENABLE_Z3_SOLVER OFF CACHE BOOL "")
set(LLVM_ENABLE_ZLIB ON CACHE BOOL "")
diff --git a/clang/cmake/caches/Fuchsia.cmake b/clang/cmake/caches/Fuchsia.cmake
index 30a3b91..4d3af3a 100644
--- a/clang/cmake/caches/Fuchsia.cmake
+++ b/clang/cmake/caches/Fuchsia.cmake
@@ -12,7 +12,6 @@ set(LLVM_ENABLE_DIA_SDK OFF CACHE BOOL "")
set(LLVM_ENABLE_LIBEDIT OFF CACHE BOOL "")
set(LLVM_ENABLE_LIBXML2 OFF CACHE BOOL "")
set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "")
-set(LLVM_ENABLE_TERMINFO OFF CACHE BOOL "")
set(LLVM_ENABLE_UNWIND_TABLES OFF CACHE BOOL "")
set(LLVM_ENABLE_Z3_SOLVER OFF CACHE BOOL "")
set(LLVM_ENABLE_ZLIB OFF CACHE BOOL "")
@@ -34,7 +33,6 @@ set(_FUCHSIA_BOOTSTRAP_PASSTHROUGH
LibXml2_ROOT
LLVM_ENABLE_CURL
LLVM_ENABLE_HTTPLIB
- LLVM_ENABLE_TERMINFO
LLVM_ENABLE_LIBEDIT
CURL_ROOT
OpenSSL_ROOT
@@ -48,11 +46,6 @@ set(_FUCHSIA_BOOTSTRAP_PASSTHROUGH
PANEL_LIBRARIES
# Deprecated
- Terminfo_ROOT
-
- Terminfo_LIBRARIES
-
- # Deprecated
LibEdit_ROOT
LibEdit_INCLUDE_DIRS
diff --git a/clang/cmake/caches/VectorEngine.cmake b/clang/cmake/caches/VectorEngine.cmake
index 2f968a2..b429fb0 100644
--- a/clang/cmake/caches/VectorEngine.cmake
+++ b/clang/cmake/caches/VectorEngine.cmake
@@ -13,9 +13,7 @@
# ninja
#
-# Disable TERMINFO, ZLIB, and ZSTD for VE since there is no pre-compiled
-# libraries.
-set(LLVM_ENABLE_TERMINFO OFF CACHE BOOL "")
+# Disable ZLIB and ZSTD for VE since there are no pre-compiled libraries.
set(LLVM_ENABLE_ZLIB OFF CACHE BOOL "")
set(LLVM_ENABLE_ZSTD OFF CACHE BOOL "")
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 81e9d04..d023f53 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -59,6 +59,18 @@ C++ Specific Potentially Breaking Changes
- Clang now performs semantic analysis for unary operators with dependent operands
that are known to be of non-class non-enumeration type prior to instantiation.
+ This change uncovered a bug in libstdc++ 14.1.0 which may cause compile failures
+ on systems using that version of libstdc++ and Clang 19, with an error that looks
+ something like this:
+
+ .. code-block:: text
+
+ <source>:4:5: error: expression is not assignable
+ 4 | ++this;
+ | ^ ~~~~
+
+ To fix this, update libstdc++ to version 14.1.1 or greater.
+
ABI Changes in This Version
---------------------------
- Fixed Microsoft name mangling of implicitly defined variables used for thread
@@ -155,6 +167,11 @@ C++17 Feature Support
files because they may not be stable across multiple TUs (the values may vary
based on compiler version as well as CPU tuning). #GH60174
+C++14 Feature Support
+^^^^^^^^^^^^^^^^^^^^^
+- Sized deallocation is enabled by default in C++14 onwards. The user may specify
+  ``-fno-sized-deallocation`` to disable it if it causes regressions.
+
C++20 Feature Support
^^^^^^^^^^^^^^^^^^^^^
@@ -317,13 +334,18 @@ New Compiler Flags
- ``-fexperimental-late-parse-attributes`` enables an experimental feature to
allow late parsing certain attributes in specific contexts where they would
- not normally be late parsed.
+  not normally be late parsed. Currently this allows late parsing of the
+  ``counted_by`` attribute in C. See `Attribute Changes in Clang`_.
- ``-fseparate-named-sections`` uses separate unique sections for global
symbols in named special sections (i.e. symbols annotated with
``__attribute__((section(...)))``. This enables linker GC to collect unused
symbols without having to use a per-symbol section.
+- ``-fms-define-stdc`` and its clang-cl counterpart ``/Zc:__STDC__``.
+ Matches MSVC behaviour by defining ``__STDC__`` to ``1`` when
+ MSVC compatibility mode is used. It has no effect for C++ code.
+
Deprecated Compiler Flags
-------------------------
@@ -406,6 +428,24 @@ Attribute Changes in Clang
- The ``clspv_libclc_builtin`` attribute has been added to allow clspv
(`OpenCL-C to Vulkan SPIR-V compiler <https://github.com/google/clspv>`_) to identify functions coming from libclc
(`OpenCL-C builtin library <https://libclc.llvm.org>`_).
+- The ``counted_by`` attribute is now allowed on pointers that are members of a
+ struct in C.
+
+- The ``counted_by`` attribute can now be late parsed in C when
+  ``-fexperimental-late-parse-attributes`` is passed, but only when the attribute
+  is used in the declaration attribute position. This allows using the attribute
+  on existing code where it was previously impossible to do so, because
+  re-ordering the struct field declarations would break the ABI, as shown below.
+
+ .. code-block:: c
+
+ struct BufferTy {
+    /* Referring to `count` requires late parsing */
+ char* buffer __counted_by(count);
+ /* Swapping `buffer` and `count` to avoid late parsing would break ABI */
+ size_t count;
+ };
+
Improvements to Clang's diagnostics
-----------------------------------
@@ -749,6 +789,11 @@ Bug Fixes to C++ Support
- Clang now correctly diagnoses when the current instantiation is used as an incomplete base class.
- Clang no longer treats ``constexpr`` class scope function template specializations of non-static members
as implicitly ``const`` in language modes after C++11.
+- Fixed a crash when trying to emit captures in a lambda call operator with an explicit object
+ parameter that is called on a derived type of the lambda.
+  Fixes (#GH87210), (#GH89541).
+- Clang no longer tries to check if an expression is immediate-escalating in an unevaluated context.
+ Fixes (#GH91308).
Bug Fixes to AST Handling
^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -761,12 +806,15 @@ Miscellaneous Bug Fixes
- Fixed an infinite recursion in ASTImporter, on return type declared inside
body of C++11 lambda without trailing return (#GH68775).
+- Fixed declaration name source location of instantiated function definitions (#GH71161).
+- Improved diagnostic output to print an expression instead of 'no argument' when comparing Values as template arguments.
Miscellaneous Clang Crashes Fixed
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- Do not attempt to dump the layout of dependent types or invalid declarations
when ``-fdump-record-layouts-complete`` is passed. Fixes #GH83684.
+- Fixed a crash caused by unhandled StructuralValues in the template differ (#GH93068).
OpenACC Specific Changes
------------------------
@@ -780,6 +828,8 @@ AMDGPU Support
X86 Support
^^^^^^^^^^^
+- Removed KNL/KNM-specific ISA support: AVX512PF, AVX512ER, PREFETCHWT1
+
Arm and AArch64 Support
^^^^^^^^^^^^^^^^^^^^^^^
@@ -832,6 +882,10 @@ Windows Support
including STL headers will no longer slow down compile times since ``intrin.h``
is not included from MSVC STL.
+- When the target triple is `*-windows-msvc`, strict aliasing is now disabled by
+  default to ensure compatibility with MSVC. Previously, strict aliasing was only
+  disabled if the driver mode was cl (see the illustrative example below).
+
LoongArch Support
^^^^^^^^^^^^^^^^^
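The Windows note above concerns type-punning patterns that are undefined under strict aliasing but widespread in MSVC-targeting code. An illustrative sketch of the kind of code affected (not part of the patch):

.. code-block:: c++

   #include <cstdint>
   #include <cstring>

   // Undefined behaviour under strict aliasing rules, but tolerated by MSVC and
   // now also by Clang's *-windows-msvc default (-fno-strict-aliasing).
   float bits_to_float(std::uint32_t u) {
     return *reinterpret_cast<float *>(&u);
   }

   // Portable alternative that is well defined in any aliasing mode.
   float bits_to_float_portable(std::uint32_t u) {
     float f;
     std::memcpy(&f, &u, sizeof f);
     return f;
   }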
diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst
index eb8b583..ac9f0b0 100644
--- a/clang/docs/analyzer/checkers.rst
+++ b/clang/docs/analyzer/checkers.rst
@@ -1179,6 +1179,47 @@ security.insecureAPI.DeprecatedOrUnsafeBufferHandling (C)
strncpy(buf, "a", 1); // warn
}
+security.SetgidSetuidOrder (C)
+""""""""""""""""""""""""""""""
+When dropping user-level and group-level privileges in a program by using
+``setuid`` and ``setgid`` calls, it is important to reset the group-level
+privileges (with ``setgid``) first. Function ``setgid`` will likely fail if
+the superuser privileges are already dropped.
+
+The checker checks for sequences of ``setuid(getuid())`` and
+``setgid(getgid())`` calls (in this order). If such a sequence is found and
+there is no other privilege-changing function call (``seteuid``, ``setreuid``,
+``setresuid`` and the GID versions of these) in between, a warning is
+generated. The checker matches only literal ``setuid(getuid())`` calls (and the
+GID versions); it does not warn if, for example, the result of ``getuid()`` is
+stored in a variable first.
+
+.. code-block:: c
+
+ void test1() {
+ // ...
+ // end of section with elevated privileges
+ // reset privileges (user and group) to normal user
+ if (setuid(getuid()) != 0) {
+ handle_error();
+ return;
+ }
+ if (setgid(getgid()) != 0) { // warning: A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail
+ handle_error();
+ return;
+ }
+ // user-ID and group-ID are reset to normal user now
+ // ...
+ }
+
+In the code above, the problem is that ``setuid(getuid())`` removes superuser
+privileges before ``setgid(getgid())`` is called. To fix the problem,
+``setgid(getgid())`` should be called first. Take care to avoid code like
+``setgid(getuid())`` (this checker does not detect such bugs), and always check
+the return value of these calls.
+
+This check corresponds to SEI CERT Rule `POS36-C <https://wiki.sei.cmu.edu/confluence/display/c/POS36-C.+Observe+correct+revocation+order+while+relinquishing+privileges>`_.
+
.. _unix-checkers:
unix
@@ -2792,6 +2833,31 @@ Warn on mmap() calls that are both writable and executable.
// code
}
+.. _alpha-security-putenv-stack-array:
+
+alpha.security.PutenvStackArray (C)
+"""""""""""""""""""""""""""""""""""
+Finds calls to the ``putenv`` function which pass a pointer to a stack-allocated
+(automatic) array as the argument. ``putenv`` does not copy the passed string;
+only a pointer to the data is stored, and this data can be read even by other
+threads. The contents of a stack-allocated array are likely to be overwritten
+after the enclosing function returns.
+
+The problem can be solved by using a static array variable or dynamically
+allocated memory. Better yet, avoid ``putenv`` altogether (it has other
+problems related to memory leaks) and use ``setenv`` instead.
+
+The check corresponds to CERT rule
+`POS34-C. Do not call putenv() with a pointer to an automatic variable as the argument
+<https://wiki.sei.cmu.edu/confluence/display/c/POS34-C.+Do+not+call+putenv%28%29+with+a+pointer+to+an+automatic+variable+as+the+argument>`_.
+
+.. code-block:: c
+
+ int f() {
+ char env[] = "NAME=value";
+ return putenv(env); // putenv function should not be called with stack-allocated string
+ }
+
.. _alpha-security-ReturnPtrRange:
alpha.security.ReturnPtrRange (C)
@@ -2818,55 +2884,6 @@ alpha.security.cert
SEI CERT checkers which tries to find errors based on their `C coding rules <https://wiki.sei.cmu.edu/confluence/display/c/2+Rules>`_.
-.. _alpha-security-cert-pos-checkers:
-
-alpha.security.cert.pos
-^^^^^^^^^^^^^^^^^^^^^^^
-
-SEI CERT checkers of `POSIX C coding rules <https://wiki.sei.cmu.edu/confluence/pages/viewpage.action?pageId=87152405>`_.
-
-.. _alpha-security-cert-pos-34c:
-
-alpha.security.cert.pos.34c
-"""""""""""""""""""""""""""
-Finds calls to the ``putenv`` function which pass a pointer to an automatic variable as the argument.
-
-.. code-block:: c
-
- int func(const char *var) {
- char env[1024];
- int retval = snprintf(env, sizeof(env),"TEST=%s", var);
- if (retval < 0 || (size_t)retval >= sizeof(env)) {
- /* Handle error */
- }
-
- return putenv(env); // putenv function should not be called with auto variables
- }
-
-Limitations:
-
- - Technically, one can pass automatic variables to ``putenv``,
- but one needs to ensure that the given environment key stays
- alive until it's removed or overwritten.
- Since the analyzer cannot keep track of which envvars get overwritten
- and when, it needs to be slightly more aggressive and warn for such
- cases too, leading in some cases to false-positive reports like this:
-
- .. code-block:: c
-
- void baz() {
- char env[] = "NAME=value";
- putenv(env); // false-positive warning: putenv function should not be called...
- // More code...
- putenv((char *)"NAME=anothervalue");
- // This putenv call overwrites the previous entry, thus that can no longer dangle.
- } // 'env' array becomes dead only here.
-
-alpha.security.cert.env
-^^^^^^^^^^^^^^^^^^^^^^^
-
-SEI CERT checkers of `Environment C coding rules <https://wiki.sei.cmu.edu/confluence/x/JdcxBQ>`_.
-
alpha.security.taint
^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 2ce2b810..a1d1d1c 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -110,6 +110,9 @@ class VarTemplateDecl;
class VTableContextBase;
class XRayFunctionFilter;
+/// A simple array of base specifiers.
+typedef SmallVector<CXXBaseSpecifier *, 4> CXXCastPath;
+
namespace Builtin {
class Context;
@@ -1170,6 +1173,12 @@ public:
/// in device compilation.
llvm::DenseSet<const FunctionDecl *> CUDAImplicitHostDeviceFunUsedByDevice;
+ /// For capturing lambdas with an explicit object parameter whose type is
+ /// derived from the lambda type, we need to perform derived-to-base
+ /// conversion so we can access the captures; the cast paths for that
+ /// are stored here.
+ llvm::DenseMap<const CXXMethodDecl *, CXXCastPath> LambdaCastPaths;
+
ASTContext(LangOptions &LOpts, SourceManager &SM, IdentifierTable &idents,
SelectorTable &sels, Builtin::Context &builtins,
TranslationUnitKind TUKind);
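The new ``LambdaCastPaths`` map supports code like the following C++23 sketch (illustrative only, not taken from the patch): a capturing lambda whose explicit object parameter is deduced to a type derived from the closure type, so reading the capture requires a derived-to-base conversion.

.. code-block:: c++

   // Compile with -std=c++23 (deducing this).
   #include <cstdio>

   int main() {
     int captured = 42;

     // `Derived` below inherits from the closure type, and the call operator's
     // explicit object parameter deduces to `Derived`, so accessing `captured`
     // needs a derived-to-base cast (the path the map above stores).
     auto base = [captured](this auto &&self) { return captured; };

     struct Derived : decltype(base) {};
     Derived d{base};

     std::printf("%d\n", d()); // prints 42
   }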
diff --git a/clang/include/clang/AST/ASTNodeTraverser.h b/clang/include/clang/AST/ASTNodeTraverser.h
index bf7c204..616f926 100644
--- a/clang/include/clang/AST/ASTNodeTraverser.h
+++ b/clang/include/clang/AST/ASTNodeTraverser.h
@@ -695,7 +695,7 @@ public:
if (const auto *TC = D->getTypeConstraint())
Visit(TC->getImmediatelyDeclaredConstraint());
if (D->hasDefaultArgument())
- Visit(D->getDefaultArgument(), SourceRange(),
+ Visit(D->getDefaultArgument().getArgument(), SourceRange(),
D->getDefaultArgStorage().getInheritedFrom(),
D->defaultArgumentWasInherited() ? "inherited from" : "previous");
}
@@ -704,9 +704,9 @@ public:
if (const auto *E = D->getPlaceholderTypeConstraint())
Visit(E);
if (D->hasDefaultArgument())
- Visit(D->getDefaultArgument(), SourceRange(),
- D->getDefaultArgStorage().getInheritedFrom(),
- D->defaultArgumentWasInherited() ? "inherited from" : "previous");
+ dumpTemplateArgumentLoc(
+ D->getDefaultArgument(), D->getDefaultArgStorage().getInheritedFrom(),
+ D->defaultArgumentWasInherited() ? "inherited from" : "previous");
}
void VisitTemplateTemplateParmDecl(const TemplateTemplateParmDecl *D) {
diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h
index 5e485cc..7fd80b9 100644
--- a/clang/include/clang/AST/Decl.h
+++ b/clang/include/clang/AST/Decl.h
@@ -2188,6 +2188,8 @@ public:
void setRangeEnd(SourceLocation E) { EndRangeLoc = E; }
+ void setDeclarationNameLoc(DeclarationNameLoc L) { DNLoc = L; }
+
/// Returns the location of the ellipsis of a variadic function.
SourceLocation getEllipsisLoc() const {
const auto *FPT = getType()->getAs<FunctionProtoType>();
diff --git a/clang/include/clang/AST/DeclTemplate.h b/clang/include/clang/AST/DeclTemplate.h
index f3d6a32..5b6a6b4 100644
--- a/clang/include/clang/AST/DeclTemplate.h
+++ b/clang/include/clang/AST/DeclTemplate.h
@@ -1185,7 +1185,7 @@ class TemplateTypeParmDecl final : public TypeDecl,
/// The default template argument, if any.
using DefArgStorage =
- DefaultArgStorage<TemplateTypeParmDecl, TypeSourceInfo *>;
+ DefaultArgStorage<TemplateTypeParmDecl, TemplateArgumentLoc *>;
DefArgStorage DefaultArgument;
TemplateTypeParmDecl(DeclContext *DC, SourceLocation KeyLoc,
@@ -1225,13 +1225,9 @@ public:
bool hasDefaultArgument() const { return DefaultArgument.isSet(); }
/// Retrieve the default argument, if any.
- QualType getDefaultArgument() const {
- return DefaultArgument.get()->getType();
- }
-
- /// Retrieves the default argument's source information, if any.
- TypeSourceInfo *getDefaultArgumentInfo() const {
- return DefaultArgument.get();
+ const TemplateArgumentLoc &getDefaultArgument() const {
+ static const TemplateArgumentLoc NoneLoc;
+ return DefaultArgument.isSet() ? *DefaultArgument.get() : NoneLoc;
}
/// Retrieves the location of the default argument declaration.
@@ -1244,9 +1240,8 @@ public:
}
/// Set the default argument for this template parameter.
- void setDefaultArgument(TypeSourceInfo *DefArg) {
- DefaultArgument.set(DefArg);
- }
+ void setDefaultArgument(const ASTContext &C,
+ const TemplateArgumentLoc &DefArg);
/// Set that this default argument was inherited from another
/// parameter.
@@ -1365,7 +1360,8 @@ class NonTypeTemplateParmDecl final
/// The default template argument, if any, and whether or not
/// it was inherited.
- using DefArgStorage = DefaultArgStorage<NonTypeTemplateParmDecl, Expr *>;
+ using DefArgStorage =
+ DefaultArgStorage<NonTypeTemplateParmDecl, TemplateArgumentLoc *>;
DefArgStorage DefaultArgument;
// FIXME: Collapse this into TemplateParamPosition; or, just move depth/index
@@ -1435,7 +1431,10 @@ public:
bool hasDefaultArgument() const { return DefaultArgument.isSet(); }
/// Retrieve the default argument, if any.
- Expr *getDefaultArgument() const { return DefaultArgument.get(); }
+ const TemplateArgumentLoc &getDefaultArgument() const {
+ static const TemplateArgumentLoc NoneLoc;
+ return DefaultArgument.isSet() ? *DefaultArgument.get() : NoneLoc;
+ }
/// Retrieve the location of the default argument, if any.
SourceLocation getDefaultArgumentLoc() const;
@@ -1449,7 +1448,8 @@ public:
/// Set the default argument for this template parameter, and
/// whether that default argument was inherited from another
/// declaration.
- void setDefaultArgument(Expr *DefArg) { DefaultArgument.set(DefArg); }
+ void setDefaultArgument(const ASTContext &C,
+ const TemplateArgumentLoc &DefArg);
void setInheritedDefaultArgument(const ASTContext &C,
NonTypeTemplateParmDecl *Parm) {
DefaultArgument.setInherited(C, Parm);
diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
index f5cefed..4bbb438 100644
--- a/clang/include/clang/AST/RecursiveASTVisitor.h
+++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -30,6 +30,7 @@
#include "clang/AST/ExprOpenMP.h"
#include "clang/AST/LambdaCapture.h"
#include "clang/AST/NestedNameSpecifier.h"
+#include "clang/AST/OpenACCClause.h"
#include "clang/AST/OpenMPClause.h"
#include "clang/AST/Stmt.h"
#include "clang/AST/StmtCXX.h"
@@ -510,6 +511,7 @@ private:
bool
TraverseOpenACCAssociatedStmtConstruct(OpenACCAssociatedStmtConstruct *S);
bool VisitOpenACCClauseList(ArrayRef<const OpenACCClause *>);
+ bool VisitOpenACCClause(const OpenACCClause *);
};
template <typename Derived>
@@ -1960,7 +1962,7 @@ DEF_TRAVERSE_DECL(TemplateTypeParmDecl, {
TRY_TO(TraverseType(QualType(D->getTypeForDecl(), 0)));
TRY_TO(TraverseTemplateTypeParamDeclConstraints(D));
if (D->hasDefaultArgument() && !D->defaultArgumentWasInherited())
- TRY_TO(TraverseTypeLoc(D->getDefaultArgumentInfo()->getTypeLoc()));
+ TRY_TO(TraverseTemplateArgumentLoc(D->getDefaultArgument()));
})
DEF_TRAVERSE_DECL(TypedefDecl, {
@@ -2320,7 +2322,7 @@ DEF_TRAVERSE_DECL(NonTypeTemplateParmDecl, {
// A non-type template parameter, e.g. "S" in template<int S> class Foo ...
TRY_TO(TraverseDeclaratorHelper(D));
if (D->hasDefaultArgument() && !D->defaultArgumentWasInherited())
- TRY_TO(TraverseStmt(D->getDefaultArgument()));
+ TRY_TO(TraverseTemplateArgumentLoc(D->getDefaultArgument()));
})
DEF_TRAVERSE_DECL(ParmVarDecl, {
@@ -3968,8 +3970,25 @@ bool RecursiveASTVisitor<Derived>::TraverseOpenACCAssociatedStmtConstruct(
}
template <typename Derived>
+bool RecursiveASTVisitor<Derived>::VisitOpenACCClause(const OpenACCClause *C) {
+ for (const Stmt *Child : C->children())
+ TRY_TO(TraverseStmt(const_cast<Stmt *>(Child)));
+ return true;
+}
+
+template <typename Derived>
bool RecursiveASTVisitor<Derived>::VisitOpenACCClauseList(
- ArrayRef<const OpenACCClause *>) {
+ ArrayRef<const OpenACCClause *> Clauses) {
+
+ for (const auto *C : Clauses)
+ TRY_TO(VisitOpenACCClause(C));
+// if (const auto *WithCond = dyn_cast<OpenACCClauseWithCondition>(C);
+//     WithCond && WithCond->hasConditionExpr()) {
+//   TRY_TO(TraverseStmt(WithCond->getConditionExpr()));
+// } else if (const auto *
+// }
+// OpenACCClauseWithCondition::getConditionExpr/hasConditionExpr
+// OpenACCClauseWithExprs::children (might be null?)
// TODO OpenACC: When we have Clauses with expressions, we should visit them
// here.
return true;
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index da3834f..263b632df 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -2515,6 +2515,7 @@ public:
bool isRecordType() const;
bool isClassType() const;
bool isStructureType() const;
+ bool isStructureTypeWithFlexibleArrayMember() const;
bool isObjCBoxableRecordType() const;
bool isInterfaceType() const;
bool isStructureOrClassType() const;
@@ -2523,6 +2524,7 @@ public:
bool isVectorType() const; // GCC vector type.
bool isExtVectorType() const; // Extended vector type.
bool isExtVectorBoolType() const; // Extended vector type with bool element.
+ bool isSubscriptableVectorType() const;
bool isMatrixType() const; // Matrix type.
bool isConstantMatrixType() const; // Constant matrix type.
bool isDependentAddressSpaceType() const; // value-dependent address space qualifier
@@ -7729,6 +7731,10 @@ inline bool Type::isExtVectorBoolType() const {
return cast<ExtVectorType>(CanonicalType)->getElementType()->isBooleanType();
}
+inline bool Type::isSubscriptableVectorType() const {
+ return isVectorType() || isSveVLSBuiltinType();
+}
+
inline bool Type::isMatrixType() const {
return isa<MatrixType>(CanonicalType);
}
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 7008bea..e59cccc 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -1640,10 +1640,11 @@ def Unlikely : StmtAttr {
def : MutualExclusions<[Likely, Unlikely]>;
def CXXAssume : StmtAttr {
- let Spellings = [CXX11<"", "assume", 202207>];
+ let Spellings = [CXX11<"", "assume", 202207>, Clang<"assume">];
let Subjects = SubjectList<[NullStmt], ErrorDiag, "empty statements">;
let Args = [ExprArgument<"Assumption">];
let Documentation = [CXXAssumeDocs];
+ let HasCustomParsing = 1;
}
def NoMerge : DeclOrStmtAttr {
@@ -2256,7 +2257,8 @@ def TypeNullUnspecified : TypeAttr {
def CountedBy : DeclOrTypeAttr {
let Spellings = [Clang<"counted_by">];
let Subjects = SubjectList<[Field], ErrorDiag>;
- let Args = [ExprArgument<"Count">, IntArgument<"NestedLevel">];
+ let Args = [ExprArgument<"Count">, IntArgument<"NestedLevel", 1>];
+ let LateParsed = LateAttrParseExperimentalExt;
let ParseArgumentsAsUnevaluated = 1;
let Documentation = [CountedByDocs];
let LangOpts = [COnly];
@@ -4255,7 +4257,7 @@ def OMPDeclareVariant : InheritableAttr {
}
def OMPAssume : InheritableAttr {
- let Spellings = [Clang<"assume">, CXX11<"omp", "assume">];
+ let Spellings = [CXX11<"omp", "assume">];
let Subjects = SubjectList<[Function, ObjCMethod]>;
let InheritEvenIfAlreadyPresent = 1;
let Documentation = [OMPAssumeDocs];
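With the spelling changes above, ``[[clang::assume(expr)]]`` now names the C++-style assumption attribute on an empty statement, while the OpenMP assumption attribute keeps its ``[[omp::assume("...")]]`` spelling. A hedged illustration (not from the patch):

.. code-block:: c++

   int divide(int x) {
     // The standard C++23 spelling and the Clang-namespaced spelling now refer
     // to the same statement attribute; both tell the optimizer that x is
     // nonzero at this point.
     [[assume(x != 0)]];
     [[clang::assume(x != 0)]];
     return 100 / x;
   }

   // The OpenMP assumption attribute is now spelled only as
   // [[omp::assume("assumption")]] on a function declaration.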
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 54197d5..a313e81 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -2027,9 +2027,6 @@ Different optimisers are likely to react differently to the presence of
this attribute; in some cases, adding ``assume`` may affect performance
negatively. It should be used with parsimony and care.
-Note that `clang::assume` is a different attribute. Always write ``assume``
-without a namespace if you intend to use the standard C++ attribute.
-
Example:
.. code-block:: c++
@@ -4740,7 +4737,7 @@ def OMPAssumeDocs : Documentation {
let Category = DocCatFunction;
let Heading = "assume";
let Content = [{
-Clang supports the ``__attribute__((assume("assumption")))`` attribute to
+Clang supports the ``[[omp::assume("assumption")]]`` attribute to
provide additional information to the optimizer. The string-literal, here
"assumption", will be attached to the function declaration such that later
analysis and optimization passes can assume the "assumption" to hold.
@@ -4752,7 +4749,7 @@ A function can have multiple assume attributes and they propagate from prior
declarations to later definitions. Multiple assumptions are aggregated into a
single comma separated string. Thus, one can provide multiple assumptions via
a comma separated string, i.a.,
-``__attribute__((assume("assumption1,assumption2")))``.
+``[[omp::assume("assumption1,assumption2")]]``.
While LLVM plugins might provide more assumption strings, the default LLVM
optimization passes are aware of the following assumptions:
diff --git a/clang/include/clang/Basic/BuiltinsAArch64.def b/clang/include/clang/Basic/BuiltinsAArch64.def
index cf8711c..5f53c98 100644
--- a/clang/include/clang/Basic/BuiltinsAArch64.def
+++ b/clang/include/clang/Basic/BuiltinsAArch64.def
@@ -290,7 +290,7 @@ TARGET_HEADER_BUILTIN(_CountLeadingZeros64, "UiULLi", "nh", INTRIN_H, ALL_MS_LAN
TARGET_HEADER_BUILTIN(_CountOneBits, "UiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
TARGET_HEADER_BUILTIN(_CountOneBits64, "UiULLi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__prefetch, "vv*", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
+TARGET_HEADER_BUILTIN(__prefetch, "vvC*", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
#undef BUILTIN
#undef LANGBUILTIN
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 3e21a2f..efa652e 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -240,6 +240,7 @@ TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", "at
TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", "atomic-global-pk-add-bf16-inst")
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "atomic-ds-pk-add-16-insts")
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2f16, "V2hV2h*3V2h", "t", "atomic-ds-pk-add-16-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_lds, "vv*1v*3UiiUi", "t", "gfx940-insts")
//===----------------------------------------------------------------------===//
// Deep learning builtins.
diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def
index 8645cff..fd8c1b4 100644
--- a/clang/include/clang/Basic/BuiltinsWebAssembly.def
+++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def
@@ -193,6 +193,8 @@ TARGET_BUILTIN(__builtin_wasm_relaxed_dot_bf16x8_add_f32_f32x4, "V4fV8UsV8UsV4f"
// Half-Precision (fp16)
TARGET_BUILTIN(__builtin_wasm_loadf16_f32, "fh*", "nU", "half-precision")
TARGET_BUILTIN(__builtin_wasm_storef16_f32, "vfh*", "n", "half-precision")
+TARGET_BUILTIN(__builtin_wasm_splat_f16x8, "V8hf", "nc", "half-precision")
+TARGET_BUILTIN(__builtin_wasm_extract_lane_f16x8, "fV8hi", "nc", "half-precision")
// Reference Types builtins
// Some builtins are custom type-checked - see 't' as part of the third argument,
diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index eafcc21..7074479 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -832,23 +832,11 @@ TARGET_BUILTIN(__builtin_ia32_rsqrt14ss_mask, "V4fV4fV4fV4fUc", "ncV:128:", "avx
TARGET_BUILTIN(__builtin_ia32_rsqrt14pd512_mask, "V8dV8dV8dUc", "ncV:512:", "avx512f,evex512")
TARGET_BUILTIN(__builtin_ia32_rsqrt14ps512_mask, "V16fV16fV16fUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_rsqrt28sd_round_mask, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512er")
-TARGET_BUILTIN(__builtin_ia32_rsqrt28ss_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512er")
-TARGET_BUILTIN(__builtin_ia32_rsqrt28pd_mask, "V8dV8dV8dUcIi", "ncV:512:", "avx512er,evex512")
-TARGET_BUILTIN(__builtin_ia32_rsqrt28ps_mask, "V16fV16fV16fUsIi", "ncV:512:", "avx512er,evex512")
-
TARGET_BUILTIN(__builtin_ia32_rcp14sd_mask, "V2dV2dV2dV2dUc", "ncV:128:", "avx512f")
TARGET_BUILTIN(__builtin_ia32_rcp14ss_mask, "V4fV4fV4fV4fUc", "ncV:128:", "avx512f")
TARGET_BUILTIN(__builtin_ia32_rcp14pd512_mask, "V8dV8dV8dUc", "ncV:512:", "avx512f,evex512")
TARGET_BUILTIN(__builtin_ia32_rcp14ps512_mask, "V16fV16fV16fUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_rcp28sd_round_mask, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512er")
-TARGET_BUILTIN(__builtin_ia32_rcp28ss_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512er")
-TARGET_BUILTIN(__builtin_ia32_rcp28pd_mask, "V8dV8dV8dUcIi", "ncV:512:", "avx512er,evex512")
-TARGET_BUILTIN(__builtin_ia32_rcp28ps_mask, "V16fV16fV16fUsIi", "ncV:512:", "avx512er,evex512")
-TARGET_BUILTIN(__builtin_ia32_exp2pd_mask, "V8dV8dV8dUcIi", "ncV:512:", "avx512er,evex512")
-TARGET_BUILTIN(__builtin_ia32_exp2ps_mask, "V16fV16fV16fUsIi", "ncV:512:", "avx512er,evex512")
-
TARGET_BUILTIN(__builtin_ia32_cvttps2dq512_mask, "V16iV16fV16iUsIi", "ncV:512:", "avx512f,evex512")
TARGET_BUILTIN(__builtin_ia32_cvttps2udq512_mask, "V16iV16fV16iUsIi", "ncV:512:", "avx512f,evex512")
TARGET_BUILTIN(__builtin_ia32_cvttpd2dq512_mask, "V8iV8dV8iUcIi", "ncV:512:", "avx512f,evex512")
@@ -960,15 +948,6 @@ TARGET_BUILTIN(__builtin_ia32_scattersiv16si, "vv*UsV16iV16iIi", "nV:512:", "avx
TARGET_BUILTIN(__builtin_ia32_scatterdiv8di, "vv*UcV8OiV8OiIi", "nV:512:", "avx512f,evex512")
TARGET_BUILTIN(__builtin_ia32_scatterdiv16si, "vv*UcV8OiV8iIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_gatherpfdpd, "vUcV8ivC*IiIi", "nV:512:", "avx512pf,evex512")
-TARGET_BUILTIN(__builtin_ia32_gatherpfdps, "vUsV16ivC*IiIi", "nV:512:", "avx512pf,evex512")
-TARGET_BUILTIN(__builtin_ia32_gatherpfqpd, "vUcV8OivC*IiIi", "nV:512:", "avx512pf,evex512")
-TARGET_BUILTIN(__builtin_ia32_gatherpfqps, "vUcV8OivC*IiIi", "nV:512:", "avx512pf,evex512")
-TARGET_BUILTIN(__builtin_ia32_scatterpfdpd, "vUcV8iv*IiIi", "nV:512:", "avx512pf,evex512")
-TARGET_BUILTIN(__builtin_ia32_scatterpfdps, "vUsV16iv*IiIi", "nV:512:", "avx512pf,evex512")
-TARGET_BUILTIN(__builtin_ia32_scatterpfqpd, "vUcV8Oiv*IiIi", "nV:512:", "avx512pf,evex512")
-TARGET_BUILTIN(__builtin_ia32_scatterpfqps, "vUcV8Oiv*IiIi", "nV:512:", "avx512pf,evex512")
-
TARGET_BUILTIN(__builtin_ia32_knotqi, "UcUc", "nc", "avx512dq")
TARGET_BUILTIN(__builtin_ia32_knothi, "UsUs", "nc", "avx512f")
TARGET_BUILTIN(__builtin_ia32_knotsi, "UiUi", "nc", "avx512bw")
diff --git a/clang/include/clang/Basic/DiagnosticCommonKinds.td b/clang/include/clang/Basic/DiagnosticCommonKinds.td
index 0738f43..1e44bc4 100644
--- a/clang/include/clang/Basic/DiagnosticCommonKinds.td
+++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td
@@ -361,9 +361,6 @@ def warn_invalid_feature_combination : Warning<
def warn_target_unrecognized_env : Warning<
"mismatch between architecture and environment in target triple '%0'; did you mean '%1'?">,
InGroup<InvalidCommandLineArgument>;
-def warn_knl_knm_isa_support_removed : Warning<
- "KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.">,
- InGroup<DiagGroup<"knl-knm-isa-support-removed">>;
def err_target_unsupported_abi_with_fpu : Error<
"'%0' ABI is not supported with FPU">;
diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index 9d97a75..773b234 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -58,7 +58,7 @@ def warn_drv_avr_stdlib_not_linked: Warning<
def err_drv_cuda_bad_gpu_arch : Error<"unsupported CUDA gpu architecture: %0">;
def err_drv_offload_bad_gpu_arch : Error<"unsupported %0 gpu architecture: %1">;
def err_drv_offload_missing_gpu_arch : Error<
- "Must pass in an explicit %0 gpu architecture to '%1'">;
+ "must pass in an explicit %0 gpu architecture to '%1'">;
def err_drv_no_cuda_installation : Error<
"cannot find CUDA installation; provide its path via '--cuda-path', or pass "
"'-nocudainc' to build without CUDA includes">;
@@ -90,8 +90,8 @@ def err_drv_no_hipspv_device_lib : Error<
"'--hip-path' or '--hip-device-lib-path', or pass '-nogpulib' to build "
"without HIP device library">;
def err_drv_hipspv_no_hip_path : Error<
- "'--hip-path' must be specified when offloading to "
- "SPIR-V%select{| unless %1 is given}0.">;
+ "'--hip-path' must be specified when offloading to SPIR-V unless '-nogpuinc' "
+ "is given">;
// TODO: Remove when COV6 is fully supported by ROCm.
def warn_drv_amdgpu_cov6: Warning<
@@ -137,13 +137,13 @@ def warn_drv_unsupported_option_for_flang : Warning<
"the argument '%0' is not supported for option '%1'. Mapping to '%1%2'">,
InGroup<OptionIgnored>;
def warn_drv_unsupported_diag_option_for_flang : Warning<
- "The warning option '-%0' is not supported">,
+ "the warning option '-%0' is not supported">,
InGroup<OptionIgnored>;
def warn_drv_unsupported_option_for_processor : Warning<
"ignoring '%0' option as it is not currently supported for processor '%1'">,
InGroup<OptionIgnored>;
def warn_drv_unsupported_openmp_library : Warning<
- "The library '%0=%1' is not supported, openmp is not be enabled">,
+ "the library '%0=%1' is not supported, OpenMP will not be enabled">,
InGroup<OptionIgnored>;
def err_drv_invalid_thread_model_for_target : Error<
@@ -356,7 +356,7 @@ def err_drv_expecting_fopenmp_with_fopenmp_targets : Error<
"compatible with offloading; e.g., '-fopenmp=libomp' or '-fopenmp=libiomp5'">;
def err_drv_failed_to_deduce_target_from_arch : Error<
"failed to deduce triple for target architecture '%0'; specify the triple "
- "using '-fopenmp-targets' and '-Xopenmp-target' instead.">;
+ "using '-fopenmp-targets' and '-Xopenmp-target' instead">;
def err_drv_omp_offload_target_missingbcruntime : Error<
"no library '%0' found in the default clang lib directory or in LIBRARY_PATH"
"; use '--libomptarget-%1-bc-path' to specify %1 bitcode library">;
@@ -515,14 +515,6 @@ def err_analyzer_checker_incompatible_analyzer_option : Error<
def err_analyzer_not_built_with_z3 : Error<
"analyzer constraint manager 'z3' is only available if LLVM was built with "
"-DLLVM_ENABLE_Z3_SOLVER=ON">;
-def warn_analyzer_deprecated_option : Warning<
- "analyzer option '%0' is deprecated. This flag will be removed in %1, and "
- "passing this option will be an error.">,
- InGroup<DeprecatedStaticAnalyzerFlag>;
-def warn_analyzer_deprecated_option_with_alternative : Warning<
- "analyzer option '%0' is deprecated. This flag will be removed in %1, and "
- "passing this option will be an error. Use '%2' instead.">,
- InGroup<DeprecatedStaticAnalyzerFlag>;
def warn_drv_needs_hvx : Warning<
"%0 requires HVX, use -mhvx/-mhvx= to enable it">,
@@ -555,10 +547,12 @@ def err_drv_extract_api_wrong_kind : Error<
"in api extraction; use '-x %2' to override">;
def err_drv_missing_symbol_graph_dir: Error<
- "Must provide a symbol graph output directory using --symbol-graph-dir=<directory>">;
+ "must provide a symbol graph output directory using "
+ "'--symbol-graph-dir=<directory>'">;
def err_drv_unexpected_symbol_graph_output : Error<
- "Unexpected output symbol graph '%1'; please provide --symbol-graph-dir=<directory> instead">;
+ "unexpected output symbol graph '%1'; please provide "
+ "'--symbol-graph-dir=<directory>' instead">;
def warn_slash_u_filename : Warning<"'/U%0' treated as the '/U' option">,
InGroup<DiagGroup<"slash-u-filename">>;
@@ -599,9 +593,6 @@ def warn_drv_unsupported_gpopt : Warning<
"ignoring '-mgpopt' option as it cannot be used with %select{|the implicit"
" usage of }0-mabicalls">,
InGroup<UnsupportedGPOpt>;
-def warn_drv_unsupported_tocdata: Warning<
- "ignoring '-mtocdata' as it is only supported for -mcmodel=small">,
- InGroup<OptionIgnored>;
def warn_drv_unsupported_sdata : Warning<
"ignoring '-msmall-data-limit=' with -mcmodel=large for -fpic or RV64">,
InGroup<OptionIgnored>;
@@ -770,19 +761,19 @@ def err_drv_hlsl_16bit_types_unsupported: Error<
"'%0' option requires target HLSL Version >= 2018%select{| and shader model >= 6.2}1, but HLSL Version is '%2'%select{| and shader model is '%3'}1">;
def err_drv_hlsl_bad_shader_unsupported : Error<
"%select{shader model|Vulkan environment|shader stage}0 '%1' in target '%2' is invalid for HLSL code generation">;
-def warn_drv_dxc_missing_dxv : Warning<"dxv not found. "
- "Resulting DXIL will not be validated or signed for use in release environments.">,
- InGroup<DXILValidation>;
+def warn_drv_dxc_missing_dxv : Warning<
+ "dxv not found; resulting DXIL will not be validated or signed for use in "
+ "release environment">, InGroup<DXILValidation>;
def err_drv_invalid_range_dxil_validator_version : Error<
- "invalid validator version : %0\n"
- "Validator version must be less than or equal to current internal version.">;
+ "invalid validator version : %0; validator version must be less than or "
+ "equal to current internal version">;
def err_drv_invalid_format_dxil_validator_version : Error<
- "invalid validator version : %0\n"
- "Format of validator version is \"<major>.<minor>\" (ex:\"1.4\").">;
+ "invalid validator version : %0; format of validator version is "
+ "\"<major>.<minor>\" (ex:\"1.4\")">;
def err_drv_invalid_empty_dxil_validator_version : Error<
- "invalid validator version : %0\n"
- "If validator major version is 0, minor version must also be 0.">;
+ "invalid validator version : %0; if validator major version is 0, minor "
+ "version must also be 0">;
def warn_drv_sarif_format_unstable : Warning<
"diagnostic formatting in SARIF mode is currently unstable">,
@@ -796,12 +787,10 @@ def warn_drv_loongarch_conflicting_implied_val : Warning<
InGroup<OptionIgnored>;
def err_drv_loongarch_invalid_mfpu_EQ : Error<
"invalid argument '%0' to -mfpu=; must be one of: 64, 32, none, 0 (alias for none)">;
-def err_drv_loongarch_wrong_fpu_width_for_lsx : Error<
- "wrong fpu width; LSX depends on 64-bit FPU.">;
-def err_drv_loongarch_wrong_fpu_width_for_lasx : Error<
- "wrong fpu width; LASX depends on 64-bit FPU.">;
+def err_drv_loongarch_wrong_fpu_width : Error<
+ "wrong fpu width; %select{LSX|LASX}0 depends on 64-bit FPU">;
def err_drv_loongarch_invalid_simd_option_combination : Error<
- "invalid option combination; LASX depends on LSX.">;
+ "invalid option combination; LASX depends on LSX">;
def err_drv_expand_response_file : Error<
"failed to expand response file: %0">;
@@ -813,9 +802,9 @@ def note_drv_available_multilibs : Note<
"available multilibs are:%0">;
def warn_android_unversioned_fallback : Warning<
- "Using unversioned Android target directory %0 for target %1. Unversioned"
- " directories will not be used in Clang 19. Provide a versioned directory"
- " for the target version or lower instead.">,
+ "using unversioned Android target directory %0 for target %1; unversioned "
+ "directories will not be used in Clang 19 -- provide a versioned directory "
+ "for the target version or lower instead">,
InGroup<DiagGroup<"android-unversioned-fallback">>;
def err_drv_triple_version_invalid : Error<
diff --git a/clang/include/clang/Basic/DiagnosticFrontendKinds.td b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
index e456ec2..85c32e5 100644
--- a/clang/include/clang/Basic/DiagnosticFrontendKinds.td
+++ b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
@@ -71,14 +71,14 @@ def remark_fe_backend_optimization_remark_analysis : Remark<"%0">, BackendInfo,
InGroup<BackendOptimizationRemarkAnalysis>;
def remark_fe_backend_optimization_remark_analysis_fpcommute : Remark<"%0; "
"allow reordering by specifying '#pragma clang loop vectorize(enable)' "
- "before the loop or by providing the compiler option '-ffast-math'.">,
+ "before the loop or by providing the compiler option '-ffast-math'">,
BackendInfo, InGroup<BackendOptimizationRemarkAnalysis>;
def remark_fe_backend_optimization_remark_analysis_aliasing : Remark<"%0; "
"allow reordering by specifying '#pragma clang loop vectorize(enable)' "
- "before the loop. If the arrays will always be independent specify "
+ "before the loop; if the arrays will always be independent, specify "
"'#pragma clang loop vectorize(assume_safety)' before the loop or provide "
- "the '__restrict__' qualifier with the independent array arguments. "
- "Erroneous results will occur if these options are incorrectly applied!">,
+ "the '__restrict__' qualifier with the independent array arguments -- "
+ "erroneous results will occur if these options are incorrectly applied">,
BackendInfo, InGroup<BackendOptimizationRemarkAnalysis>;
def warn_fe_backend_optimization_failure : Warning<"%0">, BackendInfo,
@@ -152,8 +152,8 @@ def warn_fe_serialized_diag_merge_failure : Warning<
def warn_fe_serialized_diag_failure : Warning<
"unable to open file %0 for serializing diagnostics (%1)">,
InGroup<SerializedDiagnostics>;
-def warn_fe_serialized_diag_failure_during_finalisation : Warning<
- "Received warning after diagnostic serialization teardown was underway: %0">,
+def warn_fe_serialized_diag_failure_during_finalization : Warning<
+ "received warning after diagnostic serialization teardown was underway: %0">,
InGroup<SerializedDiagnostics>;
def err_verify_missing_line : Error<
@@ -337,7 +337,7 @@ def warn_atomic_op_oversized : Warning<
InGroup<AtomicAlignment>;
def warn_sync_op_misaligned : Warning<
- "__sync builtin operation MUST have natural alignment (consider using __atomic).">,
+ "__sync builtin operation must have natural alignment (consider using __atomic)">,
InGroup<SyncAlignment>;
def warn_alias_with_section : Warning<
@@ -359,17 +359,16 @@ def warn_profile_data_unprofiled : Warning<
"no profile data available for file \"%0\"">,
InGroup<ProfileInstrUnprofiled>;
def warn_profile_data_misexpect : Warning<
- "Potential performance regression from use of __builtin_expect(): "
- "Annotation was correct on %0 of profiled executions.">,
- BackendInfo,
- InGroup<MisExpect>;
+ "potential performance regression from use of __builtin_expect(): "
+ "annotation was correct on %0 of profiled executions">,
+ BackendInfo, InGroup<MisExpect>;
} // end of instrumentation issue category
def err_extract_api_ignores_file_not_found :
Error<"file '%0' specified by '--extract-api-ignores=' not found">, DefaultFatal;
def warn_missing_symbol_graph_dir : Warning<
- "Missing symbol graph output directory, defaulting to working directory">,
+ "missing symbol graph output directory, defaulting to working directory">,
InGroup<ExtractAPIMisuse>;
def err_ast_action_on_llvm_ir : Error<
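[Annotation, not part of the patch] The reworded warn_profile_data_misexpect text above concerns __builtin_expect annotations that disagree with collected profile data. A minimal C sketch of the pattern the MisExpect diagnostic group checks; the function and variable names are hypothetical:

    /* When built with profile data (e.g. -fprofile-instr-use=...), the branch
       weight implied by __builtin_expect is compared against the measured
       outcome; a mismatch emits the "potential performance regression"
       warning quoted above. */
    int handle(int rare_error) {
      if (__builtin_expect(rare_error, 0)) {   /* annotated as unlikely */
        return -1;
      }
      return 0;
    }
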
diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index 4cb4f3d..6b595a3 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -15,8 +15,6 @@ def Implicit : DiagGroup<"implicit", [
ImplicitInt
]>;
-def DeprecatedStaticAnalyzerFlag : DiagGroup<"deprecated-static-analyzer-flag">;
-
// Empty DiagGroups are recognized by clang but ignored.
def ODR : DiagGroup<"odr">;
def : DiagGroup<"abi">;
@@ -1447,6 +1445,10 @@ def FunctionMultiVersioning
def NoDeref : DiagGroup<"noderef">;
+// -fbounds-safety and bounds annotation related warnings
+def BoundsSafetyCountedByEltTyUnknownSize :
+ DiagGroup<"bounds-safety-counted-by-elt-type-unknown-size">;
+
// A group for cross translation unit static analysis related warnings.
def CrossTU : DiagGroup<"ctu">;
diff --git a/clang/include/clang/Basic/DiagnosticInstallAPIKinds.td b/clang/include/clang/Basic/DiagnosticInstallAPIKinds.td
index 944b2a3..cdf2724 100644
--- a/clang/include/clang/Basic/DiagnosticInstallAPIKinds.td
+++ b/clang/include/clang/Basic/DiagnosticInstallAPIKinds.td
@@ -59,8 +59,8 @@ def err_platform_mismatch : Error<"platform does not match: '%0' (provided) vs '
def err_install_name_mismatch : Error<"install_name does not match: '%0' (provided) vs '%1' (found)">;
def err_current_version_mismatch : Error<"current_version does not match: '%0' (provided) vs '%1' (found)">;
def err_compatibility_version_mismatch : Error<"compatibility_version does not match: '%0' (provided) vs '%1' (found)">;
-def err_appextension_safe_mismatch : Error<"ApplicationExtensionSafe flag does not match: '%0' (provided) vs '%1' (found)">;
-def err_shared_cache_eligiblity_mismatch : Error<"NotForDyldSharedCache flag does not match: '%0' (provided) vs '%1' (found)">;
+def err_appextension_safe_mismatch : Error<"the ApplicationExtensionSafe flag does not match: '%0' (provided) vs '%1' (found)">;
+def err_shared_cache_eligiblity_mismatch : Error<"the NotForDyldSharedCache flag does not match: '%0' (provided) vs '%1' (found)">;
def err_no_twolevel_namespace : Error<"flat namespace libraries are not supported">;
def err_parent_umbrella_missing: Error<"parent umbrella missing from %0: '%1'">;
def err_parent_umbrella_mismatch : Error<"parent umbrella does not match: '%0' (provided) vs '%1' (found)">;
diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td
index ad6bacf..5a4551a 100644
--- a/clang/include/clang/Basic/DiagnosticLexKinds.td
+++ b/clang/include/clang/Basic/DiagnosticLexKinds.td
@@ -991,5 +991,5 @@ def err_pp_unclosed_pragma_unsafe_buffer_usage :
Error<"'#pragma unsafe_buffer_usage' was not ended">;
def err_pp_pragma_unsafe_buffer_usage_syntax :
-Error<"Expected 'begin' or 'end'">;
+Error<"expected 'begin' or 'end'">;
}
diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td
index 46656fc..f8328be5 100644
--- a/clang/include/clang/Basic/DiagnosticParseKinds.td
+++ b/clang/include/clang/Basic/DiagnosticParseKinds.td
@@ -1117,7 +1117,7 @@ def err_availability_expected_environment : Error<
// objc_bridge_related attribute
def err_objcbridge_related_expected_related_class : Error<
- "expected a related ObjectiveC class name, e.g., 'NSColor'">;
+ "expected a related Objective-C class name, e.g., 'NSColor'">;
def err_objcbridge_related_selector_name : Error<
"expected a class method selector with single argument, e.g., 'colorWithCGColor:'">;
@@ -1345,8 +1345,8 @@ def note_pragma_attribute_namespace_on_attribute : Note<
"omit the namespace to add attributes to the most-recently"
" pushed attribute group">;
def warn_no_support_for_eval_method_source_on_m32 : Warning<
- "Setting the floating point evaluation method to `source` on a target"
- " without SSE is not supported.">, InGroup<Pragmas>;
+ "setting the floating point evaluation method to `source` on a target "
+ "without SSE is not supported">, InGroup<Pragmas>;
// - #pragma __debug
def warn_pragma_debug_dependent_argument : Warning<
"%select{value|type}0-dependent expression passed as an argument to debug "
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index c7dea1d..270b0a1 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -310,7 +310,7 @@ def err_invalid_vector_long_double_decl_spec : Error<
def err_invalid_vector_complex_decl_spec : Error<
"cannot use '_Complex' with '__vector'">;
def warn_vector_long_decl_spec_combination : Warning<
- "Use of 'long' with '__vector' is deprecated">, InGroup<Deprecated>;
+ "use of 'long' with '__vector' is deprecated">, InGroup<Deprecated>;
def err_redeclaration_different_type : Error<
"redeclaration of %0 with a different type%diff{: $ vs $|}1,2">;
@@ -754,7 +754,7 @@ def note_include_header_or_declare : Note<
def note_previous_builtin_declaration : Note<"%0 is a builtin with type %1">;
def warn_implicit_decl_no_jmp_buf
: Warning<"declaration of built-in function '%0' requires the declaration"
- " of the 'jmp_buf' type, commonly provided in the header <setjmp.h>.">,
+ " of the 'jmp_buf' type, commonly provided in the header <setjmp.h>">,
InGroup<DiagGroup<"incomplete-setjmp-declaration">>;
def warn_implicit_decl_requires_sysheader : Warning<
"declaration of built-in function '%1' requires inclusion of the header <%0>">,
@@ -3197,7 +3197,7 @@ def err_attribute_bad_sve_vector_size : Error<
"'-msve-vector-bits' ('%1')">;
def err_attribute_arm_feature_sve_bits_unsupported : Error<
"%0 is only supported when '-msve-vector-bits=<bits>' is specified with a "
- "value of 128, 256, 512, 1024 or 2048.">;
+ "value of 128, 256, 512, 1024 or 2048">;
def warn_attribute_arm_sm_incompat_builtin : Warning<
"builtin call has undefined behaviour when called from a %0 function">,
InGroup<DiagGroup<"undefined-arm-streaming">>;
@@ -3975,7 +3975,7 @@ def warn_acquired_before : Warning<
"%0 '%1' must be acquired before '%2'">,
InGroup<ThreadSafetyAnalysis>, DefaultIgnore;
def warn_acquired_before_after_cycle : Warning<
- "Cycle in acquired_before/after dependencies, starting with '%0'">,
+ "cycle in acquired_before/after dependencies, starting with '%0'">,
InGroup<ThreadSafetyAnalysis>, DefaultIgnore;
@@ -4526,7 +4526,7 @@ def err_objc_attr_typedef_not_void_pointer : Error<
def err_objc_cf_bridged_not_interface : Error<
"CF object of type %0 is bridged to %1, which is not an Objective-C class">;
def err_objc_ns_bridged_invalid_cfobject : Error<
- "ObjectiveC object of type %0 is bridged to %1, which is not valid CF object">;
+ "Objective-C object of type %0 is bridged to %1, which is not valid CF object">;
def warn_objc_invalid_bridge : Warning<
"%0 bridges to %1, not %2">, InGroup<ObjCBridge>;
def warn_objc_invalid_bridge_to_cf : Warning<
@@ -6544,8 +6544,10 @@ def warn_superclass_variable_sized_type_not_at_end : Warning<
def err_flexible_array_count_not_in_same_struct : Error<
"'counted_by' field %0 isn't within the same struct as the flexible array">;
-def err_counted_by_attr_not_on_flexible_array_member : Error<
- "'counted_by' only applies to C99 flexible array members">;
+def err_counted_by_attr_not_on_ptr_or_flexible_array_member : Error<
+ "'counted_by' only applies to pointers or C99 flexible array members">;
+def err_counted_by_attr_on_array_not_flexible_array_member : Error<
+ "'counted_by' on arrays only applies to C99 flexible array members">;
def err_counted_by_attr_refer_to_itself : Error<
"'counted_by' cannot refer to the flexible array member %0">;
def err_counted_by_must_be_in_structure : Error<
@@ -6560,6 +6562,23 @@ def err_counted_by_attr_refer_to_union : Error<
"'counted_by' argument cannot refer to a union member">;
def note_flexible_array_counted_by_attr_field : Note<
"field %0 declared here">;
+def err_counted_by_attr_pointee_unknown_size : Error<
+ "'counted_by' %select{cannot|should not}3 be applied to %select{"
+ "a pointer with pointee|" // pointer
+ "an array with element}0" // array
+ " of unknown size because %1 is %select{"
+ "an incomplete type|" // CountedByInvalidPointeeTypeKind::INCOMPLETE
+ "a sizeless type|" // CountedByInvalidPointeeTypeKind::SIZELESS
+ "a function type|" // CountedByInvalidPointeeTypeKind::FUNCTION
+ // CountedByInvalidPointeeTypeKind::FLEXIBLE_ARRAY_MEMBER
+ "a struct type with a flexible array member"
+ "%select{|. This will be an error in a future compiler version}3"
+ ""
+ "}2">;
+
+def warn_counted_by_attr_elt_type_unknown_size :
+ Warning<err_counted_by_attr_pointee_unknown_size.Summary>,
+ InGroup<BoundsSafetyCountedByEltTyUnknownSize>;
let CategoryName = "ARC Semantic Issue" in {
@@ -7525,6 +7544,11 @@ def err_explicit_object_parameter_mutable: Error<
def err_invalid_explicit_object_type_in_lambda: Error<
"invalid explicit object parameter type %0 in lambda with capture; "
"the type must be the same as, or derived from, the lambda">;
+def err_explicit_object_lambda_ambiguous_base : Error<
+ "lambda %0 is inaccessible due to ambiguity:%1">;
+def err_explicit_object_lambda_inaccessible_base : Error<
+ "invalid explicit object parameter type %0 in lambda with capture; "
+ "the type must derive publicly from the lambda">;
def err_ref_qualifier_overload : Error<
"cannot overload a member function %select{without a ref-qualifier|with "
@@ -7997,15 +8021,15 @@ def warn_deprecated_volatile_structured_binding : Warning<
InGroup<DeprecatedVolatile>;
def warn_deprecated_altivec_src_compat : Warning<
- "Current handling of vector bool and vector pixel types in this context are "
- "deprecated. The default behaviour will soon change to that implied by the "
+ "current handling of vector bool and vector pixel types in this context are "
+ "deprecated; the default behaviour will soon change to that implied by the "
"'-altivec-compat=xl' option">,
InGroup<DiagGroup<"deprecated-altivec-src-compat">>;
def warn_deprecated_lax_vec_conv_all : Warning<
- "Implicit conversion between vector types ('%0' and '%1') is deprecated. "
- "In the future, the behavior implied by '-fno-lax-vector-conversions' "
- "will be the default.">,
+ "implicit conversion between vector types ('%0' and '%1') is deprecated; "
+ "in the future, the behavior implied by '-fno-lax-vector-conversions' "
+ "will be the default">,
InGroup<DiagGroup<"deprecate-lax-vec-conv-all">>;
def err_catch_incomplete_ptr : Error<
@@ -8853,7 +8877,7 @@ def err_atomic_exclusive_builtin_pointer_size : Error<
"address argument to load or store exclusive builtin must be a pointer to"
" 1,2,4 or 8 byte type (%0 invalid)">;
def err_atomic_builtin_ext_int_size : Error<
- "Atomic memory operand must have a power-of-two size">;
+ "atomic memory operand must have a power-of-two size">;
def err_atomic_builtin_bit_int_prohibit : Error<
"argument to atomic builtin of type '_BitInt' is not supported">;
def err_atomic_op_needs_atomic : Error<
@@ -8961,8 +8985,8 @@ def err_va_arg_in_device : Error<
def err_alias_not_supported_on_nvptx : Error<"CUDA older than 10.0 does not support .alias">;
def err_cuda_unattributed_constexpr_cannot_overload_device : Error<
"constexpr function %0 without __host__ or __device__ attributes cannot "
- "overload __device__ function with same signature. Add a __host__ "
- "attribute, or build with -fno-cuda-host-device-constexpr.">;
+ "overload __device__ function with the same signature; add a __host__ "
+ "attribute, or build with -fno-cuda-host-device-constexpr">;
def note_cuda_conflicting_device_function_declared_here : Note<
"conflicting __device__ function declared here">;
def err_cuda_device_exceptions : Error<
@@ -8970,9 +8994,9 @@ def err_cuda_device_exceptions : Error<
"%select{__device__|__global__|__host__|__host__ __device__}1 function">;
def err_dynamic_var_init : Error<
"dynamic initialization is not supported for "
- "__device__, __constant__, __shared__, and __managed__ variables.">;
+ "__device__, __constant__, __shared__, and __managed__ variables">;
def err_shared_var_init : Error<
- "initialization is not supported for __shared__ variables.">;
+ "initialization is not supported for __shared__ variables">;
def err_cuda_vla : Error<
"cannot use variable-length arrays in "
"%select{__device__|__global__|__host__|__host__ __device__}0 functions">;
@@ -10056,12 +10080,6 @@ def warn_new_dangling_initializer_list : Warning<
"the allocated initializer list}0 "
"will be destroyed at the end of the full-expression">,
InGroup<DanglingInitializerList>;
-def warn_unsupported_lifetime_extension : Warning<
- "lifetime extension of "
- "%select{temporary|backing array of initializer list}0 created "
- "by aggregate initialization using a default member initializer "
- "is not yet supported; lifetime of %select{temporary|backing array}0 "
- "will end at the end of the full-expression">, InGroup<Dangling>;
// For non-floating point, expressions of the form x == x or x != x
// should result in a warning, since these always evaluate to a constant.
@@ -10237,9 +10255,6 @@ def err_fallthrough_attr_outside_switch : Error<
def err_fallthrough_attr_invalid_placement : Error<
"fallthrough annotation does not directly precede switch label">;
-def err_assume_attr_args : Error<
- "attribute '%0' requires a single expression argument">;
-
def warn_unreachable_default : Warning<
"default label in switch which covers all enumeration values">,
InGroup<CoveredSwitchDefault>, DefaultIgnore;
@@ -10365,12 +10380,12 @@ def err_shufflevector_argument_too_large : Error<
"index for __builtin_shufflevector must be less than the total number "
"of vector elements">;
def err_shufflevector_minus_one_is_undefined_behavior_constexpr : Error<
- "index for __builtin_shufflevector not within the bounds of the input vectors; index of -1 found at position %0 not permitted in a constexpr context.">;
+ "index for __builtin_shufflevector not within the bounds of the input vectors; index of -1 found at position %0 is not permitted in a constexpr context">;
def err_convertvector_non_vector : Error<
"first argument to __builtin_convertvector must be a vector">;
def err_convertvector_constexpr_unsupported_vector_cast : Error<
- "unsupported vector cast from %0 to %1 in a constant expression.">;
+ "unsupported vector cast from %0 to %1 in a constant expression">;
def err_builtin_non_vector_type : Error<
"%0 argument to %1 must be of vector type">;
def err_convertvector_incompatible_vector : Error<
@@ -10698,7 +10713,7 @@ def err_kernel_arg_address_space : Error<
"pointer arguments to kernel functions must reside in '__global', "
"'__constant' or '__local' address space">;
def err_opencl_ext_vector_component_invalid_length : Error<
- "vector component access has invalid length %0. Supported: 1,2,3,4,8,16.">;
+ "vector component access has invalid length %0; supported lengths are: 1,2,3,4,8,16">;
def err_opencl_function_variable : Error<
"%select{non-kernel function|function scope}0 variable cannot be declared in %1 address space">;
def err_opencl_addrspace_scope : Error<
@@ -11146,12 +11161,12 @@ def err_omp_atomic_compare : Error<
"the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}',"
" '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}',"
" 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type,"
- " and 'ordop' is one of '<' or '>'.">;
+ " and 'ordop' is one of '<' or '>'">;
def err_omp_atomic_compare_capture : Error<
"the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}',"
" '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}',"
" 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x', 'r', and 'v' are lvalue expressions with scalar type, 'expr', 'e', and 'd' are expressions with scalar type,"
- " and 'ordop' is one of '<' or '>'.">;
+ " and 'ordop' is one of '<' or '>'">;
def note_omp_atomic_compare: Note<
"%select{expected compound statement|expected exactly one expression statement|expected assignment statement|expected conditional operator|expect result value to be at false expression|"
"expect binary operator in conditional expression|expect '<', '>' or '==' as order operator|expect comparison in a form of 'x == e', 'e == x', 'x ordop expr', or 'expr ordop x'|"
@@ -11317,7 +11332,7 @@ def err_omp_expected_int_param : Error<
def err_omp_at_least_one_motion_clause_required : Error<
"expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'">;
def err_omp_cannot_update_with_internal_linkage : Error<
- "the host cannot update a declare target variable that is not externally visible.">;
+ "the host cannot update a declare target variable that is not externally visible">;
def err_omp_usedeviceptr_not_a_pointer : Error<
"expected pointer or reference to pointer in 'use_device_ptr' clause">;
def err_omp_argument_type_isdeviceptr : Error <
@@ -11338,10 +11353,10 @@ def err_omp_reduction_vla_unsupported : Error<
def err_omp_linear_distribute_var_non_loop_iteration : Error<
"only loop iteration variables are allowed in 'linear' clause in distribute directives">;
def warn_omp_non_trivial_type_mapped : Warning<
- "Type %0 is not trivially copyable and not guaranteed to be mapped correctly">,
+ "type %0 is not trivially copyable and not guaranteed to be mapped correctly">,
InGroup<OpenMPMapping>;
def err_omp_requires_clause_redeclaration : Error <
- "Only one %0 clause can appear on a requires directive in a single translation unit">;
+ "only one %0 clause can appear on a requires directive in a single translation unit">;
def note_omp_requires_previous_clause : Note <
"%0 clause previously used here">;
def err_omp_directive_before_requires : Error <
@@ -11349,7 +11364,7 @@ def err_omp_directive_before_requires : Error <
def note_omp_requires_encountered_directive : Note <
"'%0' previously encountered here">;
def err_omp_device_ancestor_without_requires_reverse_offload : Error <
- "Device clause with ancestor device-modifier used without specifying 'requires reverse_offload'">;
+ "device clause with ancestor device-modifier used without specifying 'requires reverse_offload'">;
def err_omp_invalid_scope : Error <
"'#pragma omp %0' directive must appear only in file scope">;
def note_omp_invalid_length_on_this_ptr_mapping : Note <
@@ -11761,7 +11776,7 @@ def note_await_ready_no_bool_conversion : Note<
"return type of 'await_ready' is required to be contextually convertible to 'bool'"
>;
def warn_coroutine_handle_address_invalid_return_type : Warning <
- "return type of 'coroutine_handle<>::address should be 'void*' (have %0) in order to get capability with existing async C API.">,
+ "return type of 'coroutine_handle<>::address should be 'void*' (have %0) in order to get capability with existing async C API">,
InGroup<Coroutine>;
def err_coroutine_promise_final_suspend_requires_nothrow : Error<
"the expression 'co_await __promise.final_suspend()' is required to be non-throwing"
@@ -11789,7 +11804,7 @@ def err_conflicting_aligned_options : Error <
"conflicting option '-fcoro-aligned-allocation' and '-fno-aligned-allocation'"
>;
def err_coro_invalid_addr_of_label : Error<
- "the GNU address of label extension is not allowed in coroutines."
+ "the GNU address of label extension is not allowed in coroutines"
>;
def err_coroutine_return_type : Error<
"function returns a type %0 marked with [[clang::coro_return_type]] but is neither a coroutine nor a coroutine wrapper; "
@@ -12389,4 +12404,8 @@ def err_acc_reduction_composite_type
def err_acc_reduction_composite_member_type :Error<
"OpenACC 'reduction' composite variable must not have non-scalar field">;
def note_acc_reduction_composite_member_loc : Note<"invalid field is here">;
+
+// AMDGCN builtins diagnostics
+def err_amdgcn_global_load_lds_size_invalid_value : Error<"invalid size value">;
+def note_amdgcn_global_load_lds_size_valid_value : Note<"size must be 1, 2, or 4">;
} // end of sema component.
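[Annotation, not part of the patch] The counted_by diagnostics changed above extend the attribute from C99 flexible array members to pointers and add checks for pointees of unknown size. A minimal C sketch; the struct and field names are hypothetical, and exact acceptance may depend on additional flags:

    struct item;                       /* incomplete type */

    struct buffer {
      int count;
      /* Pointer use, now covered by the reworded
         err_counted_by_attr_not_on_ptr_or_flexible_array_member. */
      int *elems __attribute__((counted_by(count)));
      /* Rejected per err_counted_by_attr_pointee_unknown_size: pointee of
         unknown size because 'struct item' is an incomplete type. */
      struct item *items __attribute__((counted_by(count)));
    };
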
diff --git a/clang/include/clang/Basic/FileManager.h b/clang/include/clang/Basic/FileManager.h
index 8b4206e..e1f33d5 100644
--- a/clang/include/clang/Basic/FileManager.h
+++ b/clang/include/clang/Basic/FileManager.h
@@ -299,6 +299,8 @@ private:
getBufferForFileImpl(StringRef Filename, int64_t FileSize, bool isVolatile,
bool RequiresNullTerminator) const;
+ DirectoryEntry *&getRealDirEntry(const llvm::vfs::Status &Status);
+
public:
/// Get the 'stat' information for the given \p Path.
///
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 09eb92d..4061451 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -300,6 +300,7 @@ LANGOPT(HIPStdParInterposeAlloc, 1, 0, "Replace allocations / deallocations with
LANGOPT(OpenACC , 1, 0, "OpenACC Enabled")
+LANGOPT(MSVCEnableStdcMacro , 1, 0, "Define __STDC__ with '-fms-compatibility'")
LANGOPT(SizedDeallocation , 1, 0, "sized deallocation")
LANGOPT(AlignedAllocation , 1, 0, "aligned allocation")
LANGOPT(AlignedAllocationUnavailable, 1, 0, "aligned allocation functions are unavailable")
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index a9ea71c..03570f9 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -2186,9 +2186,6 @@ let TargetGuard = "sme2" in {
def SVSQRSHRUN_X4 : SInst<"svqrshrun[_n]_{0}[_{d}_x4]", "b4i", "il", MergeNone, "aarch64_sve_sqrshrun_x4", [IsStreaming], [ImmCheck<1, ImmCheckShiftRight, 0>]>;
- def REINTERPRET_SVBOOL_TO_SVCOUNT : Inst<"svreinterpret[_c]", "}P", "Pc", MergeNone, "", [IsStreamingCompatible], []>;
- def REINTERPRET_SVCOUNT_TO_SVBOOL : Inst<"svreinterpret[_b]", "P}", "Pc", MergeNone, "", [IsStreamingCompatible], []>;
-
// SQDMULH
def SVSQDMULH_SINGLE_X2 : SInst<"svqdmulh[_single_{d}_x2]", "22d", "csil", MergeNone, "aarch64_sve_sqdmulh_single_vgx2", [IsStreaming], []>;
def SVSQDMULH_SINGLE_X4 : SInst<"svqdmulh[_single_{d}_x4]", "44d", "csil", MergeNone, "aarch64_sve_sqdmulh_single_vgx4", [IsStreaming], []>;
@@ -2197,6 +2194,9 @@ let TargetGuard = "sme2" in {
}
let TargetGuard = "sve2p1|sme2" in {
+ def REINTERPRET_SVBOOL_TO_SVCOUNT : Inst<"svreinterpret[_c]", "}P", "Pc", MergeNone, "", [IsStreamingCompatible], []>;
+ def REINTERPRET_SVCOUNT_TO_SVBOOL : Inst<"svreinterpret[_b]", "P}", "Pc", MergeNone, "", [IsStreamingCompatible], []>;
+
// SQRSHRN / UQRSHRN
def SVQRSHRN_X2 : SInst<"svqrshrn[_n]_{0}[_{d}_x2]", "h2i", "i", MergeNone, "aarch64_sve_sqrshrn_x2", [IsStreamingCompatible], [ImmCheck<1, ImmCheck1_16>]>;
def SVUQRSHRN_X2 : SInst<"svqrshrn[_n]_{0}[_{d}_x2]", "e2i", "Ui", MergeNone, "aarch64_sve_uqrshrn_x2", [IsStreamingCompatible], [ImmCheck<1, ImmCheck1_16>]>;
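[Annotation, not part of the patch] The arm_sve.td hunk above moves the svbool/svcount reinterprets from the sme2-only block into the sve2p1|sme2 guard. A hedged C sketch of the overloaded spellings those .td entries declare, assuming the usual <arm_sve.h> ACLE surface:

    #include <arm_sve.h>

    /* With this change these are accepted when either +sve2p1 or +sme2 is
       enabled, instead of only inside the sme2 block. */
    svcount_t to_count(svbool_t pg) { return svreinterpret_c(pg); }
    svbool_t  to_bool(svcount_t pn) { return svreinterpret_b(pn); }
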
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 7bb7816..de2f245 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -603,6 +603,7 @@ class MarshallingInfoVisibility<KeyPathAndMacro kpm, code default>
// Key paths that are constant during parsing of options with the same key path prefix.
defvar cplusplus = LangOpts<"CPlusPlus">;
defvar cpp11 = LangOpts<"CPlusPlus11">;
+defvar cpp14 = LangOpts<"CPlusPlus14">;
defvar cpp17 = LangOpts<"CPlusPlus17">;
defvar cpp20 = LangOpts<"CPlusPlus20">;
defvar c99 = LangOpts<"C99">;
@@ -2980,6 +2981,10 @@ def fms_compatibility : Flag<["-"], "fms-compatibility">, Group<f_Group>,
Visibility<[ClangOption, CC1Option, CLOption]>,
HelpText<"Enable full Microsoft Visual C++ compatibility">,
MarshallingInfoFlag<LangOpts<"MSVCCompat">>;
+def fms_define_stdc : Flag<["-"], "fms-define-stdc">, Group<f_Group>,
+ Visibility<[ClangOption, CC1Option, CLOption]>,
+ HelpText<"Define '__STDC__' to '1' in MSVC Compatibility mode">,
+ MarshallingInfoFlag<LangOpts<"MSVCEnableStdcMacro">>;
def fms_extensions : Flag<["-"], "fms-extensions">, Group<f_Group>,
Visibility<[ClangOption, CC1Option, CLOption]>,
HelpText<"Accept some non-standard constructs supported by the Microsoft compiler">,
@@ -3388,10 +3393,9 @@ defm relaxed_template_template_args : BoolFOption<"relaxed-template-template-arg
NegFlag<SetFalse, [], [CC1Option], "Disable">,
BothFlags<[], [ClangOption], " C++17 relaxed template template argument matching">>;
defm sized_deallocation : BoolFOption<"sized-deallocation",
- LangOpts<"SizedDeallocation">, DefaultFalse,
- PosFlag<SetTrue, [], [ClangOption, CC1Option],
- "Enable C++14 sized global deallocation functions">,
- NegFlag<SetFalse>>;
+ LangOpts<"SizedDeallocation">, Default<cpp14.KeyPath>,
+ PosFlag<SetTrue, [], [], "Enable C++14 sized global deallocation functions">,
+ NegFlag<SetFalse>, BothFlags<[], [ClangOption, CC1Option]>>;
defm aligned_allocation : BoolFOption<"aligned-allocation",
LangOpts<"AlignedAllocation">, Default<cpp17.KeyPath>,
PosFlag<SetTrue, [], [ClangOption], "Enable C++17 aligned allocation functions">,
@@ -6111,14 +6115,10 @@ def mavx512cd : Flag<["-"], "mavx512cd">, Group<m_x86_Features_Group>;
def mno_avx512cd : Flag<["-"], "mno-avx512cd">, Group<m_x86_Features_Group>;
def mavx512dq : Flag<["-"], "mavx512dq">, Group<m_x86_Features_Group>;
def mno_avx512dq : Flag<["-"], "mno-avx512dq">, Group<m_x86_Features_Group>;
-def mavx512er : Flag<["-"], "mavx512er">, Group<m_x86_Features_Group>;
-def mno_avx512er : Flag<["-"], "mno-avx512er">, Group<m_x86_Features_Group>;
def mavx512fp16 : Flag<["-"], "mavx512fp16">, Group<m_x86_Features_Group>;
def mno_avx512fp16 : Flag<["-"], "mno-avx512fp16">, Group<m_x86_Features_Group>;
def mavx512ifma : Flag<["-"], "mavx512ifma">, Group<m_x86_Features_Group>;
def mno_avx512ifma : Flag<["-"], "mno-avx512ifma">, Group<m_x86_Features_Group>;
-def mavx512pf : Flag<["-"], "mavx512pf">, Group<m_x86_Features_Group>;
-def mno_avx512pf : Flag<["-"], "mno-avx512pf">, Group<m_x86_Features_Group>;
def mavx512vbmi : Flag<["-"], "mavx512vbmi">, Group<m_x86_Features_Group>;
def mno_avx512vbmi : Flag<["-"], "mno-avx512vbmi">, Group<m_x86_Features_Group>;
def mavx512vbmi2 : Flag<["-"], "mavx512vbmi2">, Group<m_x86_Features_Group>;
@@ -6209,8 +6209,6 @@ def mpopcnt : Flag<["-"], "mpopcnt">, Group<m_x86_Features_Group>;
def mno_popcnt : Flag<["-"], "mno-popcnt">, Group<m_x86_Features_Group>;
def mprefetchi : Flag<["-"], "mprefetchi">, Group<m_x86_Features_Group>;
def mno_prefetchi : Flag<["-"], "mno-prefetchi">, Group<m_x86_Features_Group>;
-def mprefetchwt1 : Flag<["-"], "mprefetchwt1">, Group<m_x86_Features_Group>;
-def mno_prefetchwt1 : Flag<["-"], "mno-prefetchwt1">, Group<m_x86_Features_Group>;
def mprfchw : Flag<["-"], "mprfchw">, Group<m_x86_Features_Group>;
def mno_prfchw : Flag<["-"], "mno-prfchw">, Group<m_x86_Features_Group>;
def mptwrite : Flag<["-"], "mptwrite">, Group<m_x86_Features_Group>;
@@ -8312,6 +8310,9 @@ def _SLASH_vd : CLJoined<"vd">, HelpText<"Control vtordisp placement">,
Alias<vtordisp_mode_EQ>;
def _SLASH_X : CLFlag<"X">,
HelpText<"Do not add %INCLUDE% to include search path">, Alias<nostdlibinc>;
+def _SLASH_Zc___STDC__ : CLFlag<"Zc:__STDC__">,
+ HelpText<"Define __STDC__">,
+ Alias<fms_define_stdc>;
def _SLASH_Zc_sizedDealloc : CLFlag<"Zc:sizedDealloc">,
HelpText<"Enable C++14 sized global deallocation functions">,
Alias<fsized_deallocation>;
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index 3c4ab64..8493026 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -1646,8 +1646,12 @@ private:
void ParseLexedAttributes(ParsingClass &Class);
void ParseLexedAttributeList(LateParsedAttrList &LAs, Decl *D,
bool EnterScope, bool OnDefinition);
+ void ParseLexedCAttributeList(LateParsedAttrList &LA, bool EnterScope,
+ ParsedAttributes *OutAttrs = nullptr);
void ParseLexedAttribute(LateParsedAttribute &LA,
bool EnterScope, bool OnDefinition);
+ void ParseLexedCAttribute(LateParsedAttribute &LA, bool EnterScope,
+ ParsedAttributes *OutAttrs = nullptr);
void ParseLexedMethodDeclarations(ParsingClass &Class);
void ParseLexedMethodDeclaration(LateParsedMethodDeclaration &LM);
void ParseLexedMethodDefs(ParsingClass &Class);
@@ -2534,7 +2538,8 @@ private:
void ParseStructDeclaration(
ParsingDeclSpec &DS,
- llvm::function_ref<void(ParsingFieldDeclarator &)> FieldsCallback);
+ llvm::function_ref<Decl *(ParsingFieldDeclarator &)> FieldsCallback,
+ LateParsedAttrList *LateFieldAttrs = nullptr);
DeclGroupPtrTy ParseTopLevelStmtDecl();
@@ -2814,7 +2819,7 @@ private:
SourceLocation CorrectLocation);
void stripTypeAttributesOffDeclSpec(ParsedAttributes &Attrs, DeclSpec &DS,
- Sema::TagUseKind TUK);
+ TagUseKind TUK);
// FixItLoc = possible correct location for the attributes
void ProhibitAttributes(ParsedAttributes &Attrs,
@@ -2997,7 +3002,8 @@ private:
bool ParseCXXAssumeAttributeArg(ParsedAttributes &Attrs,
IdentifierInfo *AttrName,
SourceLocation AttrNameLoc,
- SourceLocation *EndLoc);
+ SourceLocation *EndLoc,
+ ParsedAttr::Form Form);
IdentifierInfo *TryParseCXX11AttributeIdentifier(
SourceLocation &Loc,
@@ -3112,6 +3118,8 @@ private:
SourceLocation ScopeLoc,
ParsedAttr::Form Form);
+ void DistributeCLateParsedAttrs(Decl *Dcl, LateParsedAttrList *LateAttrs);
+
void ParseBoundsAttribute(IdentifierInfo &AttrName,
SourceLocation AttrNameLoc, ParsedAttributes &Attrs,
IdentifierInfo *ScopeName, SourceLocation ScopeLoc,
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 01ddba5..5247379 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -175,7 +175,9 @@ class SemaObjC;
class SemaOpenACC;
class SemaOpenMP;
class SemaPseudoObject;
+class SemaRISCV;
class SemaSYCL;
+class SemaX86;
class StandardConversionSequence;
class Stmt;
class StringLiteral;
@@ -447,6 +449,13 @@ enum class CheckedConversionKind {
ForBuiltinOverloadedOp
};
+enum class TagUseKind {
+ Reference, // Reference to a tag: 'struct foo *X;'
+ Declaration, // Fwd decl of a tag: 'struct foo;'
+ Definition, // Definition of a tag: 'struct foo { int X; } Y;'
+ Friend // Friend declaration: 'friend struct foo;'
+};
+
/// Sema - This implements semantic analysis and AST building for C.
/// \nosubgrouping
class Sema final : public SemaBase {
@@ -484,7 +493,6 @@ class Sema final : public SemaBase {
// 29. Constraints and Concepts (SemaConcept.cpp)
// 30. Types (SemaType.cpp)
// 31. FixIt Helpers (SemaFixItUtils.cpp)
- // 32. Name Lookup for RISC-V Vector Intrinsic (SemaRISCVVectorLookup.cpp)
/// \name Semantic Analysis
/// Implementations are in Sema.cpp
@@ -1020,11 +1028,21 @@ public:
return *PseudoObjectPtr;
}
+ SemaRISCV &RISCV() {
+ assert(RISCVPtr);
+ return *RISCVPtr;
+ }
+
SemaSYCL &SYCL() {
assert(SYCLPtr);
return *SYCLPtr;
}
+ SemaX86 &X86() {
+ assert(X86Ptr);
+ return *X86Ptr;
+ }
+
/// Source of additional semantic information.
IntrusiveRefCntPtr<ExternalSemaSource> ExternalSource;
@@ -1062,7 +1080,9 @@ private:
std::unique_ptr<SemaOpenACC> OpenACCPtr;
std::unique_ptr<SemaOpenMP> OpenMPPtr;
std::unique_ptr<SemaPseudoObject> PseudoObjectPtr;
+ std::unique_ptr<SemaRISCV> RISCVPtr;
std::unique_ptr<SemaSYCL> SYCLPtr;
+ std::unique_ptr<SemaX86> X86Ptr;
///@}
@@ -2037,6 +2057,23 @@ public:
void CheckConstrainedAuto(const AutoType *AutoT, SourceLocation Loc);
+ bool BuiltinConstantArg(CallExpr *TheCall, int ArgNum, llvm::APSInt &Result);
+ bool BuiltinConstantArgRange(CallExpr *TheCall, int ArgNum, int Low, int High,
+ bool RangeIsError = true);
+ bool BuiltinConstantArgMultiple(CallExpr *TheCall, int ArgNum,
+ unsigned Multiple);
+ bool BuiltinConstantArgPower2(CallExpr *TheCall, int ArgNum);
+ bool BuiltinConstantArgShiftedByte(CallExpr *TheCall, int ArgNum,
+ unsigned ArgBits);
+ bool BuiltinConstantArgShiftedByteOrXXFF(CallExpr *TheCall, int ArgNum,
+ unsigned ArgBits);
+
+ bool checkArgCountAtLeast(CallExpr *Call, unsigned MinArgCount);
+ bool checkArgCountAtMost(CallExpr *Call, unsigned MaxArgCount);
+ bool checkArgCountRange(CallExpr *Call, unsigned MinArgCount,
+ unsigned MaxArgCount);
+ bool checkArgCount(CallExpr *Call, unsigned DesiredArgCount);
+
private:
void CheckArrayAccess(const Expr *BaseExpr, const Expr *IndexExpr,
const ArraySubscriptExpr *ASE = nullptr,
@@ -2092,24 +2129,10 @@ private:
CallExpr *TheCall);
bool CheckMipsBuiltinArgument(unsigned BuiltinID, CallExpr *TheCall);
bool CheckSystemZBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
- bool CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall);
- bool CheckX86BuiltinGatherScatterScale(unsigned BuiltinID, CallExpr *TheCall);
- bool CheckX86BuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall);
- bool CheckX86BuiltinTileArgumentsRange(CallExpr *TheCall,
- ArrayRef<int> ArgNums);
- bool CheckX86BuiltinTileDuplicate(CallExpr *TheCall, ArrayRef<int> ArgNums);
- bool CheckX86BuiltinTileRangeAndDuplicate(CallExpr *TheCall,
- ArrayRef<int> ArgNums);
- bool CheckX86BuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
- CallExpr *TheCall);
bool CheckPPCBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
CallExpr *TheCall);
bool CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
- bool CheckRISCVLMUL(CallExpr *TheCall, unsigned ArgNum);
- bool CheckRISCVBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
- CallExpr *TheCall);
- void checkRVVTypeSupport(QualType Ty, SourceLocation Loc, Decl *D,
- const llvm::StringMap<bool> &FeatureMap);
+
bool CheckLoongArchBuiltinFunctionCall(const TargetInfo &TI,
unsigned BuiltinID, CallExpr *TheCall);
bool CheckWebAssemblyBuiltinFunctionCall(const TargetInfo &TI,
@@ -2139,16 +2162,6 @@ private:
ExprResult BuiltinNontemporalOverloaded(ExprResult TheCallResult);
ExprResult AtomicOpsOverloaded(ExprResult TheCallResult,
AtomicExpr::AtomicOp Op);
- bool BuiltinConstantArg(CallExpr *TheCall, int ArgNum, llvm::APSInt &Result);
- bool BuiltinConstantArgRange(CallExpr *TheCall, int ArgNum, int Low, int High,
- bool RangeIsError = true);
- bool BuiltinConstantArgMultiple(CallExpr *TheCall, int ArgNum,
- unsigned Multiple);
- bool BuiltinConstantArgPower2(CallExpr *TheCall, int ArgNum);
- bool BuiltinConstantArgShiftedByte(CallExpr *TheCall, int ArgNum,
- unsigned ArgBits);
- bool BuiltinConstantArgShiftedByteOrXXFF(CallExpr *TheCall, int ArgNum,
- unsigned ArgBits);
bool BuiltinARMSpecialReg(unsigned BuiltinID, CallExpr *TheCall, int ArgNum,
unsigned ExpectedFieldNum, bool AllowName);
bool BuiltinARMMemoryTaggingCall(unsigned BuiltinID, CallExpr *TheCall);
@@ -3168,13 +3181,6 @@ public:
bool isDefinition, SourceLocation NewTagLoc,
const IdentifierInfo *Name);
- enum TagUseKind {
- TUK_Reference, // Reference to a tag: 'struct foo *X;'
- TUK_Declaration, // Fwd decl of a tag: 'struct foo;'
- TUK_Definition, // Definition of a tag: 'struct foo { int X; } Y;'
- TUK_Friend // Friend declaration: 'friend struct foo;'
- };
-
enum OffsetOfKind {
// Not parsing a type within __builtin_offsetof.
OOK_Outside,
@@ -5106,6 +5112,13 @@ public:
Context == ExpressionEvaluationContext::UnevaluatedList;
}
+ bool isPotentiallyEvaluated() const {
+ return Context == ExpressionEvaluationContext::PotentiallyEvaluated ||
+ Context ==
+ ExpressionEvaluationContext::PotentiallyEvaluatedIfUsed ||
+ Context == ExpressionEvaluationContext::ConstantEvaluated;
+ }
+
bool isConstantEvaluated() const {
return Context == ExpressionEvaluationContext::ConstantEvaluated ||
Context == ExpressionEvaluationContext::ImmediateFunctionContext;
@@ -5140,6 +5153,12 @@ public:
return ExprEvalContexts.back();
};
+ const ExpressionEvaluationContextRecord &parentEvaluationContext() const {
+ assert(ExprEvalContexts.size() >= 2 &&
+ "Must be in an expression evaluation context");
+ return ExprEvalContexts[ExprEvalContexts.size() - 2];
+ };
+
bool isBoundsAttrContext() const {
return ExprEvalContexts.back().ExprContext ==
ExpressionEvaluationContextRecord::ExpressionKind::
@@ -5890,7 +5909,6 @@ public:
SourceLocation Loc, bool IsCompAssign);
bool isValidSveBitcast(QualType srcType, QualType destType);
- bool isValidRVVBitcast(QualType srcType, QualType destType);
bool areMatrixTypesOfTheSameDimension(QualType srcTy, QualType destTy);
@@ -7063,7 +7081,9 @@ public:
StorageClass SC, ArrayRef<ParmVarDecl *> Params,
bool HasExplicitResultType);
- void DiagnoseInvalidExplicitObjectParameterInLambda(CXXMethodDecl *Method);
+ /// Returns true if the explicit object parameter was invalid.
+ bool DiagnoseInvalidExplicitObjectParameterInLambda(CXXMethodDecl *Method,
+ SourceLocation CallLoc);
/// Perform initialization analysis of the init-capture and perform
/// any implicit conversions such as an lvalue-to-rvalue conversion if
@@ -10067,7 +10087,9 @@ public:
bool SubstTemplateArgument(const TemplateArgumentLoc &Input,
const MultiLevelTemplateArgumentList &TemplateArgs,
- TemplateArgumentLoc &Output);
+ TemplateArgumentLoc &Output,
+ SourceLocation Loc = {},
+ const DeclarationName &Entity = {});
bool
SubstTemplateArguments(ArrayRef<TemplateArgumentLoc> Args,
const MultiLevelTemplateArgumentList &TemplateArgs,
@@ -11381,7 +11403,8 @@ public:
QualType BuildMatrixType(QualType T, Expr *NumRows, Expr *NumColumns,
SourceLocation AttrLoc);
- QualType BuildCountAttributedArrayType(QualType WrappedTy, Expr *CountExpr);
+ QualType BuildCountAttributedArrayOrPointerType(QualType WrappedTy,
+ Expr *CountExpr);
QualType BuildAddressSpaceAttr(QualType &T, LangAS ASIdx, Expr *AddrSpace,
SourceLocation AttrLoc);
@@ -11685,27 +11708,6 @@ public:
void ProcessAPINotes(Decl *D);
///@}
- //
- //
- // -------------------------------------------------------------------------
- //
- //
-
- /// \name Name Lookup for RISC-V Vector Intrinsic
- /// Implementations are in SemaRISCVVectorLookup.cpp
- ///@{
-
-public:
- /// Indicate RISC-V vector builtin functions enabled or not.
- bool DeclareRISCVVBuiltins = false;
-
- /// Indicate RISC-V SiFive vector builtin functions enabled or not.
- bool DeclareRISCVSiFiveVectorBuiltins = false;
-
-private:
- std::unique_ptr<sema::RISCVIntrinsicManager> RVIntrinsicManager;
-
- ///@}
};
DeductionFailureInfo
@@ -11727,9 +11729,6 @@ void Sema::PragmaStack<Sema::AlignPackInfo>::Act(SourceLocation PragmaLocation,
PragmaMsStackAction Action,
llvm::StringRef StackSlotLabel,
AlignPackInfo Value);
-
-std::unique_ptr<sema::RISCVIntrinsicManager>
-CreateRISCVIntrinsicManager(Sema &S);
} // end namespace clang
#endif
diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h
index 9927459..51981e1c 100644
--- a/clang/include/clang/Sema/SemaOpenMP.h
+++ b/clang/include/clang/Sema/SemaOpenMP.h
@@ -1390,9 +1390,7 @@ private:
bool checkTransformableLoopNest(
OpenMPDirectiveKind Kind, Stmt *AStmt, int NumLoops,
SmallVectorImpl<OMPLoopBasedDirective::HelperExprs> &LoopHelpers,
- Stmt *&Body,
- SmallVectorImpl<SmallVector<llvm::PointerUnion<Stmt *, Decl *>, 0>>
- &OriginalInits);
+ Stmt *&Body, SmallVectorImpl<SmallVector<Stmt *, 0>> &OriginalInits);
/// Helper to keep information about the current `omp begin/end declare
/// variant` nesting.
diff --git a/clang/include/clang/Sema/SemaRISCV.h b/clang/include/clang/Sema/SemaRISCV.h
new file mode 100644
index 0000000..b6dd81f
--- /dev/null
+++ b/clang/include/clang/Sema/SemaRISCV.h
@@ -0,0 +1,52 @@
+//===----- SemaRISCV.h ---- RISC-V target-specific routines ---*- C++ -*---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares semantic analysis functions specific to RISC-V.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_SEMA_SEMARISCV_H
+#define LLVM_CLANG_SEMA_SEMARISCV_H
+
+#include "clang/AST/DeclBase.h"
+#include "clang/AST/Expr.h"
+#include "clang/AST/Type.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/TargetInfo.h"
+#include "clang/Sema/RISCVIntrinsicManager.h"
+#include "clang/Sema/SemaBase.h"
+#include "llvm/ADT/StringMap.h"
+#include <memory>
+
+namespace clang {
+class SemaRISCV : public SemaBase {
+public:
+ SemaRISCV(Sema &S);
+
+ bool CheckLMUL(CallExpr *TheCall, unsigned ArgNum);
+ bool CheckBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
+ CallExpr *TheCall);
+ void checkRVVTypeSupport(QualType Ty, SourceLocation Loc, Decl *D,
+ const llvm::StringMap<bool> &FeatureMap);
+
+ bool isValidRVVBitcast(QualType srcType, QualType destType);
+
+ /// Indicate RISC-V vector builtin functions enabled or not.
+ bool DeclareRVVBuiltins = false;
+
+ /// Indicate RISC-V SiFive vector builtin functions enabled or not.
+ bool DeclareSiFiveVectorBuiltins = false;
+
+ std::unique_ptr<sema::RISCVIntrinsicManager> IntrinsicManager;
+};
+
+std::unique_ptr<sema::RISCVIntrinsicManager>
+CreateRISCVIntrinsicManager(Sema &S);
+} // namespace clang
+
+#endif // LLVM_CLANG_SEMA_SEMARISCV_H
diff --git a/clang/include/clang/Sema/SemaX86.h b/clang/include/clang/Sema/SemaX86.h
new file mode 100644
index 0000000..e322483
--- /dev/null
+++ b/clang/include/clang/Sema/SemaX86.h
@@ -0,0 +1,38 @@
+//===----- SemaX86.h ------- X86 target-specific routines -----*- C++ -*---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares semantic analysis functions specific to X86.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_SEMA_SEMAX86_H
+#define LLVM_CLANG_SEMA_SEMAX86_H
+
+#include "clang/AST/Expr.h"
+#include "clang/Basic/LLVM.h"
+#include "clang/Basic/TargetInfo.h"
+#include "clang/Sema/SemaBase.h"
+
+namespace clang {
+class SemaX86 : public SemaBase {
+public:
+ SemaX86(Sema &S);
+
+ bool CheckBuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall);
+ bool CheckBuiltinGatherScatterScale(unsigned BuiltinID, CallExpr *TheCall);
+ bool CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall);
+ bool CheckBuiltinTileArgumentsRange(CallExpr *TheCall, ArrayRef<int> ArgNums);
+ bool CheckBuiltinTileDuplicate(CallExpr *TheCall, ArrayRef<int> ArgNums);
+ bool CheckBuiltinTileRangeAndDuplicate(CallExpr *TheCall,
+ ArrayRef<int> ArgNums);
+ bool CheckBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
+ CallExpr *TheCall);
+};
+} // namespace clang
+
+#endif // LLVM_CLANG_SEMA_SEMAX86_H
diff --git a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
index 64414e3..40f4430 100644
--- a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
+++ b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
@@ -1011,6 +1011,11 @@ def FloatLoopCounter : Checker<"FloatLoopCounter">,
Dependencies<[SecuritySyntaxChecker]>,
Documentation<HasDocumentation>;
+def SetgidSetuidOrderChecker : Checker<"SetgidSetuidOrder">,
+ HelpText<"Warn on possible reversed order of 'setgid(getgid()))' and "
+ "'setuid(getuid())' (CERT: POS36-C)">,
+ Documentation<HasDocumentation>;
+
} // end "security"
let ParentPackage = ENV in {
@@ -1030,15 +1035,6 @@ let ParentPackage = ENV in {
} // end "security.cert.env"
-let ParentPackage = POSAlpha in {
-
- def PutenvWithAuto : Checker<"34c">,
- HelpText<"Finds calls to the 'putenv' function which pass a pointer to "
- "an automatic variable as the argument.">,
- Documentation<HasDocumentation>;
-
-} // end "alpha.cert.pos"
-
let ParentPackage = SecurityAlpha in {
def ArrayBoundChecker : Checker<"ArrayBound">,
@@ -1049,10 +1045,6 @@ def ArrayBoundCheckerV2 : Checker<"ArrayBoundV2">,
HelpText<"Warn about buffer overflows (newer checker)">,
Documentation<HasDocumentation>;
-def ReturnPointerRangeChecker : Checker<"ReturnPtrRange">,
- HelpText<"Check for an out-of-bound pointer being returned to callers">,
- Documentation<HasDocumentation>;
-
def MallocOverflowSecurityChecker : Checker<"MallocOverflow">,
HelpText<"Check for overflows in the arguments to malloc()">,
Documentation<HasDocumentation>;
@@ -1073,6 +1065,15 @@ def MmapWriteExecChecker : Checker<"MmapWriteExec">,
]>,
Documentation<HasDocumentation>;
+def PutenvStackArray : Checker<"PutenvStackArray">,
+ HelpText<"Finds calls to the function 'putenv' which pass a pointer to "
+ "an automatic (stack-allocated) array as the argument.">,
+ Documentation<HasDocumentation>;
+
+def ReturnPointerRangeChecker : Checker<"ReturnPtrRange">,
+ HelpText<"Check for an out-of-bound pointer being returned to callers">,
+ Documentation<HasDocumentation>;
+
} // end "alpha.security"
//===----------------------------------------------------------------------===//
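[Annotation, not part of the patch] The Checkers.td hunks above rename the old alpha.cert.pos putenv checker to alpha.security.PutenvStackArray and add security.SetgidSetuidOrder. A minimal C sketch of the pattern PutenvStackArray flags, per its HelpText; the function name is hypothetical:

    #include <stdlib.h>

    void set_env_from_stack(void) {
      char buf[] = "MODE=debug";   /* automatic (stack-allocated) array */
      putenv(buf);                 /* flagged: the array may go out of scope
                                      while still registered in the
                                      environment */
    }
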
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 52eab5f..a2398fe 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -6494,7 +6494,8 @@ bool ASTContext::isSameDefaultTemplateArgument(const NamedDecl *X,
if (!TTPX->hasDefaultArgument() || !TTPY->hasDefaultArgument())
return false;
- return hasSameType(TTPX->getDefaultArgument(), TTPY->getDefaultArgument());
+ return hasSameType(TTPX->getDefaultArgument().getArgument().getAsType(),
+ TTPY->getDefaultArgument().getArgument().getAsType());
}
if (auto *NTTPX = dyn_cast<NonTypeTemplateParmDecl>(X)) {
@@ -6502,8 +6503,10 @@ bool ASTContext::isSameDefaultTemplateArgument(const NamedDecl *X,
if (!NTTPX->hasDefaultArgument() || !NTTPY->hasDefaultArgument())
return false;
- Expr *DefaultArgumentX = NTTPX->getDefaultArgument()->IgnoreImpCasts();
- Expr *DefaultArgumentY = NTTPY->getDefaultArgument()->IgnoreImpCasts();
+ Expr *DefaultArgumentX =
+ NTTPX->getDefaultArgument().getArgument().getAsExpr()->IgnoreImpCasts();
+ Expr *DefaultArgumentY =
+ NTTPY->getDefaultArgument().getArgument().getAsExpr()->IgnoreImpCasts();
llvm::FoldingSetNodeID XID, YID;
DefaultArgumentX->Profile(XID, *this, /*Canonical=*/true);
DefaultArgumentY->Profile(YID, *this, /*Canonical=*/true);
diff --git a/clang/lib/AST/ASTDiagnostic.cpp b/clang/lib/AST/ASTDiagnostic.cpp
index 7b0d5f9..0680ff5 100644
--- a/clang/lib/AST/ASTDiagnostic.cpp
+++ b/clang/lib/AST/ASTDiagnostic.cpp
@@ -1215,46 +1215,19 @@ class TemplateDiff {
bool &NeedAddressOf) {
if (!Iter.isEnd()) {
switch (Iter->getKind()) {
- default:
- llvm_unreachable("unknown ArgumentKind");
- case TemplateArgument::Integral:
- Value = Iter->getAsIntegral();
- HasInt = true;
- IntType = Iter->getIntegralType();
- return;
- case TemplateArgument::Declaration: {
- VD = Iter->getAsDecl();
- QualType ArgType = Iter->getParamTypeForDecl();
- QualType VDType = VD->getType();
- if (ArgType->isPointerType() &&
- Context.hasSameType(ArgType->getPointeeType(), VDType))
- NeedAddressOf = true;
- return;
- }
- case TemplateArgument::NullPtr:
- IsNullPtr = true;
- return;
- case TemplateArgument::Expression:
- E = Iter->getAsExpr();
- }
- } else if (!Default->isParameterPack()) {
- E = Default->getDefaultArgument();
- }
-
- if (!Iter.hasDesugaredTA()) return;
-
- const TemplateArgument& TA = Iter.getDesugaredTA();
- switch (TA.getKind()) {
- default:
- llvm_unreachable("unknown ArgumentKind");
+ case TemplateArgument::StructuralValue:
+ // FIXME: Diffing of structural values is not implemented.
+ // There is no possible fallback in this case, this will show up
+ // as '(no argument)'.
+ return;
case TemplateArgument::Integral:
- Value = TA.getAsIntegral();
+ Value = Iter->getAsIntegral();
HasInt = true;
- IntType = TA.getIntegralType();
+ IntType = Iter->getIntegralType();
return;
case TemplateArgument::Declaration: {
- VD = TA.getAsDecl();
- QualType ArgType = TA.getParamTypeForDecl();
+ VD = Iter->getAsDecl();
+ QualType ArgType = Iter->getParamTypeForDecl();
QualType VDType = VD->getType();
if (ArgType->isPointerType() &&
Context.hasSameType(ArgType->getPointeeType(), VDType))
@@ -1265,13 +1238,62 @@ class TemplateDiff {
IsNullPtr = true;
return;
case TemplateArgument::Expression:
- // TODO: Sometimes, the desugared template argument Expr differs from
- // the sugared template argument Expr. It may be useful in the future
- // but for now, it is just discarded.
- if (!E)
- E = TA.getAsExpr();
- return;
+ E = Iter->getAsExpr();
+ break;
+ case TemplateArgument::Null:
+ case TemplateArgument::Type:
+ case TemplateArgument::Template:
+ case TemplateArgument::TemplateExpansion:
+ llvm_unreachable("TemplateArgument kind is not expected for NTTP");
+ case TemplateArgument::Pack:
+ llvm_unreachable("TemplateArgument kind should be handled elsewhere");
+ }
+ } else if (!Default->isParameterPack()) {
+ E = Default->getDefaultArgument().getArgument().getAsExpr();
}
+
+ if (!Iter.hasDesugaredTA())
+ return;
+
+ const TemplateArgument &TA = Iter.getDesugaredTA();
+ switch (TA.getKind()) {
+ case TemplateArgument::StructuralValue:
+ // FIXME: Diffing of structural values is not implemented.
+ // Just fall back to the expression.
+ return;
+ case TemplateArgument::Integral:
+ Value = TA.getAsIntegral();
+ HasInt = true;
+ IntType = TA.getIntegralType();
+ return;
+ case TemplateArgument::Declaration: {
+ VD = TA.getAsDecl();
+ QualType ArgType = TA.getParamTypeForDecl();
+ QualType VDType = VD->getType();
+ if (ArgType->isPointerType() &&
+ Context.hasSameType(ArgType->getPointeeType(), VDType))
+ NeedAddressOf = true;
+ return;
+ }
+ case TemplateArgument::NullPtr:
+ IsNullPtr = true;
+ return;
+ case TemplateArgument::Expression:
+ // TODO: Sometimes, the desugared template argument Expr differs from
+ // the sugared template argument Expr. It may be useful in the future
+ // but for now, it is just discarded.
+ if (!E)
+ E = TA.getAsExpr();
+ return;
+ case TemplateArgument::Null:
+ case TemplateArgument::Type:
+ case TemplateArgument::Template:
+ case TemplateArgument::TemplateExpansion:
+ llvm_unreachable("TemplateArgument kind is not expected for NTTP");
+ case TemplateArgument::Pack:
+ llvm_unreachable("TemplateArgument kind should be handled elsewhere");
+ }
+ llvm_unreachable("Unexpected TemplateArgument kind");
}
/// DiffNonTypes - Handles any template parameters not handled by DiffTypes
@@ -1914,6 +1936,11 @@ class TemplateDiff {
return;
}
+ if (E) {
+ PrintExpr(E);
+ return;
+ }
+
OS << "(no argument)";
}
diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp
index 9ff8e1e..cab5ee6 100644
--- a/clang/lib/AST/ASTImporter.cpp
+++ b/clang/lib/AST/ASTImporter.cpp
@@ -5917,11 +5917,11 @@ ASTNodeImporter::VisitTemplateTypeParmDecl(TemplateTypeParmDecl *D) {
}
if (D->hasDefaultArgument()) {
- Expected<TypeSourceInfo *> ToDefaultArgOrErr =
- import(D->getDefaultArgumentInfo());
+ Expected<TemplateArgumentLoc> ToDefaultArgOrErr =
+ import(D->getDefaultArgument());
if (!ToDefaultArgOrErr)
return ToDefaultArgOrErr.takeError();
- ToD->setDefaultArgument(*ToDefaultArgOrErr);
+ ToD->setDefaultArgument(ToD->getASTContext(), *ToDefaultArgOrErr);
}
return ToD;
@@ -5949,10 +5949,11 @@ ASTNodeImporter::VisitNonTypeTemplateParmDecl(NonTypeTemplateParmDecl *D) {
return ToD;
if (D->hasDefaultArgument()) {
- ExpectedExpr ToDefaultArgOrErr = import(D->getDefaultArgument());
+ Expected<TemplateArgumentLoc> ToDefaultArgOrErr =
+ import(D->getDefaultArgument());
if (!ToDefaultArgOrErr)
return ToDefaultArgOrErr.takeError();
- ToD->setDefaultArgument(*ToDefaultArgOrErr);
+ ToD->setDefaultArgument(Importer.getToContext(), *ToDefaultArgOrErr);
}
return ToD;
diff --git a/clang/lib/AST/DeclPrinter.cpp b/clang/lib/AST/DeclPrinter.cpp
index c586825..0cf4e64 100644
--- a/clang/lib/AST/DeclPrinter.cpp
+++ b/clang/lib/AST/DeclPrinter.cpp
@@ -1883,7 +1883,8 @@ void DeclPrinter::VisitTemplateTypeParmDecl(const TemplateTypeParmDecl *TTP) {
if (TTP->hasDefaultArgument()) {
Out << " = ";
- Out << TTP->getDefaultArgument().getAsString(Policy);
+ TTP->getDefaultArgument().getArgument().print(Policy, Out,
+ /*IncludeType=*/false);
}
}
@@ -1897,7 +1898,7 @@ void DeclPrinter::VisitNonTypeTemplateParmDecl(
if (NTTP->hasDefaultArgument()) {
Out << " = ";
- NTTP->getDefaultArgument()->printPretty(Out, nullptr, Policy, Indentation,
- "\n", &Context);
+ NTTP->getDefaultArgument().getArgument().print(Policy, Out,
+ /*IncludeType=*/false);
}
}
diff --git a/clang/lib/AST/DeclTemplate.cpp b/clang/lib/AST/DeclTemplate.cpp
index 26765a5..95ffd47 100644
--- a/clang/lib/AST/DeclTemplate.cpp
+++ b/clang/lib/AST/DeclTemplate.cpp
@@ -669,23 +669,30 @@ TemplateTypeParmDecl::CreateDeserialized(const ASTContext &C, GlobalDeclID ID,
}
SourceLocation TemplateTypeParmDecl::getDefaultArgumentLoc() const {
- return hasDefaultArgument()
- ? getDefaultArgumentInfo()->getTypeLoc().getBeginLoc()
- : SourceLocation();
+ return hasDefaultArgument() ? getDefaultArgument().getLocation()
+ : SourceLocation();
}
SourceRange TemplateTypeParmDecl::getSourceRange() const {
if (hasDefaultArgument() && !defaultArgumentWasInherited())
return SourceRange(getBeginLoc(),
- getDefaultArgumentInfo()->getTypeLoc().getEndLoc());
+ getDefaultArgument().getSourceRange().getEnd());
// TypeDecl::getSourceRange returns a range containing name location, which is
// wrong for unnamed template parameters. e.g:
// it will return <[[typename>]] instead of <[[typename]]>
- else if (getDeclName().isEmpty())
+ if (getDeclName().isEmpty())
return SourceRange(getBeginLoc());
return TypeDecl::getSourceRange();
}
+void TemplateTypeParmDecl::setDefaultArgument(
+ const ASTContext &C, const TemplateArgumentLoc &DefArg) {
+ if (DefArg.getArgument().isNull())
+ DefaultArgument.set(nullptr);
+ else
+ DefaultArgument.set(new (C) TemplateArgumentLoc(DefArg));
+}
+
unsigned TemplateTypeParmDecl::getDepth() const {
return getTypeForDecl()->castAs<TemplateTypeParmType>()->getDepth();
}
@@ -788,14 +795,21 @@ NonTypeTemplateParmDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID,
SourceRange NonTypeTemplateParmDecl::getSourceRange() const {
if (hasDefaultArgument() && !defaultArgumentWasInherited())
return SourceRange(getOuterLocStart(),
- getDefaultArgument()->getSourceRange().getEnd());
+ getDefaultArgument().getSourceRange().getEnd());
return DeclaratorDecl::getSourceRange();
}
SourceLocation NonTypeTemplateParmDecl::getDefaultArgumentLoc() const {
- return hasDefaultArgument()
- ? getDefaultArgument()->getSourceRange().getBegin()
- : SourceLocation();
+ return hasDefaultArgument() ? getDefaultArgument().getSourceRange().getBegin()
+ : SourceLocation();
+}
+
+void NonTypeTemplateParmDecl::setDefaultArgument(
+ const ASTContext &C, const TemplateArgumentLoc &DefArg) {
+ if (DefArg.getArgument().isNull())
+ DefaultArgument.set(nullptr);
+ else
+ DefaultArgument.set(new (C) TemplateArgumentLoc(DefArg));
}
//===----------------------------------------------------------------------===//
diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
index 859a3fa..6607727 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
@@ -1050,34 +1050,85 @@ bool ByteCodeExprGen<Emitter>::visitInitList(ArrayRef<const Expr *> Inits,
if (T->isRecordType()) {
const Record *R = getRecord(E->getType());
- if (Inits.size() == 1 && E->getType() == Inits[0]->getType()) {
+ if (Inits.size() == 1 && E->getType() == Inits[0]->getType())
return this->visitInitializer(Inits[0]);
+
+ auto initPrimitiveField = [=](const Record::Field *FieldToInit,
+ const Expr *Init, PrimType T) -> bool {
+ if (!this->visit(Init))
+ return false;
+
+ if (FieldToInit->isBitField()) {
+ if (!this->emitInitBitField(T, FieldToInit, E))
+ return false;
+ } else {
+ if (!this->emitInitField(T, FieldToInit->Offset, E))
+ return false;
+ }
+ return this->emitPopPtr(E);
+ };
+
+ auto initCompositeField = [=](const Record::Field *FieldToInit,
+ const Expr *Init) -> bool {
+ // Non-primitive case. Get a pointer to the field-to-initialize
+ // on the stack and recurse into visitInitializer().
+ if (!this->emitGetPtrField(FieldToInit->Offset, Init))
+ return false;
+ if (!this->visitInitializer(Init))
+ return false;
+ return this->emitPopPtr(E);
+ };
+
+ if (R->isUnion()) {
+ if (Inits.size() == 0) {
+ // Zero-initialize the first union field.
+ if (R->getNumFields() == 0)
+ return this->emitFinishInit(E);
+ const Record::Field *FieldToInit = R->getField(0u);
+ QualType FieldType = FieldToInit->Desc->getType();
+ if (std::optional<PrimType> T = classify(FieldType)) {
+ if (!this->visitZeroInitializer(*T, FieldType, E))
+ return false;
+ if (!this->emitInitField(*T, FieldToInit->Offset, E))
+ return false;
+ }
+ // FIXME: Non-primitive case?
+ } else {
+ const Expr *Init = Inits[0];
+ const FieldDecl *FToInit = nullptr;
+ if (const auto *ILE = dyn_cast<InitListExpr>(E))
+ FToInit = ILE->getInitializedFieldInUnion();
+ else
+ FToInit = cast<CXXParenListInitExpr>(E)->getInitializedFieldInUnion();
+
+ if (!this->emitDupPtr(E))
+ return false;
+
+ const Record::Field *FieldToInit = R->getField(FToInit);
+ if (std::optional<PrimType> T = classify(Init)) {
+ if (!initPrimitiveField(FieldToInit, Init, *T))
+ return false;
+ } else {
+ if (!initCompositeField(FieldToInit, Init))
+ return false;
+ }
+ }
+ return this->emitFinishInit(E);
}
+ assert(!R->isUnion());
unsigned InitIndex = 0;
for (const Expr *Init : Inits) {
// Skip unnamed bitfields.
while (InitIndex < R->getNumFields() &&
R->getField(InitIndex)->Decl->isUnnamedBitField())
++InitIndex;
-
if (!this->emitDupPtr(E))
return false;
if (std::optional<PrimType> T = classify(Init)) {
const Record::Field *FieldToInit = R->getField(InitIndex);
- if (!this->visit(Init))
- return false;
-
- if (FieldToInit->isBitField()) {
- if (!this->emitInitBitField(*T, FieldToInit, E))
- return false;
- } else {
- if (!this->emitInitField(*T, FieldToInit->Offset, E))
- return false;
- }
-
- if (!this->emitPopPtr(E))
+ if (!initPrimitiveField(FieldToInit, Init, *T))
return false;
++InitIndex;
} else {
@@ -1095,21 +1146,13 @@ bool ByteCodeExprGen<Emitter>::visitInitList(ArrayRef<const Expr *> Inits,
// into the Record's fields.
} else {
const Record::Field *FieldToInit = R->getField(InitIndex);
- // Non-primitive case. Get a pointer to the field-to-initialize
- // on the stack and recurse into visitInitializer().
- if (!this->emitGetPtrField(FieldToInit->Offset, Init))
- return false;
-
- if (!this->visitInitializer(Init))
- return false;
-
- if (!this->emitPopPtr(E))
+ if (!initCompositeField(FieldToInit, Init))
return false;
++InitIndex;
}
}
}
- return true;
+ return this->emitFinishInit(E);
}
if (T->isArrayType()) {
@@ -1133,7 +1176,7 @@ bool ByteCodeExprGen<Emitter>::visitInitList(ArrayRef<const Expr *> Inits,
}
}
- return true;
+ return this->emitFinishInit(E);
}
if (const auto *ComplexTy = E->getType()->getAs<ComplexType>()) {
@@ -3752,7 +3795,8 @@ bool ByteCodeExprGen<Emitter>::VisitDeclRefExpr(const DeclRefExpr *E) {
}
} else {
if (const auto *VD = dyn_cast<VarDecl>(D);
- VD && VD->getAnyInitializer() && VD->getType().isConstQualified()) {
+ VD && VD->getAnyInitializer() && VD->getType().isConstQualified() &&
+ !VD->isWeak()) {
if (!this->visitVarDecl(VD))
return false;
// Retry.
@@ -3763,6 +3807,8 @@ bool ByteCodeExprGen<Emitter>::VisitDeclRefExpr(const DeclRefExpr *E) {
if (std::optional<unsigned> I = P.getOrCreateDummy(D)) {
if (!this->emitGetPtrGlobal(*I, E))
return false;
+ if (E->getType()->isVoidType())
+ return true;
// Convert the dummy pointer to another pointer type if we have to.
if (PrimType PT = classifyPrim(E); PT != PT_Ptr) {
if (!this->emitDecayPtr(PT_Ptr, PT, E))
diff --git a/clang/lib/AST/Interp/Descriptor.cpp b/clang/lib/AST/Interp/Descriptor.cpp
index d046690..746b765 100644
--- a/clang/lib/AST/Interp/Descriptor.cpp
+++ b/clang/lib/AST/Interp/Descriptor.cpp
@@ -137,9 +137,8 @@ static void moveArrayDesc(Block *B, const std::byte *Src, std::byte *Dst,
}
static void initField(Block *B, std::byte *Ptr, bool IsConst, bool IsMutable,
- bool IsActive, const Descriptor *D,
+ bool IsActive, bool IsUnion, const Descriptor *D,
unsigned FieldOffset) {
- bool IsUnion = false; // FIXME
auto *Desc = reinterpret_cast<InlineDescriptor *>(Ptr + FieldOffset) - 1;
Desc->Offset = FieldOffset;
Desc->Desc = D;
@@ -174,7 +173,7 @@ static void initBase(Block *B, std::byte *Ptr, bool IsConst, bool IsMutable,
initBase(B, Ptr + FieldOffset, IsConst, IsMutable, IsActive, V.Desc,
V.Offset, false);
for (const auto &F : D->ElemRecord->fields())
- initField(B, Ptr + FieldOffset, IsConst, IsMutable, IsActive, F.Desc,
+ initField(B, Ptr + FieldOffset, IsConst, IsMutable, IsActive, IsUnion, F.Desc,
F.Offset);
// If this is initializing a virtual base, we do NOT want to consider its
@@ -193,7 +192,7 @@ static void ctorRecord(Block *B, std::byte *Ptr, bool IsConst, bool IsMutable,
for (const auto &V : D->ElemRecord->bases())
initBase(B, Ptr, IsConst, IsMutable, IsActive, V.Desc, V.Offset, false);
for (const auto &F : D->ElemRecord->fields())
- initField(B, Ptr, IsConst, IsMutable, IsActive, F.Desc, F.Offset);
+ initField(B, Ptr, IsConst, IsMutable, IsActive, D->ElemRecord->isUnion(), F.Desc, F.Offset);
for (const auto &V : D->ElemRecord->virtual_bases())
initBase(B, Ptr, IsConst, IsMutable, IsActive, V.Desc, V.Offset, true);
}
diff --git a/clang/lib/AST/Interp/EvaluationResult.cpp b/clang/lib/AST/Interp/EvaluationResult.cpp
index e92d686..150a793 100644
--- a/clang/lib/AST/Interp/EvaluationResult.cpp
+++ b/clang/lib/AST/Interp/EvaluationResult.cpp
@@ -101,6 +101,10 @@ static bool CheckFieldsInitialized(InterpState &S, SourceLocation Loc,
Pointer FieldPtr = BasePtr.atField(F.Offset);
QualType FieldType = F.Decl->getType();
+ // Don't check inactive union members.
+ if (R->isUnion() && !FieldPtr.isActive())
+ continue;
+
if (FieldType->isRecordType()) {
Result &= CheckFieldsInitialized(S, Loc, FieldPtr, FieldPtr.getRecord());
} else if (FieldType->isIncompleteArrayType()) {
diff --git a/clang/lib/AST/Interp/Interp.cpp b/clang/lib/AST/Interp/Interp.cpp
index 3e4da487e..145fa65 100644
--- a/clang/lib/AST/Interp/Interp.cpp
+++ b/clang/lib/AST/Interp/Interp.cpp
@@ -18,6 +18,7 @@
#include "clang/AST/ASTContext.h"
#include "clang/AST/ASTDiagnostic.h"
#include "clang/AST/CXXInheritance.h"
+#include "clang/AST/DeclObjC.h"
#include "clang/AST/Expr.h"
#include "clang/AST/ExprCXX.h"
#include "llvm/ADT/APSInt.h"
@@ -76,18 +77,15 @@ static bool diagnoseUnknownDecl(InterpState &S, CodePtr OpPC,
} else {
S.FFDiag(E);
}
- } else if (const auto *VD = dyn_cast<VarDecl>(D)) {
- if (!VD->getType().isConstQualified()) {
- diagnoseNonConstVariable(S, OpPC, VD);
- return false;
- }
-
- // const, but no initializer.
- if (!VD->getAnyInitializer()) {
- diagnoseMissingInitializer(S, OpPC, VD);
- return false;
- }
+ return false;
}
+
+ if (!D->getType().isConstQualified())
+ diagnoseNonConstVariable(S, OpPC, D);
+ else if (const auto *VD = dyn_cast<VarDecl>(D);
+ VD && !VD->getAnyInitializer())
+ diagnoseMissingInitializer(S, OpPC, VD);
+
return false;
}
@@ -104,6 +102,11 @@ static void diagnoseNonConstVariable(InterpState &S, CodePtr OpPC,
return;
}
+ // Rather random, but this is to match the diagnostic output of the current
+ // interpreter.
+ if (isa<ObjCIvarDecl>(VD))
+ return;
+
if (VD->getType()->isIntegralOrEnumerationType()) {
S.FFDiag(Loc, diag::note_constexpr_ltor_non_const_int, 1) << VD;
S.Note(VD->getLocation(), diag::note_declared_at);
@@ -454,16 +457,16 @@ bool CheckLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr) {
if (!CheckConstant(S, OpPC, Ptr))
return false;
- if (!CheckDummy(S, OpPC, Ptr))
+ if (!CheckDummy(S, OpPC, Ptr, AK_Read))
return false;
if (!CheckExtern(S, OpPC, Ptr))
return false;
if (!CheckRange(S, OpPC, Ptr, AK_Read))
return false;
- if (!CheckInitialized(S, OpPC, Ptr, AK_Read))
- return false;
if (!CheckActive(S, OpPC, Ptr, AK_Read))
return false;
+ if (!CheckInitialized(S, OpPC, Ptr, AK_Read))
+ return false;
if (!CheckTemporary(S, OpPC, Ptr, AK_Read))
return false;
if (!CheckMutable(S, OpPC, Ptr))
@@ -474,7 +477,7 @@ bool CheckLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr) {
bool CheckStore(InterpState &S, CodePtr OpPC, const Pointer &Ptr) {
if (!CheckLive(S, OpPC, Ptr, AK_Assign))
return false;
- if (!CheckDummy(S, OpPC, Ptr))
+ if (!CheckDummy(S, OpPC, Ptr, AK_Assign))
return false;
if (!CheckExtern(S, OpPC, Ptr))
return false;
@@ -657,7 +660,8 @@ bool CheckDeclRef(InterpState &S, CodePtr OpPC, const DeclRefExpr *DR) {
return diagnoseUnknownDecl(S, OpPC, D);
}
-bool CheckDummy(InterpState &S, CodePtr OpPC, const Pointer &Ptr) {
+bool CheckDummy(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
+ AccessKinds AK) {
if (!Ptr.isDummy())
return true;
@@ -666,7 +670,15 @@ bool CheckDummy(InterpState &S, CodePtr OpPC, const Pointer &Ptr) {
if (!D)
return false;
- return diagnoseUnknownDecl(S, OpPC, D);
+ if (AK == AK_Read || AK == AK_Increment || AK == AK_Decrement)
+ return diagnoseUnknownDecl(S, OpPC, D);
+
+ assert(AK == AK_Assign);
+ if (S.getLangOpts().CPlusPlus11) {
+ const SourceInfo &E = S.Current->getSource(OpPC);
+ S.FFDiag(E, diag::note_constexpr_modify_global);
+ }
+ return false;
}
bool CheckNonNullArgs(InterpState &S, CodePtr OpPC, const Function *F,
diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h
index bc2ca12..eca1792 100644
--- a/clang/lib/AST/Interp/Interp.h
+++ b/clang/lib/AST/Interp/Interp.h
@@ -56,7 +56,8 @@ bool CheckLive(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
AccessKinds AK);
/// Checks if a pointer is a dummy pointer.
-bool CheckDummy(InterpState &S, CodePtr OpPC, const Pointer &Ptr);
+bool CheckDummy(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
+ AccessKinds AK);
/// Checks if a pointer is null.
bool CheckNull(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
@@ -588,7 +589,7 @@ bool IncDecHelper(InterpState &S, CodePtr OpPC, const Pointer &Ptr) {
template <PrimType Name, class T = typename PrimConv<Name>::T>
bool Inc(InterpState &S, CodePtr OpPC) {
const Pointer &Ptr = S.Stk.pop<Pointer>();
- if (!CheckDummy(S, OpPC, Ptr))
+ if (!CheckDummy(S, OpPC, Ptr, AK_Increment))
return false;
if (!CheckInitialized(S, OpPC, Ptr, AK_Increment))
return false;
@@ -602,7 +603,7 @@ bool Inc(InterpState &S, CodePtr OpPC) {
template <PrimType Name, class T = typename PrimConv<Name>::T>
bool IncPop(InterpState &S, CodePtr OpPC) {
const Pointer &Ptr = S.Stk.pop<Pointer>();
- if (!CheckDummy(S, OpPC, Ptr))
+ if (!CheckDummy(S, OpPC, Ptr, AK_Increment))
return false;
if (!CheckInitialized(S, OpPC, Ptr, AK_Increment))
return false;
@@ -617,7 +618,7 @@ bool IncPop(InterpState &S, CodePtr OpPC) {
template <PrimType Name, class T = typename PrimConv<Name>::T>
bool Dec(InterpState &S, CodePtr OpPC) {
const Pointer &Ptr = S.Stk.pop<Pointer>();
- if (!CheckDummy(S, OpPC, Ptr))
+ if (!CheckDummy(S, OpPC, Ptr, AK_Decrement))
return false;
if (!CheckInitialized(S, OpPC, Ptr, AK_Decrement))
return false;
@@ -631,7 +632,7 @@ bool Dec(InterpState &S, CodePtr OpPC) {
template <PrimType Name, class T = typename PrimConv<Name>::T>
bool DecPop(InterpState &S, CodePtr OpPC) {
const Pointer &Ptr = S.Stk.pop<Pointer>();
- if (!CheckDummy(S, OpPC, Ptr))
+ if (!CheckDummy(S, OpPC, Ptr, AK_Decrement))
return false;
if (!CheckInitialized(S, OpPC, Ptr, AK_Decrement))
return false;
@@ -1335,16 +1336,19 @@ inline bool GetPtrThisBase(InterpState &S, CodePtr OpPC, uint32_t Off) {
inline bool FinishInitPop(InterpState &S, CodePtr OpPC) {
const Pointer &Ptr = S.Stk.pop<Pointer>();
- if (Ptr.canBeInitialized())
+ if (Ptr.canBeInitialized()) {
Ptr.initialize();
+ Ptr.activate();
+ }
return true;
}
inline bool FinishInit(InterpState &S, CodePtr OpPC) {
const Pointer &Ptr = S.Stk.peek<Pointer>();
-
- if (Ptr.canBeInitialized())
+ if (Ptr.canBeInitialized()) {
Ptr.initialize();
+ Ptr.activate();
+ }
return true;
}
@@ -1370,9 +1374,6 @@ inline bool GetPtrVirtBasePop(InterpState &S, CodePtr OpPC,
const Pointer &Ptr = S.Stk.pop<Pointer>();
if (!CheckNull(S, OpPC, Ptr, CSK_Base))
return false;
- if (Ptr.isDummy()) // FIXME: Once we have type info for dummy pointers, this
- // needs to go.
- return false;
return VirtBaseHelper(S, OpPC, D, Ptr);
}
@@ -1538,9 +1539,6 @@ inline bool Memcpy(InterpState &S, CodePtr OpPC) {
template <class T, ArithOp Op>
bool OffsetHelper(InterpState &S, CodePtr OpPC, const T &Offset,
const Pointer &Ptr) {
- if (!CheckRange(S, OpPC, Ptr, CSK_ArrayToPointer))
- return false;
-
// A zero offset does not change the pointer.
if (Offset.isZero()) {
S.Stk.push<Pointer>(Ptr);
@@ -1558,8 +1556,12 @@ bool OffsetHelper(InterpState &S, CodePtr OpPC, const T &Offset,
if (!CheckArray(S, OpPC, Ptr))
return false;
- uint64_t Index = Ptr.getIndex();
uint64_t MaxIndex = static_cast<uint64_t>(Ptr.getNumElems());
+ uint64_t Index;
+ if (Ptr.isOnePastEnd())
+ Index = MaxIndex;
+ else
+ Index = Ptr.getIndex();
bool Invalid = false;
// Helper to report an invalid offset, computed as APSInt.
diff --git a/clang/lib/AST/Interp/InterpBuiltin.cpp b/clang/lib/AST/Interp/InterpBuiltin.cpp
index 565c85b..00206d0 100644
--- a/clang/lib/AST/Interp/InterpBuiltin.cpp
+++ b/clang/lib/AST/Interp/InterpBuiltin.cpp
@@ -214,7 +214,7 @@ static bool interp__builtin_strlen(InterpState &S, CodePtr OpPC,
if (!CheckLive(S, OpPC, StrPtr, AK_Read))
return false;
- if (!CheckDummy(S, OpPC, StrPtr))
+ if (!CheckDummy(S, OpPC, StrPtr, AK_Read))
return false;
assert(StrPtr.getFieldDesc()->isPrimitiveArray());
diff --git a/clang/lib/AST/Interp/Pointer.cpp b/clang/lib/AST/Interp/Pointer.cpp
index ee8cedc..252f7ea 100644
--- a/clang/lib/AST/Interp/Pointer.cpp
+++ b/clang/lib/AST/Interp/Pointer.cpp
@@ -144,13 +144,18 @@ APValue Pointer::toAPValue() const {
// TODO: compute the offset into the object.
CharUnits Offset = CharUnits::Zero();
- bool IsOnePastEnd = isOnePastEnd();
// Build the path into the object.
Pointer Ptr = *this;
while (Ptr.isField() || Ptr.isArrayElement()) {
- if (Ptr.isArrayElement()) {
- Path.push_back(APValue::LValuePathEntry::ArrayIndex(Ptr.getIndex()));
+ if (Ptr.isArrayRoot()) {
+ Path.push_back(APValue::LValuePathEntry::ArrayIndex(0));
+ Ptr = Ptr.getBase();
+ } else if (Ptr.isArrayElement()) {
+ if (Ptr.isOnePastEnd())
+ Path.push_back(APValue::LValuePathEntry::ArrayIndex(Ptr.getArray().getNumElems()));
+ else
+ Path.push_back(APValue::LValuePathEntry::ArrayIndex(Ptr.getIndex()));
Ptr = Ptr.getArray();
} else {
// TODO: figure out if base is virtual
@@ -173,7 +178,7 @@ APValue Pointer::toAPValue() const {
// Just invert the order of the elements.
std::reverse(Path.begin(), Path.end());
- return APValue(Base, Offset, Path, IsOnePastEnd, /*IsNullPtr=*/false);
+ return APValue(Base, Offset, Path, /*IsOnePastEnd=*/false, /*IsNullPtr=*/false);
}
void Pointer::print(llvm::raw_ostream &OS) const {
@@ -346,6 +351,7 @@ std::optional<APValue> Pointer::toRValue(const Context &Ctx) const {
} else {
Ok &= Composite(FieldTy, FP, Value);
}
+ ActiveField = FP.getFieldDesc()->asFieldDecl();
break;
}
}
diff --git a/clang/lib/AST/Interp/Pointer.h b/clang/lib/AST/Interp/Pointer.h
index 3ade575..93ca754 100644
--- a/clang/lib/AST/Interp/Pointer.h
+++ b/clang/lib/AST/Interp/Pointer.h
@@ -314,12 +314,14 @@ public:
/// Returns the type of the innermost field.
QualType getType() const {
if (inPrimitiveArray() && Offset != asBlockPointer().Base) {
- // Unfortunately, complex types are not array types in clang, but they are
- // for us.
+ // Unfortunately, complex and vector types are not array types in clang,
+ // but they are for us.
if (const auto *AT = getFieldDesc()->getType()->getAsArrayTypeUnsafe())
return AT->getElementType();
if (const auto *CT = getFieldDesc()->getType()->getAs<ComplexType>())
return CT->getElementType();
+ if (const auto *CT = getFieldDesc()->getType()->getAs<VectorType>())
+ return CT->getElementType();
}
return getFieldDesc()->getType();
}
@@ -535,9 +537,6 @@ public:
if (isZero())
return 0;
- if (isElementPastEnd())
- return 1;
-
// narrow()ed element in a composite array.
if (asBlockPointer().Base > sizeof(InlineDescriptor) &&
asBlockPointer().Base == Offset)
diff --git a/clang/lib/AST/Interp/Record.cpp b/clang/lib/AST/Interp/Record.cpp
index 6a0a28b..8ded765 100644
--- a/clang/lib/AST/Interp/Record.cpp
+++ b/clang/lib/AST/Interp/Record.cpp
@@ -16,7 +16,7 @@ Record::Record(const RecordDecl *Decl, BaseList &&SrcBases,
FieldList &&SrcFields, VirtualBaseList &&SrcVirtualBases,
unsigned VirtualSize, unsigned BaseSize)
: Decl(Decl), Bases(std::move(SrcBases)), Fields(std::move(SrcFields)),
- BaseSize(BaseSize), VirtualSize(VirtualSize) {
+ BaseSize(BaseSize), VirtualSize(VirtualSize), IsUnion(Decl->isUnion()) {
for (Base &V : SrcVirtualBases)
VirtualBases.push_back({ V.Decl, V.Offset + BaseSize, V.Desc, V.R });
diff --git a/clang/lib/AST/Interp/Record.h b/clang/lib/AST/Interp/Record.h
index cf0480b..83e15b1 100644
--- a/clang/lib/AST/Interp/Record.h
+++ b/clang/lib/AST/Interp/Record.h
@@ -53,7 +53,7 @@ public:
/// Returns the name of the underlying declaration.
const std::string getName() const;
/// Checks if the record is a union.
- bool isUnion() const { return getDecl()->isUnion(); }
+ bool isUnion() const { return IsUnion; }
/// Returns the size of the record.
unsigned getSize() const { return BaseSize; }
/// Returns the full size of the record, including records.
@@ -132,6 +132,8 @@ private:
unsigned BaseSize;
/// Size of all virtual bases.
unsigned VirtualSize;
+ /// If this record is a union.
+ bool IsUnion;
};
} // namespace interp
diff --git a/clang/lib/AST/JSONNodeDumper.cpp b/clang/lib/AST/JSONNodeDumper.cpp
index 4260847..3bbb3a9 100644
--- a/clang/lib/AST/JSONNodeDumper.cpp
+++ b/clang/lib/AST/JSONNodeDumper.cpp
@@ -1028,7 +1028,7 @@ void JSONNodeDumper::VisitTemplateTypeParmDecl(const TemplateTypeParmDecl *D) {
if (D->hasDefaultArgument())
JOS.attributeObject("defaultArg", [=] {
- Visit(D->getDefaultArgument(), SourceRange(),
+ Visit(D->getDefaultArgument().getArgument(), SourceRange(),
D->getDefaultArgStorage().getInheritedFrom(),
D->defaultArgumentWasInherited() ? "inherited from" : "previous");
});
@@ -1044,7 +1044,7 @@ void JSONNodeDumper::VisitNonTypeTemplateParmDecl(
if (D->hasDefaultArgument())
JOS.attributeObject("defaultArg", [=] {
- Visit(D->getDefaultArgument(), SourceRange(),
+ Visit(D->getDefaultArgument().getArgument(), SourceRange(),
D->getDefaultArgStorage().getInheritedFrom(),
D->defaultArgumentWasInherited() ? "inherited from" : "previous");
});
diff --git a/clang/lib/AST/ODRDiagsEmitter.cpp b/clang/lib/AST/ODRDiagsEmitter.cpp
index 5b1cdc1..37f0f68 100644
--- a/clang/lib/AST/ODRDiagsEmitter.cpp
+++ b/clang/lib/AST/ODRDiagsEmitter.cpp
@@ -1409,13 +1409,15 @@ bool ODRDiagsEmitter::diagnoseMismatch(
}
if (HasFirstDefaultArgument && HasSecondDefaultArgument) {
- QualType FirstType = FirstTTPD->getDefaultArgument();
- QualType SecondType = SecondTTPD->getDefaultArgument();
- if (computeODRHash(FirstType) != computeODRHash(SecondType)) {
+ TemplateArgument FirstTA =
+ FirstTTPD->getDefaultArgument().getArgument();
+ TemplateArgument SecondTA =
+ SecondTTPD->getDefaultArgument().getArgument();
+ if (computeODRHash(FirstTA) != computeODRHash(SecondTA)) {
DiagTemplateError(FunctionTemplateParameterDifferentDefaultArgument)
- << (i + 1) << FirstType;
+ << (i + 1) << FirstTA;
DiagTemplateNote(FunctionTemplateParameterDifferentDefaultArgument)
- << (i + 1) << SecondType;
+ << (i + 1) << SecondTA;
return true;
}
}
@@ -1521,8 +1523,11 @@ bool ODRDiagsEmitter::diagnoseMismatch(
}
if (HasFirstDefaultArgument && HasSecondDefaultArgument) {
- Expr *FirstDefaultArgument = FirstNTTPD->getDefaultArgument();
- Expr *SecondDefaultArgument = SecondNTTPD->getDefaultArgument();
+ TemplateArgument FirstDefaultArgument =
+ FirstNTTPD->getDefaultArgument().getArgument();
+ TemplateArgument SecondDefaultArgument =
+ SecondNTTPD->getDefaultArgument().getArgument();
+
if (computeODRHash(FirstDefaultArgument) !=
computeODRHash(SecondDefaultArgument)) {
DiagTemplateError(FunctionTemplateParameterDifferentDefaultArgument)
diff --git a/clang/lib/AST/ODRHash.cpp b/clang/lib/AST/ODRHash.cpp
index 6f04739..246e562 100644
--- a/clang/lib/AST/ODRHash.cpp
+++ b/clang/lib/AST/ODRHash.cpp
@@ -462,7 +462,7 @@ public:
D->hasDefaultArgument() && !D->defaultArgumentWasInherited();
Hash.AddBoolean(hasDefaultArgument);
if (hasDefaultArgument) {
- AddTemplateArgument(D->getDefaultArgument());
+ AddTemplateArgument(D->getDefaultArgument().getArgument());
}
Hash.AddBoolean(D->isParameterPack());
@@ -480,7 +480,7 @@ public:
D->hasDefaultArgument() && !D->defaultArgumentWasInherited();
Hash.AddBoolean(hasDefaultArgument);
if (hasDefaultArgument) {
- AddStmt(D->getDefaultArgument());
+ AddTemplateArgument(D->getDefaultArgument().getArgument());
}
Hash.AddBoolean(D->isParameterPack());
diff --git a/clang/lib/AST/ParentMap.cpp b/clang/lib/AST/ParentMap.cpp
index 3d6a1cc..534793b 100644
--- a/clang/lib/AST/ParentMap.cpp
+++ b/clang/lib/AST/ParentMap.cpp
@@ -97,6 +97,22 @@ static void BuildParentMap(MapTy& M, Stmt* S,
BuildParentMap(M, SubStmt, OVMode);
}
break;
+ case Stmt::CXXDefaultArgExprClass:
+ if (auto *Arg = dyn_cast<CXXDefaultArgExpr>(S)) {
+ if (Arg->hasRewrittenInit()) {
+ M[Arg->getExpr()] = S;
+ BuildParentMap(M, Arg->getExpr(), OVMode);
+ }
+ }
+ break;
+ case Stmt::CXXDefaultInitExprClass:
+ if (auto *Init = dyn_cast<CXXDefaultInitExpr>(S)) {
+ if (Init->hasRewrittenInit()) {
+ M[Init->getExpr()] = S;
+ BuildParentMap(M, Init->getExpr(), OVMode);
+ }
+ }
+ break;
default:
for (Stmt *SubStmt : S->children()) {
if (SubStmt) {
diff --git a/clang/lib/AST/TemplateBase.cpp b/clang/lib/AST/TemplateBase.cpp
index 3310d7d..a7ee973 100644
--- a/clang/lib/AST/TemplateBase.cpp
+++ b/clang/lib/AST/TemplateBase.cpp
@@ -538,9 +538,19 @@ void TemplateArgument::print(const PrintingPolicy &Policy, raw_ostream &Out,
Out << "nullptr";
break;
- case Template:
- getAsTemplate().print(Out, Policy, TemplateName::Qualified::Fully);
+ case Template: {
+ TemplateName TN = getAsTemplate();
+ if (const auto *TD = TN.getAsTemplateDecl();
+ TD && TD->getDeclName().isEmpty()) {
+ assert(isa<TemplateTemplateParmDecl>(TD) &&
+ "Unexpected anonymous template");
+ const auto *TTP = cast<TemplateTemplateParmDecl>(TD);
+ Out << "template-parameter-" << TTP->getDepth() << "-" << TTP->getIndex();
+ } else {
+ TN.print(Out, Policy, TemplateName::Qualified::Fully);
+ }
break;
+ }
case TemplateExpansion:
getAsTemplateOrTemplatePattern().print(Out, Policy);
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index 3b90b82..04f105c 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -632,6 +632,16 @@ bool Type::isStructureType() const {
return false;
}
+bool Type::isStructureTypeWithFlexibleArrayMember() const {
+ const auto *RT = getAs<RecordType>();
+ if (!RT)
+ return false;
+ const auto *Decl = RT->getDecl();
+ if (!Decl->isStruct())
+ return false;
+ return Decl->hasFlexibleArrayMember();
+}
+
bool Type::isObjCBoxableRecordType() const {
if (const auto *RT = getAs<RecordType>())
return RT->getDecl()->hasAttr<ObjCBoxableAttr>();
diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp
index 87f0a87..5ed56b3 100644
--- a/clang/lib/AST/TypePrinter.cpp
+++ b/clang/lib/AST/TypePrinter.cpp
@@ -2273,16 +2273,17 @@ bool clang::isSubstitutedDefaultArgument(ASTContext &Ctx, TemplateArgument Arg,
if (auto *TTPD = dyn_cast<TemplateTypeParmDecl>(Param)) {
return TTPD->hasDefaultArgument() &&
- isSubstitutedTemplateArgument(Ctx, Arg, TTPD->getDefaultArgument(),
- Args, Depth);
+ isSubstitutedTemplateArgument(
+ Ctx, Arg, TTPD->getDefaultArgument().getArgument(), Args, Depth);
} else if (auto *TTPD = dyn_cast<TemplateTemplateParmDecl>(Param)) {
return TTPD->hasDefaultArgument() &&
isSubstitutedTemplateArgument(
Ctx, Arg, TTPD->getDefaultArgument().getArgument(), Args, Depth);
} else if (auto *NTTPD = dyn_cast<NonTypeTemplateParmDecl>(Param)) {
return NTTPD->hasDefaultArgument() &&
- isSubstitutedTemplateArgument(Ctx, Arg, NTTPD->getDefaultArgument(),
- Args, Depth);
+ isSubstitutedTemplateArgument(
+ Ctx, Arg, NTTPD->getDefaultArgument().getArgument(), Args,
+ Depth);
}
return false;
}
diff --git a/clang/lib/Analysis/CFG.cpp b/clang/lib/Analysis/CFG.cpp
index 64e6155..0231725 100644
--- a/clang/lib/Analysis/CFG.cpp
+++ b/clang/lib/Analysis/CFG.cpp
@@ -556,6 +556,10 @@ public:
private:
// Visitors to walk an AST and construct the CFG.
+ CFGBlock *VisitCXXDefaultArgExpr(CXXDefaultArgExpr *Default,
+ AddStmtChoice asc);
+ CFGBlock *VisitCXXDefaultInitExpr(CXXDefaultInitExpr *Default,
+ AddStmtChoice asc);
CFGBlock *VisitInitListExpr(InitListExpr *ILE, AddStmtChoice asc);
CFGBlock *VisitAddrLabelExpr(AddrLabelExpr *A, AddStmtChoice asc);
CFGBlock *VisitAttributedStmt(AttributedStmt *A, AddStmtChoice asc);
@@ -2254,16 +2258,10 @@ CFGBlock *CFGBuilder::Visit(Stmt * S, AddStmtChoice asc,
asc, ExternallyDestructed);
case Stmt::CXXDefaultArgExprClass:
+ return VisitCXXDefaultArgExpr(cast<CXXDefaultArgExpr>(S), asc);
+
case Stmt::CXXDefaultInitExprClass:
- // FIXME: The expression inside a CXXDefaultArgExpr is owned by the
- // called function's declaration, not by the caller. If we simply add
- // this expression to the CFG, we could end up with the same Expr
- // appearing multiple times (PR13385).
- //
- // It's likewise possible for multiple CXXDefaultInitExprs for the same
- // expression to be used in the same function (through aggregate
- // initialization).
- return VisitStmt(S, asc);
+ return VisitCXXDefaultInitExpr(cast<CXXDefaultInitExpr>(S), asc);
case Stmt::CXXBindTemporaryExprClass:
return VisitCXXBindTemporaryExpr(cast<CXXBindTemporaryExpr>(S), asc);
@@ -2433,6 +2431,40 @@ CFGBlock *CFGBuilder::VisitChildren(Stmt *S) {
return B;
}
+CFGBlock *CFGBuilder::VisitCXXDefaultArgExpr(CXXDefaultArgExpr *Arg,
+ AddStmtChoice asc) {
+ if (Arg->hasRewrittenInit()) {
+ if (asc.alwaysAdd(*this, Arg)) {
+ autoCreateBlock();
+ appendStmt(Block, Arg);
+ }
+ return VisitStmt(Arg->getExpr(), asc);
+ }
+
+ // We can't add the default argument if it's not rewritten because the
+ // expression inside a CXXDefaultArgExpr is owned by the called function's
+ // declaration, not by the caller, so we could end up with the same expression
+ // appearing multiple times.
+ return VisitStmt(Arg, asc);
+}
+
+CFGBlock *CFGBuilder::VisitCXXDefaultInitExpr(CXXDefaultInitExpr *Init,
+ AddStmtChoice asc) {
+ if (Init->hasRewrittenInit()) {
+ if (asc.alwaysAdd(*this, Init)) {
+ autoCreateBlock();
+ appendStmt(Block, Init);
+ }
+ return VisitStmt(Init->getExpr(), asc);
+ }
+
+ // We can't add the default initializer if it's not rewritten because multiple
+ // CXXDefaultInitExprs for the same sub-expression can be used in the same
+ // function (through aggregate initialization); we could end up with the same
+ // expression appearing multiple times.
+ return VisitStmt(Init, asc);
+}
+
CFGBlock *CFGBuilder::VisitInitListExpr(InitListExpr *ILE, AddStmtChoice asc) {
if (asc.alwaysAdd(*this, ILE)) {
autoCreateBlock();
diff --git a/clang/lib/Basic/FileManager.cpp b/clang/lib/Basic/FileManager.cpp
index 143c043..1dc51de 100644
--- a/clang/lib/Basic/FileManager.cpp
+++ b/clang/lib/Basic/FileManager.cpp
@@ -82,6 +82,22 @@ getDirectoryFromFile(FileManager &FileMgr, StringRef Filename,
return FileMgr.getDirectoryRef(DirName, CacheFailure);
}
+DirectoryEntry *&FileManager::getRealDirEntry(const llvm::vfs::Status &Status) {
+ assert(Status.isDirectory() && "The directory should exist!");
+ // See if we have already opened a directory with the
+ // same inode (this occurs on Unix-like systems when one dir is
+ // symlinked to another, for example) or the same path (on
+ // Windows).
+ DirectoryEntry *&UDE = UniqueRealDirs[Status.getUniqueID()];
+
+ if (!UDE) {
+ // We don't have this directory yet, add it. We use the string
+ // key from the SeenDirEntries map as the string.
+ UDE = new (DirsAlloc.Allocate()) DirectoryEntry();
+ }
+ return UDE;
+}
+
/// Add all ancestors of the given path (pointing to either a file or
/// a directory) as virtual directories.
void FileManager::addAncestorsAsVirtualDirs(StringRef Path) {
@@ -99,10 +115,21 @@ void FileManager::addAncestorsAsVirtualDirs(StringRef Path) {
if (NamedDirEnt.second)
return;
- // Add the virtual directory to the cache.
- auto *UDE = new (DirsAlloc.Allocate()) DirectoryEntry();
- NamedDirEnt.second = *UDE;
- VirtualDirectoryEntries.push_back(UDE);
+ // Check to see if the directory exists.
+ llvm::vfs::Status Status;
+ auto statError =
+ getStatValue(DirName, Status, false, nullptr /*directory lookup*/);
+ if (statError) {
+ // There's no real directory at the given path.
+ // Add the virtual directory to the cache.
+ auto *UDE = new (DirsAlloc.Allocate()) DirectoryEntry();
+ NamedDirEnt.second = *UDE;
+ VirtualDirectoryEntries.push_back(UDE);
+ } else {
+ // There is a real directory at the given path.
+ DirectoryEntry *&UDE = getRealDirEntry(Status);
+ NamedDirEnt.second = *UDE;
+ }
// Recursively add the other ancestors.
addAncestorsAsVirtualDirs(DirName);
@@ -162,17 +189,8 @@ FileManager::getDirectoryRef(StringRef DirName, bool CacheFailure) {
return llvm::errorCodeToError(statError);
}
- // It exists. See if we have already opened a directory with the
- // same inode (this occurs on Unix-like systems when one dir is
- // symlinked to another, for example) or the same path (on
- // Windows).
- DirectoryEntry *&UDE = UniqueRealDirs[Status.getUniqueID()];
-
- if (!UDE) {
- // We don't have this directory yet, add it. We use the string
- // key from the SeenDirEntries map as the string.
- UDE = new (DirsAlloc.Allocate()) DirectoryEntry();
- }
+ // It exists.
+ DirectoryEntry *&UDE = getRealDirEntry(Status);
NamedDirEnt.second = *UDE;
return DirectoryEntryRef(NamedDirEnt);
diff --git a/clang/lib/Basic/Targets/Mips.cpp b/clang/lib/Basic/Targets/Mips.cpp
index 3a65f53..174bc9d 100644
--- a/clang/lib/Basic/Targets/Mips.cpp
+++ b/clang/lib/Basic/Targets/Mips.cpp
@@ -273,6 +273,34 @@ bool MipsTargetInfo::validateTarget(DiagnosticsEngine &Diags) const {
Diags.Report(diag::err_mips_fp64_req) << "-mfp64";
return false;
}
+ // FPXX requires mips2+
+ if (FPMode == FPXX && CPU == "mips1") {
+ Diags.Report(diag::err_opt_not_valid_with_opt) << "-mfpxx" << CPU;
+ return false;
+ }
+ // -mmsa together with -msoft-float does not make sense
+ if (FloatABI == SoftFloat && HasMSA) {
+ Diags.Report(diag::err_opt_not_valid_with_opt) << "-msoft-float"
+ << "-mmsa";
+ return false;
+ }
+ // Option -mmsa is permitted on MIPS32 only if revision 2 or higher is present
+ if (HasMSA && (CPU == "mips1" || CPU == "mips2" || getISARev() < 2) &&
+ ABI == "o32") {
+ Diags.Report(diag::err_mips_fp64_req) << "-mmsa";
+ return false;
+ }
+ // MSA requires FP64
+ if (FPMode == FPXX && HasMSA) {
+ Diags.Report(diag::err_opt_not_valid_with_opt) << "-mfpxx"
+ << "-mmsa";
+ return false;
+ }
+ if (FPMode == FP32 && HasMSA) {
+ Diags.Report(diag::err_opt_not_valid_with_opt) << "-mfp32"
+ << "-mmsa";
+ return false;
+ }
return true;
}
diff --git a/clang/lib/Basic/Targets/WebAssembly.h b/clang/lib/Basic/Targets/WebAssembly.h
index 4db9786..e4a449d 100644
--- a/clang/lib/Basic/Targets/WebAssembly.h
+++ b/clang/lib/Basic/Targets/WebAssembly.h
@@ -90,6 +90,9 @@ public:
StringRef getABI() const override;
bool setABI(const std::string &Name) override;
+ bool useFP16ConversionIntrinsics() const override {
+ return !HasHalfPrecision;
+ }
protected:
void getTargetDefines(const LangOptions &Opts,
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index b823eaf..3a30cff 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -310,15 +310,9 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasAVX512VNNI = true;
} else if (Feature == "+avx512bf16") {
HasAVX512BF16 = true;
- } else if (Feature == "+avx512er") {
- HasAVX512ER = true;
- Diags.Report(diag::warn_knl_knm_isa_support_removed);
} else if (Feature == "+avx512fp16") {
HasAVX512FP16 = true;
HasLegalHalfType = true;
- } else if (Feature == "+avx512pf") {
- HasAVX512PF = true;
- Diags.Report(diag::warn_knl_knm_isa_support_removed);
} else if (Feature == "+avx512dq") {
HasAVX512DQ = true;
} else if (Feature == "+avx512bitalg") {
@@ -375,9 +369,6 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasWBNOINVD = true;
} else if (Feature == "+prefetchi") {
HasPREFETCHI = true;
- } else if (Feature == "+prefetchwt1") {
- HasPREFETCHWT1 = true;
- Diags.Report(diag::warn_knl_knm_isa_support_removed);
} else if (Feature == "+clzero") {
HasCLZERO = true;
} else if (Feature == "+cldemote") {
@@ -840,12 +831,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
Builder.defineMacro("__AVX512VNNI__");
if (HasAVX512BF16)
Builder.defineMacro("__AVX512BF16__");
- if (HasAVX512ER)
- Builder.defineMacro("__AVX512ER__");
if (HasAVX512FP16)
Builder.defineMacro("__AVX512FP16__");
- if (HasAVX512PF)
- Builder.defineMacro("__AVX512PF__");
if (HasAVX512DQ)
Builder.defineMacro("__AVX512DQ__");
if (HasAVX512BITALG)
@@ -897,8 +884,6 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
Builder.defineMacro("__SM4__");
if (HasPREFETCHI)
Builder.defineMacro("__PREFETCHI__");
- if (HasPREFETCHWT1)
- Builder.defineMacro("__PREFETCHWT1__");
if (HasCLZERO)
Builder.defineMacro("__CLZERO__");
if (HasKL)
@@ -1084,9 +1069,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
.Case("avx512vpopcntdq", true)
.Case("avx512vnni", true)
.Case("avx512bf16", true)
- .Case("avx512er", true)
.Case("avx512fp16", true)
- .Case("avx512pf", true)
.Case("avx512dq", true)
.Case("avx512bitalg", true)
.Case("avx512bw", true)
@@ -1134,7 +1117,6 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
.Case("pku", true)
.Case("popcnt", true)
.Case("prefetchi", true)
- .Case("prefetchwt1", true)
.Case("prfchw", true)
.Case("ptwrite", true)
.Case("raoint", true)
@@ -1201,9 +1183,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
.Case("avx512vpopcntdq", HasAVX512VPOPCNTDQ)
.Case("avx512vnni", HasAVX512VNNI)
.Case("avx512bf16", HasAVX512BF16)
- .Case("avx512er", HasAVX512ER)
.Case("avx512fp16", HasAVX512FP16)
- .Case("avx512pf", HasAVX512PF)
.Case("avx512dq", HasAVX512DQ)
.Case("avx512bitalg", HasAVX512BITALG)
.Case("avx512bw", HasAVX512BW)
@@ -1253,7 +1233,6 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
.Case("pku", HasPKU)
.Case("popcnt", HasPOPCNT)
.Case("prefetchi", HasPREFETCHI)
- .Case("prefetchwt1", HasPREFETCHWT1)
.Case("prfchw", HasPRFCHW)
.Case("ptwrite", HasPTWRITE)
.Case("raoint", HasRAOINT)
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index 6a0a6cb..0633b7e 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -103,8 +103,6 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
bool HasAVX512VNNI = false;
bool HasAVX512FP16 = false;
bool HasAVX512BF16 = false;
- bool HasAVX512ER = false;
- bool HasAVX512PF = false;
bool HasAVX512DQ = false;
bool HasAVX512BITALG = false;
bool HasAVX512BW = false;
@@ -136,7 +134,6 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
bool HasCLWB = false;
bool HasMOVBE = false;
bool HasPREFETCHI = false;
- bool HasPREFETCHWT1 = false;
bool HasRDPID = false;
bool HasRDPRU = false;
bool HasRetpolineExternalThunk = false;
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index ba94bf8..0549afa 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -21230,6 +21230,17 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_storef16_f32);
return Builder.CreateCall(Callee, {Val, Addr});
}
+ case WebAssembly::BI__builtin_wasm_splat_f16x8: {
+ Value *Val = EmitScalarExpr(E->getArg(0));
+ Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_splat_f16x8);
+ return Builder.CreateCall(Callee, {Val});
+ }
+ case WebAssembly::BI__builtin_wasm_extract_lane_f16x8: {
+ Value *Vector = EmitScalarExpr(E->getArg(0));
+ Value *Index = EmitScalarExpr(E->getArg(1));
+ Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_extract_lane_f16x8);
+ return Builder.CreateCall(Callee, {Vector, Index});
+ }
case WebAssembly::BI__builtin_wasm_table_get: {
assert(E->getArg(0)->getType()->isArrayType());
Value *Table = EmitArrayToPointerDecay(E->getArg(0)).emitRawPointer(*this);
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index cd1c48b..d6478cc 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -317,8 +317,8 @@ pushTemporaryCleanup(CodeGenFunction &CGF, const MaterializeTemporaryExpr *M,
CleanupKind CleanupKind;
if (Lifetime == Qualifiers::OCL_Strong) {
const ValueDecl *VD = M->getExtendingDecl();
- bool Precise =
- VD && isa<VarDecl>(VD) && VD->hasAttr<ObjCPreciseLifetimeAttr>();
+ bool Precise = isa_and_nonnull<VarDecl>(VD) &&
+ VD->hasAttr<ObjCPreciseLifetimeAttr>();
CleanupKind = CGF.getARCCleanupKind();
Destroy = Precise ? &CodeGenFunction::destroyARCStrongPrecise
: &CodeGenFunction::destroyARCStrongImprecise;
@@ -4180,7 +4180,7 @@ LValue CodeGenFunction::EmitArraySubscriptExpr(const ArraySubscriptExpr *E,
// If the base is a vector type, then we are forming a vector element lvalue
// with this subscript.
- if (E->getBase()->getType()->isVectorType() &&
+ if (E->getBase()->getType()->isSubscriptableVectorType() &&
!isa<ExtVectorElementExpr>(E->getBase())) {
// Emit the vector as an lvalue to get its address.
LValue LHS = EmitLValue(E->getBase());
@@ -4676,7 +4676,8 @@ LValue CodeGenFunction::EmitMemberExpr(const MemberExpr *E) {
LValue CodeGenFunction::EmitLValueForLambdaField(const FieldDecl *Field,
llvm::Value *ThisValue) {
bool HasExplicitObjectParameter = false;
- if (const auto *MD = dyn_cast_if_present<CXXMethodDecl>(CurCodeDecl)) {
+ const auto *MD = dyn_cast_if_present<CXXMethodDecl>(CurCodeDecl);
+ if (MD) {
HasExplicitObjectParameter = MD->isExplicitObjectMemberFunction();
assert(MD->getParent()->isLambda());
assert(MD->getParent() == Field->getParent());
@@ -4693,6 +4694,17 @@ LValue CodeGenFunction::EmitLValueForLambdaField(const FieldDecl *Field,
else
LambdaLV = MakeAddrLValue(AddrOfExplicitObject,
D->getType().getNonReferenceType());
+
+ // Make sure we have an lvalue to the lambda itself and not a derived class.
+ auto *ThisTy = D->getType().getNonReferenceType()->getAsCXXRecordDecl();
+ auto *LambdaTy = cast<CXXRecordDecl>(Field->getParent());
+ if (ThisTy != LambdaTy) {
+ const CXXCastPath &BasePathArray = getContext().LambdaCastPaths.at(MD);
+ Address Base = GetAddressOfBaseClass(
+ LambdaLV.getAddress(), ThisTy, BasePathArray.begin(),
+ BasePathArray.end(), /*NullCheckValue=*/false, SourceLocation());
+ LambdaLV = MakeAddrLValue(Base, QualType{LambdaTy->getTypeForDecl(), 0});
+ }
} else {
QualType LambdaTagType = getContext().getTagDeclType(Field->getParent());
LambdaLV = MakeNaturalAlignAddrLValue(ThisValue, LambdaTagType);
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index eac5ef3..6410f9e 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -142,7 +142,7 @@ public:
/// of used expression from loop statement.
class OMPLoopScope : public CodeGenFunction::RunCleanupsScope {
void emitPreInitStmt(CodeGenFunction &CGF, const OMPLoopBasedDirective &S) {
- const DeclStmt *PreInits;
+ const Stmt *PreInits;
CodeGenFunction::OMPMapVars PreCondVars;
if (auto *LD = dyn_cast<OMPLoopDirective>(&S)) {
llvm::DenseSet<const VarDecl *> EmittedAsPrivate;
@@ -182,17 +182,34 @@ class OMPLoopScope : public CodeGenFunction::RunCleanupsScope {
}
return false;
});
- PreInits = cast_or_null<DeclStmt>(LD->getPreInits());
+ PreInits = LD->getPreInits();
} else if (const auto *Tile = dyn_cast<OMPTileDirective>(&S)) {
- PreInits = cast_or_null<DeclStmt>(Tile->getPreInits());
+ PreInits = Tile->getPreInits();
} else if (const auto *Unroll = dyn_cast<OMPUnrollDirective>(&S)) {
- PreInits = cast_or_null<DeclStmt>(Unroll->getPreInits());
+ PreInits = Unroll->getPreInits();
} else {
llvm_unreachable("Unknown loop-based directive kind.");
}
if (PreInits) {
- for (const auto *I : PreInits->decls())
- CGF.EmitVarDecl(cast<VarDecl>(*I));
+ // CompoundStmts and DeclStmts are used as lists of PreInit statements and
+ // declarations. Since declarations must be visible to the statements that
+ // follow and use them, unpack the CompoundStmt they are nested in.
+ SmallVector<const Stmt *> PreInitStmts;
+ if (auto *PreInitCompound = dyn_cast<CompoundStmt>(PreInits))
+ llvm::append_range(PreInitStmts, PreInitCompound->body());
+ else
+ PreInitStmts.push_back(PreInits);
+
+ for (const Stmt *S : PreInitStmts) {
+ // EmitStmt skips any OMPCapturedExprDecls, but they need to be emitted
+ // here.
+ if (auto *PreInitDecl = dyn_cast<DeclStmt>(S)) {
+ for (Decl *I : PreInitDecl->decls())
+ CGF.EmitVarDecl(cast<VarDecl>(*I));
+ continue;
+ }
+ CGF.EmitStmt(S);
+ }
}
PreCondVars.restore(CGF);
}
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 227813a..e4774a5 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -4150,7 +4150,7 @@ llvm::GlobalValue::LinkageTypes getMultiversionLinkage(CodeGenModule &CGM,
}
static FunctionDecl *createDefaultTargetVersionFrom(const FunctionDecl *FD) {
- DeclContext *DeclCtx = FD->getASTContext().getTranslationUnitDecl();
+ auto *DeclCtx = const_cast<DeclContext *>(FD->getDeclContext());
TypeSourceInfo *TInfo = FD->getTypeSourceInfo();
StorageClass SC = FD->getStorageClass();
DeclarationName Name = FD->getNameInfo().getName();
@@ -5740,15 +5740,17 @@ CodeGenModule::getLLVMLinkageVarDefinition(const VarDecl *VD) {
static void replaceUsesOfNonProtoConstant(llvm::Constant *old,
llvm::Function *newFn) {
// Fast path.
- if (old->use_empty()) return;
+ if (old->use_empty())
+ return;
llvm::Type *newRetTy = newFn->getReturnType();
- SmallVector<llvm::Value*, 4> newArgs;
+ SmallVector<llvm::Value *, 4> newArgs;
+
+ SmallVector<llvm::CallBase *> callSitesToBeRemovedFromParent;
for (llvm::Value::use_iterator ui = old->use_begin(), ue = old->use_end();
- ui != ue; ) {
- llvm::Value::use_iterator use = ui++; // Increment before the use is erased.
- llvm::User *user = use->getUser();
+ ui != ue; ui++) {
+ llvm::User *user = ui->getUser();
// Recognize and replace uses of bitcasts. Most calls to
// unprototyped functions will use bitcasts.
@@ -5760,8 +5762,9 @@ static void replaceUsesOfNonProtoConstant(llvm::Constant *old,
// Recognize calls to the function.
llvm::CallBase *callSite = dyn_cast<llvm::CallBase>(user);
- if (!callSite) continue;
- if (!callSite->isCallee(&*use))
+ if (!callSite)
+ continue;
+ if (!callSite->isCallee(&*ui))
continue;
// If the return types don't match exactly, then we can't
@@ -5830,6 +5833,10 @@ static void replaceUsesOfNonProtoConstant(llvm::Constant *old,
if (callSite->getDebugLoc())
newCall->setDebugLoc(callSite->getDebugLoc());
+ callSitesToBeRemovedFromParent.push_back(callSite);
+ }
+
+ for (auto *callSite : callSitesToBeRemovedFromParent) {
callSite->eraseFromParent();
}
}
diff --git a/clang/lib/CodeGen/CoverageMappingGen.cpp b/clang/lib/CodeGen/CoverageMappingGen.cpp
index f4de21b..6ce2d32 100644
--- a/clang/lib/CodeGen/CoverageMappingGen.cpp
+++ b/clang/lib/CodeGen/CoverageMappingGen.cpp
@@ -191,6 +191,10 @@ public:
bool isBranch() const { return FalseCount.has_value(); }
+ bool isMCDCBranch() const {
+ return std::holds_alternative<mcdc::BranchParameters>(MCDCParams);
+ }
+
bool isMCDCDecision() const {
return std::holds_alternative<mcdc::DecisionParameters>(MCDCParams);
}
@@ -290,10 +294,36 @@ public:
return SM.getLocForEndOfFile(SM.getFileID(Loc));
}
- /// Find out where the current file is included or macro is expanded.
- SourceLocation getIncludeOrExpansionLoc(SourceLocation Loc) {
- return Loc.isMacroID() ? SM.getImmediateExpansionRange(Loc).getBegin()
- : SM.getIncludeLoc(SM.getFileID(Loc));
+ /// Find out where a macro is expanded. If the immediate result is a
+ /// <scratch space>, keep looking until the result isn't. Return a pair of
+ /// \c SourceLocation. The first object is always the begin sloc of the found
+ /// result. The second should be checked by the caller: if it has a value, it
+ /// is the end sloc of the found result. Otherwise the while loop didn't
+ /// execute, which means the location wasn't changed and the caller has to
+ /// learn the end sloc from somewhere else.
+ std::pair<SourceLocation, std::optional<SourceLocation>>
+ getNonScratchExpansionLoc(SourceLocation Loc) {
+ std::optional<SourceLocation> EndLoc = std::nullopt;
+ while (Loc.isMacroID() &&
+ SM.isWrittenInScratchSpace(SM.getSpellingLoc(Loc))) {
+ auto ExpansionRange = SM.getImmediateExpansionRange(Loc);
+ Loc = ExpansionRange.getBegin();
+ EndLoc = ExpansionRange.getEnd();
+ }
+ return std::make_pair(Loc, EndLoc);
+ }
+
+ /// Find out where the current file is included or macro is expanded. If
+ /// \c AcceptScratch is set to false, keep looking for expansions until the
+ /// found sloc is not a <scratch space>.
+ SourceLocation getIncludeOrExpansionLoc(SourceLocation Loc,
+ bool AcceptScratch = true) {
+ if (!Loc.isMacroID())
+ return SM.getIncludeLoc(SM.getFileID(Loc));
+ Loc = SM.getImmediateExpansionRange(Loc).getBegin();
+ if (AcceptScratch)
+ return Loc;
+ return getNonScratchExpansionLoc(Loc).first;
}
/// Return true if \c Loc is a location in a built-in macro.
@@ -340,6 +370,15 @@ public:
for (auto &Region : SourceRegions) {
SourceLocation Loc = Region.getBeginLoc();
+ // Replace Region with its definition if it is in <scratch space>.
+ auto NonScratchExpansionLoc = getNonScratchExpansionLoc(Loc);
+ auto EndLoc = NonScratchExpansionLoc.second;
+ if (EndLoc.has_value()) {
+ Loc = NonScratchExpansionLoc.first;
+ Region.setStartLoc(Loc);
+ Region.setEndLoc(EndLoc.value());
+ }
+
// Replace Loc with FileLoc if it is expanded with system headers.
if (!SystemHeadersCoverage && SM.isInSystemMacro(Loc)) {
auto BeginLoc = SM.getSpellingLoc(Loc);
@@ -472,13 +511,19 @@ public:
// Ignore regions from system headers unless collecting coverage from
// system headers is explicitly enabled.
if (!SystemHeadersCoverage &&
- SM.isInSystemHeader(SM.getSpellingLoc(LocStart)))
+ SM.isInSystemHeader(SM.getSpellingLoc(LocStart))) {
+ assert(!Region.isMCDCBranch() && !Region.isMCDCDecision() &&
+ "Don't suppress the condition in system headers");
continue;
+ }
auto CovFileID = getCoverageFileID(LocStart);
// Ignore regions that don't have a file, such as builtin macros.
- if (!CovFileID)
+ if (!CovFileID) {
+ assert(!Region.isMCDCBranch() && !Region.isMCDCDecision() &&
+ "Don't suppress the condition in non-file regions");
continue;
+ }
SourceLocation LocEnd = Region.getEndLoc();
assert(SM.isWrittenInSameFile(LocStart, LocEnd) &&
@@ -488,8 +533,11 @@ public:
// This not only suppresses redundant regions, but sometimes prevents
// creating regions with wrong counters if, for example, a statement's
// body ends at the end of a nested macro.
- if (Filter.count(std::make_pair(LocStart, LocEnd)))
+ if (Filter.count(std::make_pair(LocStart, LocEnd))) {
+ assert(!Region.isMCDCBranch() && !Region.isMCDCDecision() &&
+ "Don't suppress the condition");
continue;
+ }
// Find the spelling locations for the mapping region.
SpellingRegion SR{SM, LocStart, LocEnd};
@@ -525,7 +573,7 @@ public:
SourceRegionFilter Filter;
for (const auto &FM : FileIDMapping) {
SourceLocation ExpandedLoc = FM.second.second;
- SourceLocation ParentLoc = getIncludeOrExpansionLoc(ExpandedLoc);
+ SourceLocation ParentLoc = getIncludeOrExpansionLoc(ExpandedLoc, false);
if (ParentLoc.isInvalid())
continue;
@@ -2223,7 +2271,8 @@ struct CounterCoverageMappingBuilder
}
void VisitOpaqueValueExpr(const OpaqueValueExpr* OVE) {
- Visit(OVE->getSourceExpr());
+ if (OVE->isUnique())
+ Visit(OVE->getSourceExpr());
}
};
diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp
index 18acf77..8427286 100644
--- a/clang/lib/CodeGen/ItaniumCXXABI.cpp
+++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp
@@ -1793,6 +1793,37 @@ void ItaniumCXXABI::EmitDestructorCall(CodeGenFunction &CGF,
ThisTy, VTT, VTTTy, nullptr);
}
+// Check if any non-inline method has the specified attribute.
+template <typename T>
+static bool CXXRecordNonInlineHasAttr(const CXXRecordDecl *RD) {
+ for (const auto *D : RD->noload_decls()) {
+ if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
+ if (FD->isInlined() || FD->doesThisDeclarationHaveABody() ||
+ FD->isPureVirtual())
+ continue;
+ if (D->hasAttr<T>())
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static void setVTableSelectiveDLLImportExport(CodeGenModule &CGM,
+ llvm::GlobalVariable *VTable,
+ const CXXRecordDecl *RD) {
+ if (VTable->getDLLStorageClass() !=
+ llvm::GlobalVariable::DefaultStorageClass ||
+ RD->hasAttr<DLLImportAttr>() || RD->hasAttr<DLLExportAttr>())
+ return;
+
+ if (CGM.getVTables().isVTableExternal(RD)) {
+ if (CXXRecordNonInlineHasAttr<DLLImportAttr>(RD))
+ VTable->setDLLStorageClass(llvm::GlobalValue::DLLImportStorageClass);
+ } else if (CXXRecordNonInlineHasAttr<DLLExportAttr>(RD))
+ VTable->setDLLStorageClass(llvm::GlobalValue::DLLExportStorageClass);
+}
+
void ItaniumCXXABI::emitVTableDefinitions(CodeGenVTables &CGVT,
const CXXRecordDecl *RD) {
llvm::GlobalVariable *VTable = getAddrOfVTable(RD, CharUnits());
@@ -1818,6 +1849,9 @@ void ItaniumCXXABI::emitVTableDefinitions(CodeGenVTables &CGVT,
if (CGM.supportsCOMDAT() && VTable->isWeakForLinker())
VTable->setComdat(CGM.getModule().getOrInsertComdat(VTable->getName()));
+ if (CGM.getTarget().hasPS4DLLImportExport())
+ setVTableSelectiveDLLImportExport(CGM, VTable, RD);
+
// Set the right visibility.
CGM.setGVProperties(VTable, RD);
@@ -1905,29 +1939,6 @@ ItaniumCXXABI::getVTableAddressPoint(BaseSubobject Base,
VTable->getValueType(), VTable, Indices, /*InBounds=*/true, InRange);
}
-// Check whether all the non-inline virtual methods for the class have the
-// specified attribute.
-template <typename T>
-static bool CXXRecordAllNonInlineVirtualsHaveAttr(const CXXRecordDecl *RD) {
- bool FoundNonInlineVirtualMethodWithAttr = false;
- for (const auto *D : RD->noload_decls()) {
- if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
- if (!FD->isVirtualAsWritten() || FD->isInlineSpecified() ||
- FD->doesThisDeclarationHaveABody())
- continue;
- if (!D->hasAttr<T>())
- return false;
- FoundNonInlineVirtualMethodWithAttr = true;
- }
- }
-
- // We didn't find any non-inline virtual methods missing the attribute. We
- // will return true when we found at least one non-inline virtual with the
- // attribute. (This lets our caller know that the attribute needs to be
- // propagated up to the vtable.)
- return FoundNonInlineVirtualMethodWithAttr;
-}
-
llvm::Value *ItaniumCXXABI::getVTableAddressPointInStructorWithVTT(
CodeGenFunction &CGF, const CXXRecordDecl *VTableClass, BaseSubobject Base,
const CXXRecordDecl *NearestVBase) {
@@ -1981,26 +1992,10 @@ llvm::GlobalVariable *ItaniumCXXABI::getAddrOfVTable(const CXXRecordDecl *RD,
getContext().toCharUnitsFromBits(PAlign).getAsAlign());
VTable->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
- // In MS C++ if you have a class with virtual functions in which you are using
- // selective member import/export, then all virtual functions must be exported
- // unless they are inline, otherwise a link error will result. To match this
- // behavior, for such classes, we dllimport the vtable if it is defined
- // externally and all the non-inline virtual methods are marked dllimport, and
- // we dllexport the vtable if it is defined in this TU and all the non-inline
- // virtual methods are marked dllexport.
- if (CGM.getTarget().hasPS4DLLImportExport()) {
- if ((!RD->hasAttr<DLLImportAttr>()) && (!RD->hasAttr<DLLExportAttr>())) {
- if (CGM.getVTables().isVTableExternal(RD)) {
- if (CXXRecordAllNonInlineVirtualsHaveAttr<DLLImportAttr>(RD))
- VTable->setDLLStorageClass(llvm::GlobalValue::DLLImportStorageClass);
- } else {
- if (CXXRecordAllNonInlineVirtualsHaveAttr<DLLExportAttr>(RD))
- VTable->setDLLStorageClass(llvm::GlobalValue::DLLExportStorageClass);
- }
- }
- }
- CGM.setGVProperties(VTable, RD);
+ if (CGM.getTarget().hasPS4DLLImportExport())
+ setVTableSelectiveDLLImportExport(CGM, VTable, RD);
+ CGM.setGVProperties(VTable, RD);
return VTable;
}
@@ -3285,7 +3280,7 @@ ItaniumRTTIBuilder::GetAddrOfExternalRTTIDescriptor(QualType Ty) {
// Import the typeinfo symbol when all non-inline virtual methods are
// imported.
if (CGM.getTarget().hasPS4DLLImportExport()) {
- if (RD && CXXRecordAllNonInlineVirtualsHaveAttr<DLLImportAttr>(RD)) {
+ if (RD && CXXRecordNonInlineHasAttr<DLLImportAttr>(RD)) {
GV->setDLLStorageClass(llvm::GlobalVariable::DLLImportStorageClass);
CGM.setDSOLocal(GV);
}
@@ -3938,13 +3933,13 @@ llvm::Constant *ItaniumRTTIBuilder::BuildTypeInfo(
// Export the typeinfo in the same circumstances as the vtable is exported.
auto GVDLLStorageClass = DLLStorageClass;
- if (CGM.getTarget().hasPS4DLLImportExport()) {
+ if (CGM.getTarget().hasPS4DLLImportExport() &&
+ GVDLLStorageClass != llvm::GlobalVariable::DLLExportStorageClass) {
if (const RecordType *RecordTy = dyn_cast<RecordType>(Ty)) {
const CXXRecordDecl *RD = cast<CXXRecordDecl>(RecordTy->getDecl());
if (RD->hasAttr<DLLExportAttr>() ||
- CXXRecordAllNonInlineVirtualsHaveAttr<DLLExportAttr>(RD)) {
+ CXXRecordNonInlineHasAttr<DLLExportAttr>(RD))
GVDLLStorageClass = llvm::GlobalVariable::DLLExportStorageClass;
- }
}
}
@@ -3984,9 +3979,7 @@ llvm::Constant *ItaniumRTTIBuilder::BuildTypeInfo(
CGM.setDSOLocal(GV);
TypeName->setDLLStorageClass(DLLStorageClass);
- GV->setDLLStorageClass(CGM.getTarget().hasPS4DLLImportExport()
- ? GVDLLStorageClass
- : DLLStorageClass);
+ GV->setDLLStorageClass(GVDLLStorageClass);
TypeName->setPartition(CGM.getCodeGenOpts().SymbolPartition);
GV->setPartition(CGM.getCodeGenOpts().SymbolPartition);
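The hunks above fold the old all-virtuals check into one shared helper, CXXRecordNonInlineHasAttr, and reuse it in vtable emission, getAddrOfVTable, and the RTTI builders: on targets with PS4-style selective DLL import/export, a single non-inline, non-pure method carrying the attribute is now enough to propagate import/export to the vtable and typeinfo. A minimal source-level sketch of that behaviour (hypothetical class, assuming a target where hasPS4DLLImportExport() is true and __declspec attributes are accepted):

// One qualifying dllexport method is enough for the new helper to return true,
// so the vtable emitted for S in this TU receives DLLExportStorageClass.
struct S {
  __declspec(dllexport) virtual void exported(); // non-inline, has the attribute
  virtual void plain();                          // lacking it no longer blocks export
  virtual void inlined() {}                      // has a body here: skipped by the check
};
void S::exported() {}
void S::plain() {}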
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 2868b4f..f5ea73a 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -2653,22 +2653,13 @@ void Driver::BuildInputs(const ToolChain &TC, DerivedArgList &Args,
Diag(clang::diag::note_drv_t_option_is_global);
}
- // CUDA/HIP and their preprocessor expansions can be accepted by CL mode.
// Warn -x after last input file has no effect
- auto LastXArg = Args.getLastArgValue(options::OPT_x);
- const llvm::StringSet<> ValidXArgs = {"cuda", "hip", "cui", "hipi"};
- if (!IsCLMode() || ValidXArgs.contains(LastXArg)) {
+ {
Arg *LastXArg = Args.getLastArgNoClaim(options::OPT_x);
Arg *LastInputArg = Args.getLastArgNoClaim(options::OPT_INPUT);
if (LastXArg && LastInputArg &&
LastInputArg->getIndex() < LastXArg->getIndex())
Diag(clang::diag::warn_drv_unused_x) << LastXArg->getValue();
- } else {
- // In CL mode suggest /TC or /TP since -x doesn't make sense if passed via
- // /clang:.
- if (auto *A = Args.getLastArg(options::OPT_x))
- Diag(diag::err_drv_unsupported_opt_with_suggestion)
- << A->getAsString(Args) << "/TC' or '/TP";
}
for (Arg *A : Args) {
diff --git a/clang/lib/Driver/ToolChains/AIX.cpp b/clang/lib/Driver/ToolChains/AIX.cpp
index 85825e1..381d72e 100644
--- a/clang/lib/Driver/ToolChains/AIX.cpp
+++ b/clang/lib/Driver/ToolChains/AIX.cpp
@@ -479,14 +479,6 @@ static void addTocDataOptions(const llvm::opt::ArgList &Args,
return false;
}();
- // Currently only supported for small code model.
- if (TOCDataGloballyinEffect &&
- (Args.getLastArgValue(options::OPT_mcmodel_EQ) == "large" ||
- Args.getLastArgValue(options::OPT_mcmodel_EQ) == "medium")) {
- D.Diag(clang::diag::warn_drv_unsupported_tocdata);
- return;
- }
-
enum TOCDataSetting {
AddressInTOC = 0, // Address of the symbol stored in the TOC.
DataInTOC = 1 // Symbol defined in the TOC.
diff --git a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
index d23f9b3..9ea4cc3 100644
--- a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
@@ -181,7 +181,7 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
// -m*-float and -mfpu=none/0/32 conflict with -mlsx.
if (A->getOption().matches(options::OPT_mlsx)) {
if (llvm::find(Features, "-d") != Features.end())
- D.Diag(diag::err_drv_loongarch_wrong_fpu_width_for_lsx);
+ D.Diag(diag::err_drv_loongarch_wrong_fpu_width) << /*LSX*/ 0;
else /*-mlsx*/
Features.push_back("+lsx");
} else /*-mno-lsx*/ {
@@ -196,7 +196,7 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
// -mno-lsx conflicts with -mlasx.
if (A->getOption().matches(options::OPT_mlasx)) {
if (llvm::find(Features, "-d") != Features.end())
- D.Diag(diag::err_drv_loongarch_wrong_fpu_width_for_lasx);
+ D.Diag(diag::err_drv_loongarch_wrong_fpu_width) << /*LASX*/ 1;
else if (llvm::find(Features, "-lsx") != Features.end())
D.Diag(diag::err_drv_loongarch_invalid_simd_option_combination);
else { /*-mlasx*/
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 6d2015b..97e451c 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -1030,7 +1030,7 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
// If user provided -o, that is the dependency target, except
// when we are only generating a dependency file.
- Arg *OutputOpt = Args.getLastArg(options::OPT_o);
+ Arg *OutputOpt = Args.getLastArg(options::OPT_o, options::OPT__SLASH_Fo);
if (OutputOpt && Output.getType() != types::TY_Dependencies) {
DepTarget = OutputOpt->getValue();
} else {
@@ -5681,11 +5681,10 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
// enabled. This alias option is being used to simplify the hasFlag logic.
OptSpecifier StrictAliasingAliasOption =
OFastEnabled ? options::OPT_Ofast : options::OPT_fstrict_aliasing;
- // We turn strict aliasing off by default if we're in CL mode, since MSVC
+  // We turn strict aliasing off by default if we're targeting Windows MSVC, since MSVC
// doesn't do any TBAA.
- bool TBAAOnByDefault = !D.IsCLMode();
if (!Args.hasFlag(options::OPT_fstrict_aliasing, StrictAliasingAliasOption,
- options::OPT_fno_strict_aliasing, TBAAOnByDefault))
+ options::OPT_fno_strict_aliasing, !IsWindowsMSVC))
CmdArgs.push_back("-relaxed-aliasing");
if (!Args.hasFlag(options::OPT_fstruct_path_tbaa,
options::OPT_fno_struct_path_tbaa, true))
@@ -7027,8 +7026,12 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
options::OPT_fms_compatibility, options::OPT_fno_ms_compatibility,
(IsWindowsMSVC && Args.hasFlag(options::OPT_fms_extensions,
options::OPT_fno_ms_extensions, true)));
- if (IsMSVCCompat)
+ if (IsMSVCCompat) {
CmdArgs.push_back("-fms-compatibility");
+ if (!types::isCXX(Input.getType()) &&
+ Args.hasArg(options::OPT_fms_define_stdc))
+ CmdArgs.push_back("-fms-define-stdc");
+ }
if (Triple.isWindowsMSVCEnvironment() && !D.IsCLMode() &&
Args.hasArg(options::OPT_fms_runtime_lib_EQ))
@@ -7263,10 +7266,10 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
}
}
- // -fsized-deallocation is off by default, as it is an ABI-breaking change for
- // most platforms.
- Args.addOptInFlag(CmdArgs, options::OPT_fsized_deallocation,
- options::OPT_fno_sized_deallocation);
+ // -fsized-deallocation is on by default in C++14 onwards and otherwise off
+ // by default.
+ Args.addLastArg(CmdArgs, options::OPT_fsized_deallocation,
+ options::OPT_fno_sized_deallocation);
// -faligned-allocation is on by default in C++17 onwards and otherwise off
// by default.
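The change above stops the driver from treating -fsized-deallocation as opt-in: the flag pair is now only forwarded when the user passes it, leaving the frontend default (on from C++14 onwards, per the new comment) in effect. A short reminder of what the feature means at the source level, as a sketch assuming a C++14-or-later compile with sized deallocation enabled:

#include <cstddef>
#include <new>

struct Widget { char payload[64]; };

// With sized deallocation on, the delete-expression below prefers this
// two-argument form, so the allocator receives the object size directly.
void operator delete(void *p, std::size_t sz) noexcept {
  // sz == sizeof(Widget) for the call from destroy().
  ::operator delete(p);
}

void destroy(Widget *w) { delete w; }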
diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp
index caf6c4a..593b403 100644
--- a/clang/lib/Driver/ToolChains/Darwin.cpp
+++ b/clang/lib/Driver/ToolChains/Darwin.cpp
@@ -2912,9 +2912,54 @@ static bool sdkSupportsBuiltinModules(const Darwin::DarwinPlatformKind &TargetPl
}
}
-void Darwin::addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
- llvm::opt::ArgStringList &CC1Args,
- Action::OffloadKind DeviceOffloadKind) const {
+static inline llvm::VersionTuple
+sizedDeallocMinVersion(llvm::Triple::OSType OS) {
+ switch (OS) {
+ default:
+ break;
+ case llvm::Triple::Darwin:
+ case llvm::Triple::MacOSX: // Earliest supporting version is 10.12.
+ return llvm::VersionTuple(10U, 12U);
+ case llvm::Triple::IOS:
+ case llvm::Triple::TvOS: // Earliest supporting version is 10.0.0.
+ return llvm::VersionTuple(10U);
+ case llvm::Triple::WatchOS: // Earliest supporting version is 3.0.0.
+ return llvm::VersionTuple(3U);
+ }
+
+ llvm_unreachable("Unexpected OS");
+}
+
+bool Darwin::isSizedDeallocationUnavailable() const {
+ llvm::Triple::OSType OS;
+
+ if (isTargetMacCatalyst())
+ return TargetVersion < sizedDeallocMinVersion(llvm::Triple::MacOSX);
+ switch (TargetPlatform) {
+ case MacOS: // Earlier than 10.12.
+ OS = llvm::Triple::MacOSX;
+ break;
+ case IPhoneOS:
+ OS = llvm::Triple::IOS;
+ break;
+ case TvOS: // Earlier than 10.0.
+ OS = llvm::Triple::TvOS;
+ break;
+ case WatchOS: // Earlier than 3.0.
+ OS = llvm::Triple::WatchOS;
+ break;
+ case DriverKit:
+ case XROS:
+ // Always available.
+ return false;
+ }
+
+ return TargetVersion < sizedDeallocMinVersion(OS);
+}
+
+void Darwin::addClangTargetOptions(
+ const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args,
+ Action::OffloadKind DeviceOffloadKind) const {
// Pass "-faligned-alloc-unavailable" only when the user hasn't manually
// enabled or disabled aligned allocations.
if (!DriverArgs.hasArgNoClaim(options::OPT_faligned_allocation,
@@ -2922,6 +2967,13 @@ void Darwin::addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
isAlignedAllocationUnavailable())
CC1Args.push_back("-faligned-alloc-unavailable");
+ // Pass "-fno-sized-deallocation" only when the user hasn't manually enabled
+ // or disabled sized deallocations.
+ if (!DriverArgs.hasArgNoClaim(options::OPT_fsized_deallocation,
+ options::OPT_fno_sized_deallocation) &&
+ isSizedDeallocationUnavailable())
+ CC1Args.push_back("-fno-sized-deallocation");
+
addClangCC1ASTargetOptions(DriverArgs, CC1Args);
// Enable compatibility mode for NSItemProviderCompletionHandler in
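At the driver level the new Darwin hook mirrors the existing aligned-allocation logic: when the deployment target predates the runtime that ships the sized operator delete (macOS 10.12, iOS/tvOS 10, watchOS 3, per the table above) and the user did not pass -f[no-]sized-deallocation themselves, cc1 receives -fno-sized-deallocation. An illustrative consequence (the target triple and flags below are assumptions chosen for the example):

// clang++ -target x86_64-apple-macosx10.11 -std=c++17 -c big.cpp
// The implicitly added -fno-sized-deallocation means this delete-expression
// binds to the single-argument operator delete and never references the
// sized form that the 10.11 deployment target does not provide.
struct Big { char bytes[4096]; };
void destroy(Big *p) { delete p; }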
diff --git a/clang/lib/Driver/ToolChains/Darwin.h b/clang/lib/Driver/ToolChains/Darwin.h
index 10d4b69..b45279e 100644
--- a/clang/lib/Driver/ToolChains/Darwin.h
+++ b/clang/lib/Driver/ToolChains/Darwin.h
@@ -511,6 +511,10 @@ protected:
/// targeting.
bool isAlignedAllocationUnavailable() const;
+  /// Return true if C++14 sized deallocation functions are not implemented in
+  /// the C++ standard library of the deployment target.
+ bool isSizedDeallocationUnavailable() const;
+
void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
llvm::opt::ArgStringList &CC1Args,
Action::OffloadKind DeviceOffloadKind) const override;
diff --git a/clang/lib/Driver/ToolChains/HIPSPV.cpp b/clang/lib/Driver/ToolChains/HIPSPV.cpp
index a144b28..bdbcf91 100644
--- a/clang/lib/Driver/ToolChains/HIPSPV.cpp
+++ b/clang/lib/Driver/ToolChains/HIPSPV.cpp
@@ -193,7 +193,7 @@ void HIPSPVToolChain::AddHIPIncludeArgs(const ArgList &DriverArgs,
StringRef hipPath = DriverArgs.getLastArgValue(options::OPT_hip_path_EQ);
if (hipPath.empty()) {
- getDriver().Diag(diag::err_drv_hipspv_no_hip_path) << 1 << "'-nogpuinc'";
+ getDriver().Diag(diag::err_drv_hipspv_no_hip_path);
return;
}
SmallString<128> P(hipPath);
diff --git a/clang/lib/Driver/ToolChains/ZOS.cpp b/clang/lib/Driver/ToolChains/ZOS.cpp
index d5fc7b8..074e055 100644
--- a/clang/lib/Driver/ToolChains/ZOS.cpp
+++ b/clang/lib/Driver/ToolChains/ZOS.cpp
@@ -36,6 +36,12 @@ void ZOS::addClangTargetOptions(const ArgList &DriverArgs,
if (!DriverArgs.hasArgNoClaim(options::OPT_faligned_allocation,
options::OPT_fno_aligned_allocation))
CC1Args.push_back("-faligned-alloc-unavailable");
+
+ // Pass "-fno-sized-deallocation" only when the user hasn't manually enabled
+ // or disabled sized deallocations.
+ if (!DriverArgs.hasArgNoClaim(options::OPT_fsized_deallocation,
+ options::OPT_fno_sized_deallocation))
+ CC1Args.push_back("-fno-sized-deallocation");
}
void zos::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
diff --git a/clang/lib/ExtractAPI/DeclarationFragments.cpp b/clang/lib/ExtractAPI/DeclarationFragments.cpp
index 98b9343..8c7c0f8 100644
--- a/clang/lib/ExtractAPI/DeclarationFragments.cpp
+++ b/clang/lib/ExtractAPI/DeclarationFragments.cpp
@@ -999,11 +999,11 @@ DeclarationFragmentsBuilder::getFragmentsForTemplateParameters(
DeclarationFragments::FragmentKind::GenericParameter);
if (TemplateParam->hasDefaultArgument()) {
- DeclarationFragments After;
+ const auto Default = TemplateParam->getDefaultArgument();
Fragments.append(" = ", DeclarationFragments::FragmentKind::Text)
- .append(getFragmentsForType(TemplateParam->getDefaultArgument(),
- TemplateParam->getASTContext(), After));
- Fragments.append(std::move(After));
+ .append(getFragmentsForTemplateArguments(
+ {Default.getArgument()}, TemplateParam->getASTContext(),
+ {Default}));
}
} else if (const auto *NTP =
dyn_cast<NonTypeTemplateParmDecl>(ParameterArray[i])) {
@@ -1023,8 +1023,9 @@ DeclarationFragmentsBuilder::getFragmentsForTemplateParameters(
if (NTP->hasDefaultArgument()) {
SmallString<8> ExprStr;
raw_svector_ostream Output(ExprStr);
- NTP->getDefaultArgument()->printPretty(
- Output, nullptr, NTP->getASTContext().getPrintingPolicy());
+ NTP->getDefaultArgument().getArgument().print(
+ NTP->getASTContext().getPrintingPolicy(), Output,
+ /*IncludeType=*/false);
Fragments.append(" = ", DeclarationFragments::FragmentKind::Text)
.append(ExprStr, DeclarationFragments::FragmentKind::Text);
}
@@ -1083,12 +1084,22 @@ DeclarationFragmentsBuilder::getFragmentsForTemplateArguments(
if (StringRef(ArgumentFragment.begin()->Spelling)
.starts_with("type-parameter")) {
- std::string ProperArgName = TemplateArgumentLocs.value()[i]
- .getTypeSourceInfo()
- ->getType()
- .getAsString();
- ArgumentFragment.begin()->Spelling.swap(ProperArgName);
+ if (TemplateArgumentLocs.has_value() &&
+ TemplateArgumentLocs->size() > i) {
+ std::string ProperArgName = TemplateArgumentLocs.value()[i]
+ .getTypeSourceInfo()
+ ->getType()
+ .getAsString();
+ ArgumentFragment.begin()->Spelling.swap(ProperArgName);
+ } else {
+ auto &Spelling = ArgumentFragment.begin()->Spelling;
+ Spelling.clear();
+ raw_string_ostream OutStream(Spelling);
+ CTA.print(Context.getPrintingPolicy(), OutStream, false);
+ OutStream.flush();
+ }
}
+
Fragments.append(std::move(ArgumentFragment));
break;
}
@@ -1211,9 +1222,9 @@ DeclarationFragmentsBuilder::getFragmentsForClassTemplateSpecialization(
cast<CXXRecordDecl>(Decl)))
.pop_back() // there is an extra semicolon now
.append("<", DeclarationFragments::FragmentKind::Text)
- .append(
- getFragmentsForTemplateArguments(Decl->getTemplateArgs().asArray(),
- Decl->getASTContext(), std::nullopt))
+ .append(getFragmentsForTemplateArguments(
+ Decl->getTemplateArgs().asArray(), Decl->getASTContext(),
+ Decl->getTemplateArgsAsWritten()->arguments()))
.append(">", DeclarationFragments::FragmentKind::Text)
.appendSemicolon();
}
@@ -1254,9 +1265,9 @@ DeclarationFragmentsBuilder::getFragmentsForVarTemplateSpecialization(
.append(DeclarationFragmentsBuilder::getFragmentsForVarTemplate(Decl))
.pop_back() // there is an extra semicolon now
.append("<", DeclarationFragments::FragmentKind::Text)
- .append(
- getFragmentsForTemplateArguments(Decl->getTemplateArgs().asArray(),
- Decl->getASTContext(), std::nullopt))
+ .append(getFragmentsForTemplateArguments(
+ Decl->getTemplateArgs().asArray(), Decl->getASTContext(),
+ Decl->getTemplateArgsAsWritten()->arguments()))
.append(">", DeclarationFragments::FragmentKind::Text)
.appendSemicolon();
}
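The ExtractAPI changes above switch defaulted template parameters over to the TemplateArgument-based printers and make the specialization builders pass the template arguments as written instead of std::nullopt, with a fallback when no written arguments are available. A small set of declarations that exercises those paths (purely illustrative):

// Defaulted type and non-type parameters: their defaults are now rendered via
// getFragmentsForTemplateArguments / TemplateArgument::print rather than from
// a TypeSourceInfo or a raw default-argument expression.
template <typename T = int, int N = 4>
struct Buffer {
  T data[N];
};

// An explicit specialization: its template-args-as-written are what the
// specialization fragment builders above now hand down.
template <>
struct Buffer<float, 2> {
  float data[2];
};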
diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index 3dd10f6..b6f7567 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -1410,6 +1410,13 @@ void UnwrappedLineParser::readTokenWithJavaScriptASI() {
}
}
+static bool isAltOperator(const FormatToken &Tok) {
+ return isalpha(Tok.TokenText[0]) &&
+ Tok.isOneOf(tok::ampamp, tok::ampequal, tok::amp, tok::pipe,
+ tok::tilde, tok::exclaim, tok::exclaimequal, tok::pipepipe,
+ tok::pipeequal, tok::caret, tok::caretequal);
+}
+
void UnwrappedLineParser::parseStructuralElement(
const FormatToken *OpeningBrace, IfStmtKind *IfKind,
FormatToken **IfLeftBrace, bool *HasDoWhile, bool *HasLabel) {
@@ -1689,9 +1696,15 @@ void UnwrappedLineParser::parseStructuralElement(
break;
}
- const bool InRequiresExpression =
- OpeningBrace && OpeningBrace->is(TT_RequiresExpressionLBrace);
- do {
+ for (const bool InRequiresExpression =
+ OpeningBrace && OpeningBrace->is(TT_RequiresExpressionLBrace);
+ !eof();) {
+ if (IsCpp && isAltOperator(*FormatTok)) {
+ if (auto *Next = Tokens->peekNextToken(/*SkipComment=*/true);
+ Next && Next->isBinaryOperator()) {
+ FormatTok->Tok.setKind(tok::identifier);
+ }
+ }
const FormatToken *Previous = FormatTok->Previous;
switch (FormatTok->Tok.getKind()) {
case tok::at:
@@ -2122,7 +2135,7 @@ void UnwrappedLineParser::parseStructuralElement(
nextToken();
break;
}
- } while (!eof());
+ }
}
bool UnwrappedLineParser::tryToParsePropertyAccessor() {
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index 68760e0..e8c8a51 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -432,7 +432,8 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI,
// [C++] Whether __STDC__ is predefined and if so, what its value is,
// are implementation-defined.
// (Removed in C++20.)
- if (!LangOpts.MSVCCompat && !LangOpts.TraditionalCPP)
+ if ((!LangOpts.MSVCCompat || LangOpts.MSVCEnableStdcMacro) &&
+ !LangOpts.TraditionalCPP)
Builder.defineMacro("__STDC__");
// -- __STDC_HOSTED__
// The integer literal 1 if the implementation is a hosted
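Together with the driver change in Clang.cpp above, which forwards -fms-define-stdc for non-C++ inputs, this makes __STDC__ opt-in under MSVC compatibility instead of always suppressed. A sketch of the observable difference, assuming the flag reaches cc1 as in the diff:

/* clang --target=x86_64-pc-windows-msvc -fms-compatibility [-fms-define-stdc] -c stdc.c
   Without -fms-define-stdc the macro stays undefined, matching cl.exe;
   with it, __STDC__ is predefined as usual for a conforming C compiler. */
#ifdef __STDC__
const int stdc_defined = 1;
#else
const int stdc_defined = 0;
#endif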
diff --git a/clang/lib/Frontend/SerializedDiagnosticPrinter.cpp b/clang/lib/Frontend/SerializedDiagnosticPrinter.cpp
index b76728a..0887b5a 100644
--- a/clang/lib/Frontend/SerializedDiagnosticPrinter.cpp
+++ b/clang/lib/Frontend/SerializedDiagnosticPrinter.cpp
@@ -574,7 +574,7 @@ void SDiagsWriter::HandleDiagnostic(DiagnosticsEngine::Level DiagLevel,
SmallString<256> diagnostic;
Info.FormatDiagnostic(diagnostic);
getMetaDiags()->Report(
- diag::warn_fe_serialized_diag_failure_during_finalisation)
+ diag::warn_fe_serialized_diag_failure_during_finalization)
<< diagnostic;
return;
}
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 5f02c71..dbff92b 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -153,12 +153,10 @@ set(x86_files
avx512bwintrin.h
avx512cdintrin.h
avx512dqintrin.h
- avx512erintrin.h
avx512fintrin.h
avx512fp16intrin.h
avx512ifmaintrin.h
avx512ifmavlintrin.h
- avx512pfintrin.h
avx512vbmi2intrin.h
avx512vbmiintrin.h
avx512vbmivlintrin.h
diff --git a/clang/lib/Headers/avx512erintrin.h b/clang/lib/Headers/avx512erintrin.h
deleted file mode 100644
index 1c5a2d2..0000000
--- a/clang/lib/Headers/avx512erintrin.h
+++ /dev/null
@@ -1,271 +0,0 @@
-/*===---- avx512erintrin.h - AVX512ER intrinsics ---------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512erintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512ERINTRIN_H
-#define __AVX512ERINTRIN_H
-
-/* exp2a23 */
-#define _mm512_exp2a23_round_pd(A, R) \
- ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_exp2a23_round_pd(S, M, A, R) \
- ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(S), (__mmask8)(M), \
- (int)(R)))
-
-#define _mm512_maskz_exp2a23_round_pd(M, A, R) \
- ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm512_exp2a23_pd(A) \
- _mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_mask_exp2a23_pd(S, M, A) \
- _mm512_mask_exp2a23_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_maskz_exp2a23_pd(M, A) \
- _mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_exp2a23_round_ps(A, R) \
- ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_exp2a23_round_ps(S, M, A, R) \
- ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(S), (__mmask16)(M), \
- (int)(R)))
-
-#define _mm512_maskz_exp2a23_round_ps(M, A, R) \
- ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)(M), (int)(R)))
-
-#define _mm512_exp2a23_ps(A) \
- _mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_mask_exp2a23_ps(S, M, A) \
- _mm512_mask_exp2a23_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_maskz_exp2a23_ps(M, A) \
- _mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
-
-/* rsqrt28 */
-#define _mm512_rsqrt28_round_pd(A, R) \
- ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) \
- ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(S), (__mmask8)(M), \
- (int)(R)))
-
-#define _mm512_maskz_rsqrt28_round_pd(M, A, R) \
- ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm512_rsqrt28_pd(A) \
- _mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_mask_rsqrt28_pd(S, M, A) \
- _mm512_mask_rsqrt28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_maskz_rsqrt28_pd(M, A) \
- _mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_rsqrt28_round_ps(A, R) \
- ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) \
- ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(S), (__mmask16)(M), \
- (int)(R)))
-
-#define _mm512_maskz_rsqrt28_round_ps(M, A, R) \
- ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)(M), (int)(R)))
-
-#define _mm512_rsqrt28_ps(A) \
- _mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_mask_rsqrt28_ps(S, M, A) \
- _mm512_mask_rsqrt28_round_ps((S), (M), A, _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_maskz_rsqrt28_ps(M, A) \
- _mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_rsqrt28_round_ss(A, B, R) \
- ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) \
- ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(S), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) \
- ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm_rsqrt28_ss(A, B) \
- _mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_mask_rsqrt28_ss(S, M, A, B) \
- _mm_mask_rsqrt28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_maskz_rsqrt28_ss(M, A, B) \
- _mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_rsqrt28_round_sd(A, B, R) \
- ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) \
- ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(S), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) \
- ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm_rsqrt28_sd(A, B) \
- _mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_mask_rsqrt28_sd(S, M, A, B) \
- _mm_mask_rsqrt28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_maskz_rsqrt28_sd(M, A, B) \
- _mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-/* rcp28 */
-#define _mm512_rcp28_round_pd(A, R) \
- ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_rcp28_round_pd(S, M, A, R) \
- ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(S), (__mmask8)(M), \
- (int)(R)))
-
-#define _mm512_maskz_rcp28_round_pd(M, A, R) \
- ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm512_rcp28_pd(A) \
- _mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_mask_rcp28_pd(S, M, A) \
- _mm512_mask_rcp28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_maskz_rcp28_pd(M, A) \
- _mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_rcp28_round_ps(A, R) \
- ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_rcp28_round_ps(S, M, A, R) \
- ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(S), (__mmask16)(M), \
- (int)(R)))
-
-#define _mm512_maskz_rcp28_round_ps(M, A, R) \
- ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)(M), (int)(R)))
-
-#define _mm512_rcp28_ps(A) \
- _mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_mask_rcp28_ps(S, M, A) \
- _mm512_mask_rcp28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_maskz_rcp28_ps(M, A) \
- _mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_rcp28_round_ss(A, B, R) \
- ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_rcp28_round_ss(S, M, A, B, R) \
- ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(S), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm_maskz_rcp28_round_ss(M, A, B, R) \
- ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm_rcp28_ss(A, B) \
- _mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_mask_rcp28_ss(S, M, A, B) \
- _mm_mask_rcp28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_maskz_rcp28_ss(M, A, B) \
- _mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_rcp28_round_sd(A, B, R) \
- ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_rcp28_round_sd(S, M, A, B, R) \
- ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(S), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm_maskz_rcp28_round_sd(M, A, B, R) \
- ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm_rcp28_sd(A, B) \
- _mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_mask_rcp28_sd(S, M, A, B) \
- _mm_mask_rcp28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_maskz_rcp28_sd(M, A, B) \
- _mm_maskz_rcp28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#endif /* __AVX512ERINTRIN_H */
diff --git a/clang/lib/Headers/avx512pfintrin.h b/clang/lib/Headers/avx512pfintrin.h
deleted file mode 100644
index f853be0..0000000
--- a/clang/lib/Headers/avx512pfintrin.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*===------------- avx512pfintrin.h - PF intrinsics ------------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512pfintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512PFINTRIN_H
-#define __AVX512PFINTRIN_H
-
-#define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) \
- __builtin_ia32_gatherpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
- (void const *)(addr), (int)(scale), \
- (int)(hint))
-
-#define _mm512_prefetch_i32gather_pd(index, addr, scale, hint) \
- __builtin_ia32_gatherpfdpd((__mmask8) -1, (__v8si)(__m256i)(index), \
- (void const *)(addr), (int)(scale), \
- (int)(hint))
-
-#define _mm512_mask_prefetch_i32gather_ps(index, mask, addr, scale, hint) \
- __builtin_ia32_gatherpfdps((__mmask16)(mask), \
- (__v16si)(__m512i)(index), (void const *)(addr), \
- (int)(scale), (int)(hint))
-
-#define _mm512_prefetch_i32gather_ps(index, addr, scale, hint) \
- __builtin_ia32_gatherpfdps((__mmask16) -1, \
- (__v16si)(__m512i)(index), (void const *)(addr), \
- (int)(scale), (int)(hint))
-
-#define _mm512_mask_prefetch_i64gather_pd(index, mask, addr, scale, hint) \
- __builtin_ia32_gatherpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
- (void const *)(addr), (int)(scale), \
- (int)(hint))
-
-#define _mm512_prefetch_i64gather_pd(index, addr, scale, hint) \
- __builtin_ia32_gatherpfqpd((__mmask8) -1, (__v8di)(__m512i)(index), \
- (void const *)(addr), (int)(scale), \
- (int)(hint))
-
-#define _mm512_mask_prefetch_i64gather_ps(index, mask, addr, scale, hint) \
- __builtin_ia32_gatherpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
- (void const *)(addr), (int)(scale), (int)(hint))
-
-#define _mm512_prefetch_i64gather_ps(index, addr, scale, hint) \
- __builtin_ia32_gatherpfqps((__mmask8) -1, (__v8di)(__m512i)(index), \
- (void const *)(addr), (int)(scale), (int)(hint))
-
-#define _mm512_prefetch_i32scatter_pd(addr, index, scale, hint) \
- __builtin_ia32_scatterpfdpd((__mmask8)-1, (__v8si)(__m256i)(index), \
- (void *)(addr), (int)(scale), \
- (int)(hint))
-
-#define _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, scale, hint) \
- __builtin_ia32_scatterpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
- (void *)(addr), (int)(scale), \
- (int)(hint))
-
-#define _mm512_prefetch_i32scatter_ps(addr, index, scale, hint) \
- __builtin_ia32_scatterpfdps((__mmask16)-1, (__v16si)(__m512i)(index), \
- (void *)(addr), (int)(scale), (int)(hint))
-
-#define _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, scale, hint) \
- __builtin_ia32_scatterpfdps((__mmask16)(mask), \
- (__v16si)(__m512i)(index), (void *)(addr), \
- (int)(scale), (int)(hint))
-
-#define _mm512_prefetch_i64scatter_pd(addr, index, scale, hint) \
- __builtin_ia32_scatterpfqpd((__mmask8)-1, (__v8di)(__m512i)(index), \
- (void *)(addr), (int)(scale), \
- (int)(hint))
-
-#define _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, scale, hint) \
- __builtin_ia32_scatterpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
- (void *)(addr), (int)(scale), \
- (int)(hint))
-
-#define _mm512_prefetch_i64scatter_ps(addr, index, scale, hint) \
- __builtin_ia32_scatterpfqps((__mmask8)-1, (__v8di)(__m512i)(index), \
- (void *)(addr), (int)(scale), (int)(hint))
-
-#define _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, scale, hint) \
- __builtin_ia32_scatterpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
- (void *)(addr), (int)(scale), (int)(hint))
-
-#endif
diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index 508696d..cd6cf09 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -151,10 +151,6 @@
#include <avx512vldqintrin.h>
#endif
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512ER__)
-#include <avx512erintrin.h>
-#endif
-
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512IFMA__)
#include <avx512ifmaintrin.h>
#endif
@@ -186,10 +182,6 @@
#include <avx512vlvbmi2intrin.h>
#endif
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512PF__)
-#include <avx512pfintrin.h>
-#endif
-
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512FP16__)
#include <avx512fp16intrin.h>
#endif
diff --git a/clang/lib/Headers/intrin.h b/clang/lib/Headers/intrin.h
index 7eb6dce..5ceb986 100644
--- a/clang/lib/Headers/intrin.h
+++ b/clang/lib/Headers/intrin.h
@@ -378,7 +378,7 @@ unsigned int _CountLeadingSigns64(__int64);
unsigned int _CountOneBits(unsigned long);
unsigned int _CountOneBits64(unsigned __int64);
-void __cdecl __prefetch(void *);
+void __cdecl __prefetch(const void *);
#endif
/*----------------------------------------------------------------------------*\
diff --git a/clang/lib/Headers/module.modulemap b/clang/lib/Headers/module.modulemap
index 4abfd1d..9ffc249 100644
--- a/clang/lib/Headers/module.modulemap
+++ b/clang/lib/Headers/module.modulemap
@@ -44,7 +44,6 @@ module _Builtin_intrinsics [system] [extern_c] {
textual header "avxintrin.h"
textual header "avx2intrin.h"
textual header "avx512fintrin.h"
- textual header "avx512erintrin.h"
textual header "fmaintrin.h"
header "x86intrin.h"
diff --git a/clang/lib/Index/IndexDecl.cpp b/clang/lib/Index/IndexDecl.cpp
index 8eb88f5..a7fa6c5 100644
--- a/clang/lib/Index/IndexDecl.cpp
+++ b/clang/lib/Index/IndexDecl.cpp
@@ -703,14 +703,16 @@ public:
IndexCtx.handleDecl(TP);
if (const auto *TTP = dyn_cast<TemplateTypeParmDecl>(TP)) {
if (TTP->hasDefaultArgument())
- IndexCtx.indexTypeSourceInfo(TTP->getDefaultArgumentInfo(), Parent);
+ handleTemplateArgumentLoc(TTP->getDefaultArgument(), Parent,
+ TP->getLexicalDeclContext());
if (auto *C = TTP->getTypeConstraint())
IndexCtx.handleReference(C->getNamedConcept(), C->getConceptNameLoc(),
Parent, TTP->getLexicalDeclContext());
} else if (const auto *NTTP = dyn_cast<NonTypeTemplateParmDecl>(TP)) {
IndexCtx.indexTypeSourceInfo(NTTP->getTypeSourceInfo(), Parent);
if (NTTP->hasDefaultArgument())
- IndexCtx.indexBody(NTTP->getDefaultArgument(), Parent);
+ handleTemplateArgumentLoc(NTTP->getDefaultArgument(), Parent,
+ TP->getLexicalDeclContext());
} else if (const auto *TTPD = dyn_cast<TemplateTemplateParmDecl>(TP)) {
if (TTPD->hasDefaultArgument())
handleTemplateArgumentLoc(TTPD->getDefaultArgument(), Parent,
diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp
index 445d3fd..86e8a6b 100644
--- a/clang/lib/Parse/ParseDecl.cpp
+++ b/clang/lib/Parse/ParseDecl.cpp
@@ -666,6 +666,9 @@ void Parser::ParseGNUAttributeArgs(
ParseBoundsAttribute(*AttrName, AttrNameLoc, Attrs, ScopeName, ScopeLoc,
Form);
return;
+ } else if (AttrKind == ParsedAttr::AT_CXXAssume) {
+ ParseCXXAssumeAttributeArg(Attrs, AttrName, AttrNameLoc, EndLoc, Form);
+ return;
}
// These may refer to the function arguments, but need to be parsed early to
@@ -720,6 +723,10 @@ unsigned Parser::ParseClangAttributeArgs(
ParseTypeTagForDatatypeAttribute(*AttrName, AttrNameLoc, Attrs, EndLoc,
ScopeName, ScopeLoc, Form);
break;
+
+ case ParsedAttr::AT_CXXAssume:
+ ParseCXXAssumeAttributeArg(Attrs, AttrName, AttrNameLoc, EndLoc, Form);
+ break;
}
return !Attrs.empty() ? Attrs.begin()->getNumArgs() : 0;
}
@@ -1923,9 +1930,8 @@ void Parser::DiagnoseCXX11AttributeExtension(ParsedAttributes &Attrs) {
// variable.
// This function moves attributes that should apply to the type off DS to Attrs.
void Parser::stripTypeAttributesOffDeclSpec(ParsedAttributes &Attrs,
- DeclSpec &DS,
- Sema::TagUseKind TUK) {
- if (TUK == Sema::TUK_Reference)
+ DeclSpec &DS, TagUseKind TUK) {
+ if (TUK == TagUseKind::Reference)
return;
llvm::SmallVector<ParsedAttr *, 1> ToBeMoved;
@@ -3306,6 +3312,19 @@ void Parser::ParseAlignmentSpecifier(ParsedAttributes &Attrs,
}
}
+void Parser::DistributeCLateParsedAttrs(Decl *Dcl,
+ LateParsedAttrList *LateAttrs) {
+ if (!LateAttrs)
+ return;
+
+ if (Dcl) {
+ for (auto *LateAttr : *LateAttrs) {
+ if (LateAttr->Decls.empty())
+ LateAttr->addDecl(Dcl);
+ }
+ }
+}
+
/// Bounds attributes (e.g., counted_by):
/// AttrName '(' expression ')'
void Parser::ParseBoundsAttribute(IdentifierInfo &AttrName,
@@ -4843,13 +4862,14 @@ static void DiagnoseCountAttributedTypeInUnnamedAnon(ParsingDeclSpec &DS,
///
void Parser::ParseStructDeclaration(
ParsingDeclSpec &DS,
- llvm::function_ref<void(ParsingFieldDeclarator &)> FieldsCallback) {
+ llvm::function_ref<Decl *(ParsingFieldDeclarator &)> FieldsCallback,
+ LateParsedAttrList *LateFieldAttrs) {
if (Tok.is(tok::kw___extension__)) {
// __extension__ silences extension warnings in the subexpression.
ExtensionRAIIObject O(Diags); // Use RAII to do this.
ConsumeToken();
- return ParseStructDeclaration(DS, FieldsCallback);
+ return ParseStructDeclaration(DS, FieldsCallback, LateFieldAttrs);
}
// Parse leading attributes.
@@ -4914,10 +4934,12 @@ void Parser::ParseStructDeclaration(
}
// If attributes exist after the declarator, parse them.
- MaybeParseGNUAttributes(DeclaratorInfo.D);
+ MaybeParseGNUAttributes(DeclaratorInfo.D, LateFieldAttrs);
// We're done with this declarator; invoke the callback.
- FieldsCallback(DeclaratorInfo);
+ Decl *Field = FieldsCallback(DeclaratorInfo);
+ if (Field)
+ DistributeCLateParsedAttrs(Field, LateFieldAttrs);
// If we don't have a comma, it is either the end of the list (a ';')
// or an error, bail out.
@@ -4928,6 +4950,73 @@ void Parser::ParseStructDeclaration(
}
}
+// TODO: All callers of this function should be moved to
+// `Parser::ParseLexedAttributeList`.
+void Parser::ParseLexedCAttributeList(LateParsedAttrList &LAs, bool EnterScope,
+ ParsedAttributes *OutAttrs) {
+ assert(LAs.parseSoon() &&
+ "Attribute list should be marked for immediate parsing.");
+ for (auto *LA : LAs) {
+ ParseLexedCAttribute(*LA, EnterScope, OutAttrs);
+ delete LA;
+ }
+ LAs.clear();
+}
+
+/// Finish parsing an attribute for which parsing was delayed.
+/// This will be called at the end of parsing a class declaration
+/// for each LateParsedAttribute. We consume the saved tokens and
+/// create an attribute with the arguments filled in. We add this
+/// to the Attribute list for the decl.
+void Parser::ParseLexedCAttribute(LateParsedAttribute &LA, bool EnterScope,
+ ParsedAttributes *OutAttrs) {
+ // Create a fake EOF so that attribute parsing won't go off the end of the
+ // attribute.
+ Token AttrEnd;
+ AttrEnd.startToken();
+ AttrEnd.setKind(tok::eof);
+ AttrEnd.setLocation(Tok.getLocation());
+ AttrEnd.setEofData(LA.Toks.data());
+ LA.Toks.push_back(AttrEnd);
+
+ // Append the current token at the end of the new token stream so that it
+ // doesn't get lost.
+ LA.Toks.push_back(Tok);
+ PP.EnterTokenStream(LA.Toks, /*DisableMacroExpansion=*/true,
+ /*IsReinject=*/true);
+ // Drop the current token and bring the first cached one. It's the same token
+ // as when we entered this function.
+ ConsumeAnyToken(/*ConsumeCodeCompletionTok=*/true);
+
+ // TODO: Use `EnterScope`
+ (void)EnterScope;
+
+ ParsedAttributes Attrs(AttrFactory);
+
+ assert(LA.Decls.size() <= 1 &&
+ "late field attribute expects to have at most one declaration.");
+
+ // Dispatch based on the attribute and parse it
+ ParseGNUAttributeArgs(&LA.AttrName, LA.AttrNameLoc, Attrs, nullptr, nullptr,
+ SourceLocation(), ParsedAttr::Form::GNU(), nullptr);
+
+ for (auto *D : LA.Decls)
+ Actions.ActOnFinishDelayedAttribute(getCurScope(), D, Attrs);
+
+ // Due to a parsing error, we either went over the cached tokens or
+ // there are still cached tokens left, so we skip the leftover tokens.
+ while (Tok.isNot(tok::eof))
+ ConsumeAnyToken();
+
+ // Consume the fake EOF token if it's there
+ if (Tok.is(tok::eof) && Tok.getEofData() == AttrEnd.getEofData())
+ ConsumeAnyToken();
+
+ if (OutAttrs) {
+ OutAttrs->takeAllFrom(Attrs);
+ }
+}
+
/// ParseStructUnionBody
/// struct-contents:
/// struct-declaration-list
@@ -4951,6 +5040,11 @@ void Parser::ParseStructUnionBody(SourceLocation RecordLoc,
ParseScope StructScope(this, Scope::ClassScope|Scope::DeclScope);
Actions.ActOnTagStartDefinition(getCurScope(), TagDecl);
+ // `LateAttrParseExperimentalExtOnly=true` requests that only attributes
+ // marked with `LateAttrParseExperimentalExt` are late parsed.
+ LateParsedAttrList LateFieldAttrs(/*PSoon=*/true,
+ /*LateAttrParseExperimentalExtOnly=*/true);
+
// While we still have something to read, read the declarations in the struct.
while (!tryParseMisplacedModuleImport() && Tok.isNot(tok::r_brace) &&
Tok.isNot(tok::eof)) {
@@ -5001,18 +5095,19 @@ void Parser::ParseStructUnionBody(SourceLocation RecordLoc,
}
if (!Tok.is(tok::at)) {
- auto CFieldCallback = [&](ParsingFieldDeclarator &FD) {
+ auto CFieldCallback = [&](ParsingFieldDeclarator &FD) -> Decl * {
// Install the declarator into the current TagDecl.
Decl *Field =
Actions.ActOnField(getCurScope(), TagDecl,
FD.D.getDeclSpec().getSourceRange().getBegin(),
FD.D, FD.BitfieldSize);
FD.complete(Field);
+ return Field;
};
// Parse all the comma separated declarators.
ParsingDeclSpec DS(*this);
- ParseStructDeclaration(DS, CFieldCallback);
+ ParseStructDeclaration(DS, CFieldCallback, &LateFieldAttrs);
} else { // Handle @defs
ConsumeToken();
if (!Tok.isObjCAtKeyword(tok::objc_defs)) {
@@ -5053,7 +5148,10 @@ void Parser::ParseStructUnionBody(SourceLocation RecordLoc,
ParsedAttributes attrs(AttrFactory);
// If attributes exist after struct contents, parse them.
- MaybeParseGNUAttributes(attrs);
+ MaybeParseGNUAttributes(attrs, &LateFieldAttrs);
+
+ // Late parse field attributes if necessary.
+ ParseLexedCAttributeList(LateFieldAttrs, /*EnterScope=*/false);
SmallVector<Decl *, 32> FieldDecls(TagDecl->fields());
@@ -5287,9 +5385,9 @@ void Parser::ParseEnumSpecifier(SourceLocation StartLoc, DeclSpec &DS,
// enum foo {..}; void bar() { enum foo; } <- new foo in bar.
// enum foo {..}; void bar() { enum foo x; } <- use of old foo.
//
- Sema::TagUseKind TUK;
+ TagUseKind TUK;
if (AllowEnumSpecifier == AllowDefiningTypeSpec::No)
- TUK = Sema::TUK_Reference;
+ TUK = TagUseKind::Reference;
else if (Tok.is(tok::l_brace)) {
if (DS.isFriendSpecified()) {
Diag(Tok.getLocation(), diag::err_friend_decl_defines_type)
@@ -5301,9 +5399,9 @@ void Parser::ParseEnumSpecifier(SourceLocation StartLoc, DeclSpec &DS,
ScopedEnumKWLoc = SourceLocation();
IsScopedUsingClassTag = false;
BaseType = TypeResult();
- TUK = Sema::TUK_Friend;
+ TUK = TagUseKind::Friend;
} else {
- TUK = Sema::TUK_Definition;
+ TUK = TagUseKind::Definition;
}
} else if (!isTypeSpecifier(DSC) &&
(Tok.is(tok::semi) ||
@@ -5312,7 +5410,7 @@ void Parser::ParseEnumSpecifier(SourceLocation StartLoc, DeclSpec &DS,
// An opaque-enum-declaration is required to be standalone (no preceding or
// following tokens in the declaration). Sema enforces this separately by
// diagnosing anything else in the DeclSpec.
- TUK = DS.isFriendSpecified() ? Sema::TUK_Friend : Sema::TUK_Declaration;
+ TUK = DS.isFriendSpecified() ? TagUseKind::Friend : TagUseKind::Declaration;
if (Tok.isNot(tok::semi)) {
// A semicolon was missing after this declaration. Diagnose and recover.
ExpectAndConsume(tok::semi, diag::err_expected_after, "enum");
@@ -5320,21 +5418,21 @@ void Parser::ParseEnumSpecifier(SourceLocation StartLoc, DeclSpec &DS,
Tok.setKind(tok::semi);
}
} else {
- TUK = Sema::TUK_Reference;
+ TUK = TagUseKind::Reference;
}
bool IsElaboratedTypeSpecifier =
- TUK == Sema::TUK_Reference || TUK == Sema::TUK_Friend;
+ TUK == TagUseKind::Reference || TUK == TagUseKind::Friend;
// If this is an elaborated type specifier nested in a larger declaration,
// and we delayed diagnostics before, just merge them into the current pool.
- if (TUK == Sema::TUK_Reference && shouldDelayDiagsInTag) {
+ if (TUK == TagUseKind::Reference && shouldDelayDiagsInTag) {
diagsFromTag.redelay();
}
MultiTemplateParamsArg TParams;
if (TemplateInfo.Kind != ParsedTemplateInfo::NonTemplate &&
- TUK != Sema::TUK_Reference) {
+ TUK != TagUseKind::Reference) {
if (!getLangOpts().CPlusPlus11 || !SS.isSet()) {
// Skip the rest of this declarator, up until the comma or semicolon.
Diag(Tok, diag::err_enum_template);
@@ -5355,7 +5453,7 @@ void Parser::ParseEnumSpecifier(SourceLocation StartLoc, DeclSpec &DS,
SS.setTemplateParamLists(TParams);
}
- if (!Name && TUK != Sema::TUK_Definition) {
+ if (!Name && TUK != TagUseKind::Definition) {
Diag(Tok, diag::err_enumerator_unnamed_no_def);
DS.SetTypeSpecError();
@@ -5388,7 +5486,7 @@ void Parser::ParseEnumSpecifier(SourceLocation StartLoc, DeclSpec &DS,
stripTypeAttributesOffDeclSpec(attrs, DS, TUK);
SkipBodyInfo SkipBody;
- if (!Name && TUK == Sema::TUK_Definition && Tok.is(tok::l_brace) &&
+ if (!Name && TUK == TagUseKind::Definition && Tok.is(tok::l_brace) &&
NextToken().is(tok::identifier))
SkipBody = Actions.shouldSkipAnonEnumBody(getCurScope(),
NextToken().getIdentifierInfo(),
@@ -5409,7 +5507,7 @@ void Parser::ParseEnumSpecifier(SourceLocation StartLoc, DeclSpec &DS,
OffsetOfState, &SkipBody).get();
if (SkipBody.ShouldSkip) {
- assert(TUK == Sema::TUK_Definition && "can only skip a definition");
+ assert(TUK == TagUseKind::Definition && "can only skip a definition");
BalancedDelimiterTracker T(*this, tok::l_brace);
T.consumeOpen();
@@ -5451,7 +5549,7 @@ void Parser::ParseEnumSpecifier(SourceLocation StartLoc, DeclSpec &DS,
if (!TagDecl) {
// The action failed to produce an enumeration tag. If this is a
// definition, consume the entire definition.
- if (Tok.is(tok::l_brace) && TUK != Sema::TUK_Reference) {
+ if (Tok.is(tok::l_brace) && TUK != TagUseKind::Reference) {
ConsumeBrace();
SkipUntil(tok::r_brace, StopAtSemi);
}
@@ -5460,7 +5558,7 @@ void Parser::ParseEnumSpecifier(SourceLocation StartLoc, DeclSpec &DS,
return;
}
- if (Tok.is(tok::l_brace) && TUK == Sema::TUK_Definition) {
+ if (Tok.is(tok::l_brace) && TUK == TagUseKind::Definition) {
Decl *D = SkipBody.CheckSameAsPrevious ? SkipBody.New : TagDecl;
ParseEnumBody(StartLoc, D);
if (SkipBody.CheckSameAsPrevious &&
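The ParseDecl.cpp changes introduce a restricted late-parsing path for C record fields: attributes marked LateAttrParseExperimentalExt have their tokens cached while the struct body is parsed, then ParseLexedCAttributeList replays them once every field exists and DistributeCLateParsedAttrs has attached each attribute to its field. A hedged sketch of the kind of input this enables; the attribute name and the -fexperimental-late-parse-attributes flag are assumptions drawn from the experimental bounds-safety work and are not spelled out in this diff:

/* clang -fexperimental-late-parse-attributes -c packet.c (illustrative) */
struct packet {
  /* The argument names a field that is only declared below; the reference can
     resolve because the attribute is parsed after the whole struct body. */
  char *payload __attribute__((counted_by(length)));
  int length;
};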
diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp
index 5eaec2b..9a4a777 100644
--- a/clang/lib/Parse/ParseDeclCXX.cpp
+++ b/clang/lib/Parse/ParseDeclCXX.cpp
@@ -1961,11 +1961,11 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
MaybeParseCXX11Attributes(Attributes);
const PrintingPolicy &Policy = Actions.getASTContext().getPrintingPolicy();
- Sema::TagUseKind TUK;
+ TagUseKind TUK;
if (isDefiningTypeSpecifierContext(DSC, getLangOpts().CPlusPlus) ==
AllowDefiningTypeSpec::No ||
(getLangOpts().OpenMP && OpenMPDirectiveParsing))
- TUK = Sema::TUK_Reference;
+ TUK = TagUseKind::Reference;
else if (Tok.is(tok::l_brace) ||
(DSC != DeclSpecContext::DSC_association &&
getLangOpts().CPlusPlus && Tok.is(tok::colon)) ||
@@ -1980,10 +1980,10 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
// Skip everything up to the semicolon, so that this looks like a proper
// friend class (or template thereof) declaration.
SkipUntil(tok::semi, StopBeforeMatch);
- TUK = Sema::TUK_Friend;
+ TUK = TagUseKind::Friend;
} else {
// Okay, this is a class definition.
- TUK = Sema::TUK_Definition;
+ TUK = TagUseKind::Definition;
}
} else if (isClassCompatibleKeyword() &&
(NextToken().is(tok::l_square) ||
@@ -2024,15 +2024,15 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
}
if (Tok.isOneOf(tok::l_brace, tok::colon))
- TUK = Sema::TUK_Definition;
+ TUK = TagUseKind::Definition;
else
- TUK = Sema::TUK_Reference;
+ TUK = TagUseKind::Reference;
PA.Revert();
} else if (!isTypeSpecifier(DSC) &&
(Tok.is(tok::semi) ||
(Tok.isAtStartOfLine() && !isValidAfterTypeSpecifier(false)))) {
- TUK = DS.isFriendSpecified() ? Sema::TUK_Friend : Sema::TUK_Declaration;
+ TUK = DS.isFriendSpecified() ? TagUseKind::Friend : TagUseKind::Declaration;
if (Tok.isNot(tok::semi)) {
const PrintingPolicy &PPol = Actions.getASTContext().getPrintingPolicy();
// A semicolon was missing after this declaration. Diagnose and recover.
@@ -2042,11 +2042,11 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
Tok.setKind(tok::semi);
}
} else
- TUK = Sema::TUK_Reference;
+ TUK = TagUseKind::Reference;
// Forbid misplaced attributes. In cases of a reference, we pass attributes
// to caller to handle.
- if (TUK != Sema::TUK_Reference) {
+ if (TUK != TagUseKind::Reference) {
// If this is not a reference, then the only possible
// valid place for C++11 attributes to appear here
// is between class-key and class-name. If there are
@@ -2072,7 +2072,7 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
if (!Name && !TemplateId &&
(DS.getTypeSpecType() == DeclSpec::TST_error ||
- TUK != Sema::TUK_Definition)) {
+ TUK != TagUseKind::Definition)) {
if (DS.getTypeSpecType() != DeclSpec::TST_error) {
// We have a declaration or reference to an anonymous class.
Diag(StartLoc, diag::err_anon_type_definition)
@@ -2082,7 +2082,7 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
// If we are parsing a definition and stop at a base-clause, continue on
// until the semicolon. Continuing from the comma will just trick us into
// thinking we are seeing a variable declaration.
- if (TUK == Sema::TUK_Definition && Tok.is(tok::colon))
+ if (TUK == TagUseKind::Definition && Tok.is(tok::colon))
SkipUntil(tok::semi, StopBeforeMatch);
else
SkipUntil(tok::comma, StopAtSemi);
@@ -2103,7 +2103,7 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
if (TemplateId->isInvalid()) {
// Can't build the declaration.
} else if (TemplateInfo.Kind == ParsedTemplateInfo::ExplicitInstantiation &&
- TUK == Sema::TUK_Declaration) {
+ TUK == TagUseKind::Declaration) {
// This is an explicit instantiation of a class template.
ProhibitCXX11Attributes(attrs, diag::err_attributes_not_allowed,
diag::err_keyword_not_allowed,
@@ -2119,8 +2119,8 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
// they have template headers, in which case they're ill-formed
// (FIXME: "template <class T> friend class A<T>::B<int>;").
// We diagnose this error in ActOnClassTemplateSpecialization.
- } else if (TUK == Sema::TUK_Reference ||
- (TUK == Sema::TUK_Friend &&
+ } else if (TUK == TagUseKind::Reference ||
+ (TUK == TagUseKind::Friend &&
TemplateInfo.Kind == ParsedTemplateInfo::NonTemplate)) {
ProhibitCXX11Attributes(attrs, diag::err_attributes_not_allowed,
diag::err_keyword_not_allowed,
@@ -2145,10 +2145,10 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
      // If this is a friend declaration, however, since it cannot have a
// template header, it is most likely that the user meant to
// remove the 'template' keyword.
- assert((TUK == Sema::TUK_Definition || TUK == Sema::TUK_Friend) &&
+ assert((TUK == TagUseKind::Definition || TUK == TagUseKind::Friend) &&
"Expected a definition here");
- if (TUK == Sema::TUK_Friend) {
+ if (TUK == TagUseKind::Friend) {
Diag(DS.getFriendSpecLoc(), diag::err_friend_explicit_instantiation);
TemplateParams = nullptr;
} else {
@@ -2179,7 +2179,7 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
&SkipBody);
}
} else if (TemplateInfo.Kind == ParsedTemplateInfo::ExplicitInstantiation &&
- TUK == Sema::TUK_Declaration) {
+ TUK == TagUseKind::Declaration) {
// Explicit instantiation of a member of a class template
// specialization, e.g.,
//
@@ -2190,7 +2190,7 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
TagOrTempResult = Actions.ActOnExplicitInstantiation(
getCurScope(), TemplateInfo.ExternLoc, TemplateInfo.TemplateLoc,
TagType, StartLoc, SS, Name, NameLoc, attrs);
- } else if (TUK == Sema::TUK_Friend &&
+ } else if (TUK == TagUseKind::Friend &&
TemplateInfo.Kind != ParsedTemplateInfo::NonTemplate) {
ProhibitCXX11Attributes(attrs, diag::err_attributes_not_allowed,
diag::err_keyword_not_allowed,
@@ -2202,12 +2202,12 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
MultiTemplateParamsArg(TemplateParams ? &(*TemplateParams)[0] : nullptr,
TemplateParams ? TemplateParams->size() : 0));
} else {
- if (TUK != Sema::TUK_Declaration && TUK != Sema::TUK_Definition)
+ if (TUK != TagUseKind::Declaration && TUK != TagUseKind::Definition)
ProhibitCXX11Attributes(attrs, diag::err_attributes_not_allowed,
diag::err_keyword_not_allowed,
/* DiagnoseEmptyAttrs=*/true);
- if (TUK == Sema::TUK_Definition &&
+ if (TUK == TagUseKind::Definition &&
TemplateInfo.Kind == ParsedTemplateInfo::ExplicitInstantiation) {
// If the declarator-id is not a template-id, issue a diagnostic and
// recover by ignoring the 'template' keyword.
@@ -2222,7 +2222,7 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
// reference. For example, we don't need the template parameters here:
// template <class T> class A *makeA(T t);
MultiTemplateParamsArg TParams;
- if (TUK != Sema::TUK_Reference && TemplateParams)
+ if (TUK != TagUseKind::Reference && TemplateParams)
TParams =
MultiTemplateParamsArg(&(*TemplateParams)[0], TemplateParams->size());
@@ -2241,7 +2241,7 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
// If ActOnTag said the type was dependent, try again with the
// less common call.
if (IsDependent) {
- assert(TUK == Sema::TUK_Reference || TUK == Sema::TUK_Friend);
+ assert(TUK == TagUseKind::Reference || TUK == TagUseKind::Friend);
TypeResult = Actions.ActOnDependentTag(getCurScope(), TagType, TUK, SS,
Name, StartLoc, NameLoc);
}
@@ -2252,13 +2252,13 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
// just merge them into the current pool.
if (shouldDelayDiagsInTag) {
diagsFromTag.done();
- if (TUK == Sema::TUK_Reference &&
+ if (TUK == TagUseKind::Reference &&
TemplateInfo.Kind == ParsedTemplateInfo::Template)
diagsFromTag.redelay();
}
// If there is a body, parse it and inform the actions module.
- if (TUK == Sema::TUK_Definition) {
+ if (TUK == TagUseKind::Definition) {
assert(Tok.is(tok::l_brace) ||
(getLangOpts().CPlusPlus && Tok.is(tok::colon)) ||
isClassCompatibleKeyword());
@@ -2316,7 +2316,7 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
//
// After a type-specifier, we don't expect a semicolon. This only happens in
// C, since definitions are not permitted in this context in C++.
- if (TUK == Sema::TUK_Definition &&
+ if (TUK == TagUseKind::Definition &&
(getLangOpts().CPlusPlus || !isTypeSpecifier(DSC)) &&
(TemplateInfo.Kind || !isValidAfterTypeSpecifier(false))) {
if (Tok.isNot(tok::semi)) {
@@ -4560,7 +4560,8 @@ static bool IsBuiltInOrStandardCXX11Attribute(IdentifierInfo *AttrName,
bool Parser::ParseCXXAssumeAttributeArg(ParsedAttributes &Attrs,
IdentifierInfo *AttrName,
SourceLocation AttrNameLoc,
- SourceLocation *EndLoc) {
+ SourceLocation *EndLoc,
+ ParsedAttr::Form Form) {
assert(Tok.is(tok::l_paren) && "Not a C++11 attribute argument list");
BalancedDelimiterTracker T(*this, tok::l_paren);
T.consumeOpen();
@@ -4603,7 +4604,7 @@ bool Parser::ParseCXXAssumeAttributeArg(ParsedAttributes &Attrs,
auto RParen = Tok.getLocation();
T.consumeClose();
Attrs.addNew(AttrName, SourceRange(AttrNameLoc, RParen), nullptr,
- SourceLocation(), &Assumption, 1, ParsedAttr::Form::CXX11());
+ SourceLocation(), &Assumption, 1, Form);
if (EndLoc)
*EndLoc = RParen;
@@ -4683,7 +4684,7 @@ bool Parser::ParseCXX11AttributeArgs(
ScopeName, ScopeLoc, Form);
// So does C++23's assume() attribute.
else if (!ScopeName && AttrName->isStr("assume")) {
- if (ParseCXXAssumeAttributeArg(Attrs, AttrName, AttrNameLoc, EndLoc))
+ if (ParseCXXAssumeAttributeArg(Attrs, AttrName, AttrNameLoc, EndLoc, Form))
return true;
NumArgs = 1;
} else
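Threading ParsedAttr::Form through ParseCXXAssumeAttributeArg, together with the AT_CXXAssume dispatch cases added in ParseDecl.cpp earlier in this diff, means the assumption attribute's argument parser no longer hard-codes the C++11 form, so non-standard spellings are recorded with the form they were actually written in. Illustrative spellings that now share the same argument parsing; the availability of the GNU and clang-namespace spellings is inferred from the new dispatch cases, not stated elsewhere in this patch:

void f(int x) {
  [[assume(x > 0)]];              // C++23 standard spelling
  [[clang::assume(x > 0)]];       // clang-namespace spelling, via ParseClangAttributeArgs
  __attribute__((assume(x > 0))); // GNU spelling, via ParseGNUAttributeArgs
}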
diff --git a/clang/lib/Parse/ParseObjc.cpp b/clang/lib/Parse/ParseObjc.cpp
index 89f4acb..6a2088a 100644
--- a/clang/lib/Parse/ParseObjc.cpp
+++ b/clang/lib/Parse/ParseObjc.cpp
@@ -780,16 +780,16 @@ void Parser::ParseObjCInterfaceDeclList(tok::ObjCKeywordKind contextKey,
}
bool addedToDeclSpec = false;
- auto ObjCPropertyCallback = [&](ParsingFieldDeclarator &FD) {
+ auto ObjCPropertyCallback = [&](ParsingFieldDeclarator &FD) -> Decl * {
if (FD.D.getIdentifier() == nullptr) {
Diag(AtLoc, diag::err_objc_property_requires_field_name)
<< FD.D.getSourceRange();
- return;
+ return nullptr;
}
if (FD.BitfieldSize) {
Diag(AtLoc, diag::err_objc_property_bitfield)
<< FD.D.getSourceRange();
- return;
+ return nullptr;
}
// Map a nullability property attribute to a context-sensitive keyword
@@ -818,6 +818,7 @@ void Parser::ParseObjCInterfaceDeclList(tok::ObjCKeywordKind contextKey,
MethodImplKind);
FD.complete(Property);
+ return Property;
};
// Parse all the comma separated declarators.
@@ -2013,7 +2014,7 @@ void Parser::ParseObjCClassInstanceVariables(ObjCContainerDecl *interfaceDecl,
continue;
}
- auto ObjCIvarCallback = [&](ParsingFieldDeclarator &FD) {
+ auto ObjCIvarCallback = [&](ParsingFieldDeclarator &FD) -> Decl * {
assert(getObjCDeclContext() == interfaceDecl &&
"Ivar should have interfaceDecl as its decl context");
// Install the declarator into the interface decl.
@@ -2024,6 +2025,7 @@ void Parser::ParseObjCClassInstanceVariables(ObjCContainerDecl *interfaceDecl,
if (Field)
AllIvarDecls.push_back(Field);
FD.complete(Field);
+ return Field;
};
// Parse all the comma separated declarators.
diff --git a/clang/lib/Parse/ParsePragma.cpp b/clang/lib/Parse/ParsePragma.cpp
index 643fdac..cc6f18b 100644
--- a/clang/lib/Parse/ParsePragma.cpp
+++ b/clang/lib/Parse/ParsePragma.cpp
@@ -23,6 +23,7 @@
#include "clang/Sema/Scope.h"
#include "clang/Sema/SemaCUDA.h"
#include "clang/Sema/SemaCodeCompletion.h"
+#include "clang/Sema/SemaRISCV.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringSwitch.h"
#include <optional>
@@ -4154,7 +4155,7 @@ void PragmaRISCVHandler::HandlePragma(Preprocessor &PP,
}
if (II->isStr("vector"))
- Actions.DeclareRISCVVBuiltins = true;
+ Actions.RISCV().DeclareRVVBuiltins = true;
else if (II->isStr("sifive_vector"))
- Actions.DeclareRISCVSiFiveVectorBuiltins = true;
+ Actions.RISCV().DeclareSiFiveVectorBuiltins = true;
}
diff --git a/clang/lib/Sema/CMakeLists.txt b/clang/lib/Sema/CMakeLists.txt
index 6b7742c..fe6471c 100644
--- a/clang/lib/Sema/CMakeLists.txt
+++ b/clang/lib/Sema/CMakeLists.txt
@@ -71,6 +71,7 @@ add_clang_library(clangSema
SemaTemplateInstantiateDecl.cpp
SemaTemplateVariadic.cpp
SemaType.cpp
+ SemaX86.cpp
TypeLocBuilder.cpp
DEPENDS
diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp
index bb283c5..a2b29a7 100644
--- a/clang/lib/Sema/HLSLExternalSemaSource.cpp
+++ b/clang/lib/Sema/HLSLExternalSemaSource.cpp
@@ -308,17 +308,18 @@ struct BuiltinTypeDeclBuilder {
return *this;
}
- TemplateParameterListBuilder addTemplateArgumentList();
- BuiltinTypeDeclBuilder &addSimpleTemplateParams(ArrayRef<StringRef> Names);
+ TemplateParameterListBuilder addTemplateArgumentList(Sema &S);
+ BuiltinTypeDeclBuilder &addSimpleTemplateParams(Sema &S,
+ ArrayRef<StringRef> Names);
};
struct TemplateParameterListBuilder {
BuiltinTypeDeclBuilder &Builder;
- ASTContext &AST;
+ Sema &S;
llvm::SmallVector<NamedDecl *> Params;
- TemplateParameterListBuilder(BuiltinTypeDeclBuilder &RB)
- : Builder(RB), AST(RB.Record->getASTContext()) {}
+ TemplateParameterListBuilder(Sema &S, BuiltinTypeDeclBuilder &RB)
+ : Builder(RB), S(S) {}
~TemplateParameterListBuilder() { finalizeTemplateArgs(); }
@@ -328,12 +329,15 @@ struct TemplateParameterListBuilder {
return *this;
unsigned Position = static_cast<unsigned>(Params.size());
auto *Decl = TemplateTypeParmDecl::Create(
- AST, Builder.Record->getDeclContext(), SourceLocation(),
+ S.Context, Builder.Record->getDeclContext(), SourceLocation(),
SourceLocation(), /* TemplateDepth */ 0, Position,
- &AST.Idents.get(Name, tok::TokenKind::identifier), /* Typename */ false,
+ &S.Context.Idents.get(Name, tok::TokenKind::identifier),
+ /* Typename */ false,
/* ParameterPack */ false);
if (!DefaultValue.isNull())
- Decl->setDefaultArgument(AST.getTrivialTypeSourceInfo(DefaultValue));
+ Decl->setDefaultArgument(
+ S.Context, S.getTrivialTemplateArgumentLoc(DefaultValue, QualType(),
+ SourceLocation()));
Params.emplace_back(Decl);
return *this;
@@ -342,11 +346,11 @@ struct TemplateParameterListBuilder {
BuiltinTypeDeclBuilder &finalizeTemplateArgs() {
if (Params.empty())
return Builder;
- auto *ParamList =
- TemplateParameterList::Create(AST, SourceLocation(), SourceLocation(),
- Params, SourceLocation(), nullptr);
+ auto *ParamList = TemplateParameterList::Create(S.Context, SourceLocation(),
+ SourceLocation(), Params,
+ SourceLocation(), nullptr);
Builder.Template = ClassTemplateDecl::Create(
- AST, Builder.Record->getDeclContext(), SourceLocation(),
+ S.Context, Builder.Record->getDeclContext(), SourceLocation(),
DeclarationName(Builder.Record->getIdentifier()), ParamList,
Builder.Record);
Builder.Record->setDescribedClassTemplate(Builder.Template);
@@ -359,20 +363,22 @@ struct TemplateParameterListBuilder {
Params.clear();
QualType T = Builder.Template->getInjectedClassNameSpecialization();
- T = AST.getInjectedClassNameType(Builder.Record, T);
+ T = S.Context.getInjectedClassNameType(Builder.Record, T);
return Builder;
}
};
} // namespace
-TemplateParameterListBuilder BuiltinTypeDeclBuilder::addTemplateArgumentList() {
- return TemplateParameterListBuilder(*this);
+TemplateParameterListBuilder
+BuiltinTypeDeclBuilder::addTemplateArgumentList(Sema &S) {
+ return TemplateParameterListBuilder(S, *this);
}
BuiltinTypeDeclBuilder &
-BuiltinTypeDeclBuilder::addSimpleTemplateParams(ArrayRef<StringRef> Names) {
- TemplateParameterListBuilder Builder = this->addTemplateArgumentList();
+BuiltinTypeDeclBuilder::addSimpleTemplateParams(Sema &S,
+ ArrayRef<StringRef> Names) {
+ TemplateParameterListBuilder Builder = this->addTemplateArgumentList(S);
for (StringRef Name : Names)
Builder.addTypeParameter(Name);
return Builder.finalizeTemplateArgs();
@@ -426,7 +432,9 @@ void HLSLExternalSemaSource::defineHLSLVectorAlias() {
auto *TypeParam = TemplateTypeParmDecl::Create(
AST, HLSLNamespace, SourceLocation(), SourceLocation(), 0, 0,
&AST.Idents.get("element", tok::TokenKind::identifier), false, false);
- TypeParam->setDefaultArgument(AST.getTrivialTypeSourceInfo(AST.FloatTy));
+ TypeParam->setDefaultArgument(
+ AST, SemaPtr->getTrivialTemplateArgumentLoc(
+ TemplateArgument(AST.FloatTy), QualType(), SourceLocation()));
TemplateParams.emplace_back(TypeParam);
@@ -434,10 +442,12 @@ void HLSLExternalSemaSource::defineHLSLVectorAlias() {
AST, HLSLNamespace, SourceLocation(), SourceLocation(), 0, 1,
&AST.Idents.get("element_count", tok::TokenKind::identifier), AST.IntTy,
false, AST.getTrivialTypeSourceInfo(AST.IntTy));
- Expr *LiteralExpr =
- IntegerLiteral::Create(AST, llvm::APInt(AST.getIntWidth(AST.IntTy), 4),
- AST.IntTy, SourceLocation());
- SizeParam->setDefaultArgument(LiteralExpr);
+ llvm::APInt Val(AST.getIntWidth(AST.IntTy), 4);
+ TemplateArgument Default(AST, llvm::APSInt(std::move(Val)), AST.IntTy,
+ /*IsDefaulted=*/true);
+ SizeParam->setDefaultArgument(
+ AST, SemaPtr->getTrivialTemplateArgumentLoc(Default, AST.IntTy,
+ SourceLocation(), SizeParam));
TemplateParams.emplace_back(SizeParam);
auto *ParamList =
@@ -492,7 +502,7 @@ static BuiltinTypeDeclBuilder setupBufferType(CXXRecordDecl *Decl, Sema &S,
void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() {
CXXRecordDecl *Decl;
Decl = BuiltinTypeDeclBuilder(*SemaPtr, HLSLNamespace, "RWBuffer")
- .addSimpleTemplateParams({"element_type"})
+ .addSimpleTemplateParams(*SemaPtr, {"element_type"})
.Record;
onCompletion(Decl, [this](CXXRecordDecl *Decl) {
setupBufferType(Decl, *SemaPtr, ResourceClass::UAV,
@@ -503,7 +513,7 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() {
Decl =
BuiltinTypeDeclBuilder(*SemaPtr, HLSLNamespace, "RasterizerOrderedBuffer")
- .addSimpleTemplateParams({"element_type"})
+ .addSimpleTemplateParams(*SemaPtr, {"element_type"})
.Record;
onCompletion(Decl, [this](CXXRecordDecl *Decl) {
setupBufferType(Decl, *SemaPtr, ResourceClass::UAV,
diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index 2c5774d..d1fb21b 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -50,7 +50,9 @@
#include "clang/Sema/SemaOpenACC.h"
#include "clang/Sema/SemaOpenMP.h"
#include "clang/Sema/SemaPseudoObject.h"
+#include "clang/Sema/SemaRISCV.h"
#include "clang/Sema/SemaSYCL.h"
+#include "clang/Sema/SemaX86.h"
#include "clang/Sema/TemplateDeduction.h"
#include "clang/Sema/TemplateInstCallback.h"
#include "clang/Sema/TypoCorrection.h"
@@ -212,7 +214,9 @@ Sema::Sema(Preprocessor &pp, ASTContext &ctxt, ASTConsumer &consumer,
OpenACCPtr(std::make_unique<SemaOpenACC>(*this)),
OpenMPPtr(std::make_unique<SemaOpenMP>(*this)),
PseudoObjectPtr(std::make_unique<SemaPseudoObject>(*this)),
+ RISCVPtr(std::make_unique<SemaRISCV>(*this)),
SYCLPtr(std::make_unique<SemaSYCL>(*this)),
+ X86Ptr(std::make_unique<SemaX86>(*this)),
MSPointerToMemberRepresentationMethod(
LangOpts.getMSPointerToMemberRepresentationMethod()),
MSStructPragmaOn(false), VtorDispStack(LangOpts.getVtorDispMode()),
@@ -2051,7 +2055,7 @@ void Sema::checkTypeSupport(QualType Ty, SourceLocation Loc, ValueDecl *D) {
if (TI.hasRISCVVTypes() && Ty->isRVVSizelessBuiltinType() && FD) {
llvm::StringMap<bool> CallerFeatureMap;
Context.getFunctionFeatureMap(CallerFeatureMap, FD);
- checkRVVTypeSupport(Ty, Loc, D, CallerFeatureMap);
+ RISCV().checkRVVTypeSupport(Ty, Loc, D, CallerFeatureMap);
}
// Don't allow SVE types in functions without a SVE target.
diff --git a/clang/lib/Sema/SemaAvailability.cpp b/clang/lib/Sema/SemaAvailability.cpp
index 663b6f3..22f5a2f 100644
--- a/clang/lib/Sema/SemaAvailability.cpp
+++ b/clang/lib/Sema/SemaAvailability.cpp
@@ -987,11 +987,6 @@ void Sema::DiagnoseUnguardedAvailabilityViolations(Decl *D) {
Stmt *Body = nullptr;
if (auto *FD = D->getAsFunction()) {
- // FIXME: We only examine the pattern decl for availability violations now,
- // but we should also examine instantiated templates.
- if (FD->isTemplateInstantiation())
- return;
-
Body = FD->getBody();
if (auto *CD = dyn_cast<CXXConstructorDecl>(FD))
diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp
index 483ec7e..7db6b1d 100644
--- a/clang/lib/Sema/SemaCast.cpp
+++ b/clang/lib/Sema/SemaCast.cpp
@@ -25,6 +25,7 @@
#include "clang/Sema/Initialization.h"
#include "clang/Sema/SemaInternal.h"
#include "clang/Sema/SemaObjC.h"
+#include "clang/Sema/SemaRISCV.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include <set>
@@ -2391,7 +2392,7 @@ static TryCastResult TryReinterpretCast(Sema &Self, ExprResult &SrcExpr,
}
// Allow bitcasting between SVE VLATs and VLSTs, and vice-versa.
- if (Self.isValidRVVBitcast(SrcType, DestType)) {
+ if (Self.RISCV().isValidRVVBitcast(SrcType, DestType)) {
Kind = CK_BitCast;
return TC_Success;
}
@@ -3002,7 +3003,7 @@ void CastOperation::CheckCStyleCast() {
// Allow bitcasting between compatible RVV vector types.
if ((SrcType->isVectorType() || DestType->isVectorType()) &&
- Self.isValidRVVBitcast(SrcType, DestType)) {
+ Self.RISCV().isValidRVVBitcast(SrcType, DestType)) {
Kind = CK_BitCast;
return;
}
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index f2dc8e9..fac9a58 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -63,6 +63,8 @@
#include "clang/Sema/Sema.h"
#include "clang/Sema/SemaInternal.h"
#include "clang/Sema/SemaObjC.h"
+#include "clang/Sema/SemaRISCV.h"
+#include "clang/Sema/SemaX86.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/APSInt.h"
@@ -120,13 +122,12 @@ static constexpr unsigned short combineFAPK(Sema::FormatArgumentPassingKind A,
/// Checks that a call expression's argument count is at least the desired
/// number. This is useful when doing custom type-checking on a variadic
/// function. Returns true on error.
-static bool checkArgCountAtLeast(Sema &S, CallExpr *Call,
- unsigned MinArgCount) {
+bool Sema::checkArgCountAtLeast(CallExpr *Call, unsigned MinArgCount) {
unsigned ArgCount = Call->getNumArgs();
if (ArgCount >= MinArgCount)
return false;
- return S.Diag(Call->getEndLoc(), diag::err_typecheck_call_too_few_args)
+ return Diag(Call->getEndLoc(), diag::err_typecheck_call_too_few_args)
<< 0 /*function call*/ << MinArgCount << ArgCount
<< /*is non object*/ 0 << Call->getSourceRange();
}
@@ -134,12 +135,11 @@ static bool checkArgCountAtLeast(Sema &S, CallExpr *Call,
/// Checks that a call expression's argument count is at most the desired
/// number. This is useful when doing custom type-checking on a variadic
/// function. Returns true on error.
-static bool checkArgCountAtMost(Sema &S, CallExpr *Call, unsigned MaxArgCount) {
+bool Sema::checkArgCountAtMost(CallExpr *Call, unsigned MaxArgCount) {
unsigned ArgCount = Call->getNumArgs();
if (ArgCount <= MaxArgCount)
return false;
- return S.Diag(Call->getEndLoc(),
- diag::err_typecheck_call_too_many_args_at_most)
+ return Diag(Call->getEndLoc(), diag::err_typecheck_call_too_many_args_at_most)
<< 0 /*function call*/ << MaxArgCount << ArgCount
<< /*is non object*/ 0 << Call->getSourceRange();
}
@@ -147,20 +147,20 @@ static bool checkArgCountAtMost(Sema &S, CallExpr *Call, unsigned MaxArgCount) {
/// Checks that a call expression's argument count is in the desired range. This
/// is useful when doing custom type-checking on a variadic function. Returns
/// true on error.
-static bool checkArgCountRange(Sema &S, CallExpr *Call, unsigned MinArgCount,
- unsigned MaxArgCount) {
- return checkArgCountAtLeast(S, Call, MinArgCount) ||
- checkArgCountAtMost(S, Call, MaxArgCount);
+bool Sema::checkArgCountRange(CallExpr *Call, unsigned MinArgCount,
+ unsigned MaxArgCount) {
+ return checkArgCountAtLeast(Call, MinArgCount) ||
+ checkArgCountAtMost(Call, MaxArgCount);
}
/// Checks that a call expression's argument count is the desired number.
/// This is useful when doing custom type-checking. Returns true on error.
-static bool checkArgCount(Sema &S, CallExpr *Call, unsigned DesiredArgCount) {
+bool Sema::checkArgCount(CallExpr *Call, unsigned DesiredArgCount) {
unsigned ArgCount = Call->getNumArgs();
if (ArgCount == DesiredArgCount)
return false;
- if (checkArgCountAtLeast(S, Call, DesiredArgCount))
+ if (checkArgCountAtLeast(Call, DesiredArgCount))
return true;
assert(ArgCount > DesiredArgCount && "should have diagnosed this");
@@ -168,7 +168,7 @@ static bool checkArgCount(Sema &S, CallExpr *Call, unsigned DesiredArgCount) {
SourceRange Range(Call->getArg(DesiredArgCount)->getBeginLoc(),
Call->getArg(ArgCount - 1)->getEndLoc());
- return S.Diag(Range.getBegin(), diag::err_typecheck_call_too_many_args)
+ return Diag(Range.getBegin(), diag::err_typecheck_call_too_many_args)
<< 0 /*function call*/ << DesiredArgCount << ArgCount
<< /*is non object*/ 0 << Call->getArg(1)->getSourceRange();
}
@@ -190,7 +190,7 @@ static bool convertArgumentToType(Sema &S, Expr *&Value, QualType Ty) {
/// Check that the first argument to __builtin_annotation is an integer
/// and the second argument is a non-wide string literal.
static bool BuiltinAnnotation(Sema &S, CallExpr *TheCall) {
- if (checkArgCount(S, TheCall, 2))
+ if (S.checkArgCount(TheCall, 2))
return true;
// First argument should be an integer.
@@ -240,7 +240,7 @@ static bool BuiltinMSVCAnnotation(Sema &S, CallExpr *TheCall) {
/// Check that the argument to __builtin_addressof is a glvalue, and set the
/// result type to the corresponding pointer type.
static bool BuiltinAddressof(Sema &S, CallExpr *TheCall) {
- if (checkArgCount(S, TheCall, 1))
+ if (S.checkArgCount(TheCall, 1))
return true;
ExprResult Arg(TheCall->getArg(0));
@@ -255,7 +255,7 @@ static bool BuiltinAddressof(Sema &S, CallExpr *TheCall) {
/// Check that the argument to __builtin_function_start is a function.
static bool BuiltinFunctionStart(Sema &S, CallExpr *TheCall) {
- if (checkArgCount(S, TheCall, 1))
+ if (S.checkArgCount(TheCall, 1))
return true;
ExprResult Arg = S.DefaultFunctionArrayLvalueConversion(TheCall->getArg(0));
@@ -279,7 +279,7 @@ static bool BuiltinFunctionStart(Sema &S, CallExpr *TheCall) {
/// Check the number of arguments and set the result type to
/// the argument type.
static bool BuiltinPreserveAI(Sema &S, CallExpr *TheCall) {
- if (checkArgCount(S, TheCall, 1))
+ if (S.checkArgCount(TheCall, 1))
return true;
TheCall->setType(TheCall->getArg(0)->getType());
@@ -290,7 +290,7 @@ static bool BuiltinPreserveAI(Sema &S, CallExpr *TheCall) {
/// __builtin_aligned_{up,down}(value, alignment) is an integer or a pointer
/// type (but not a function pointer) and that the alignment is a power-of-two.
static bool BuiltinAlignment(Sema &S, CallExpr *TheCall, unsigned ID) {
- if (checkArgCount(S, TheCall, 2))
+ if (S.checkArgCount(TheCall, 2))
return true;
clang::Expr *Source = TheCall->getArg(0);
@@ -368,7 +368,7 @@ static bool BuiltinAlignment(Sema &S, CallExpr *TheCall, unsigned ID) {
}
static bool BuiltinOverflow(Sema &S, CallExpr *TheCall, unsigned BuiltinID) {
- if (checkArgCount(S, TheCall, 3))
+ if (S.checkArgCount(TheCall, 3))
return true;
std::pair<unsigned, const char *> Builtins[] = {
@@ -696,7 +696,7 @@ struct BuiltinDumpStructGenerator {
} // namespace
static ExprResult BuiltinDumpStruct(Sema &S, CallExpr *TheCall) {
- if (checkArgCountAtLeast(S, TheCall, 2))
+ if (S.checkArgCountAtLeast(TheCall, 2))
return ExprError();
ExprResult PtrArgResult = S.DefaultLvalueConversion(TheCall->getArg(0));
@@ -762,7 +762,7 @@ static ExprResult BuiltinDumpStruct(Sema &S, CallExpr *TheCall) {
}
static bool BuiltinCallWithStaticChain(Sema &S, CallExpr *BuiltinCall) {
- if (checkArgCount(S, BuiltinCall, 2))
+ if (S.checkArgCount(BuiltinCall, 2))
return true;
SourceLocation BuiltinLoc = BuiltinCall->getBeginLoc();
@@ -1504,7 +1504,7 @@ static bool checkOpenCLSubgroupExt(Sema &S, CallExpr *Call) {
}
static bool OpenCLBuiltinNDRangeAndBlock(Sema &S, CallExpr *TheCall) {
- if (checkArgCount(S, TheCall, 2))
+ if (S.checkArgCount(TheCall, 2))
return true;
if (checkOpenCLSubgroupExt(S, TheCall))
@@ -1531,7 +1531,7 @@ static bool OpenCLBuiltinNDRangeAndBlock(Sema &S, CallExpr *TheCall) {
/// get_kernel_work_group_size
/// and get_kernel_preferred_work_group_size_multiple builtin functions.
static bool OpenCLBuiltinKernelWorkGroupSize(Sema &S, CallExpr *TheCall) {
- if (checkArgCount(S, TheCall, 1))
+ if (S.checkArgCount(TheCall, 1))
return true;
Expr *BlockArg = TheCall->getArg(0);
@@ -1861,7 +1861,7 @@ static bool BuiltinRWPipe(Sema &S, CallExpr *Call) {
// \param Call The call to the builtin function to be analyzed.
// \return True if a semantic error was found, false otherwise.
static bool BuiltinReserveRWPipe(Sema &S, CallExpr *Call) {
- if (checkArgCount(S, Call, 2))
+ if (S.checkArgCount(Call, 2))
return true;
if (checkOpenCLPipeArg(S, Call))
@@ -1890,7 +1890,7 @@ static bool BuiltinReserveRWPipe(Sema &S, CallExpr *Call) {
// \param Call The call to the builtin function to be analyzed.
// \return True if a semantic error was found, false otherwise.
static bool BuiltinCommitRWPipe(Sema &S, CallExpr *Call) {
- if (checkArgCount(S, Call, 2))
+ if (S.checkArgCount(Call, 2))
return true;
if (checkOpenCLPipeArg(S, Call))
@@ -1913,7 +1913,7 @@ static bool BuiltinCommitRWPipe(Sema &S, CallExpr *Call) {
// \param Call The call to the builtin function to be analyzed.
// \return True if a semantic error was found, false otherwise.
static bool BuiltinPipePackets(Sema &S, CallExpr *Call) {
- if (checkArgCount(S, Call, 1))
+ if (S.checkArgCount(Call, 1))
return true;
if (!Call->getArg(0)->getType()->isPipeType()) {
@@ -1932,7 +1932,7 @@ static bool BuiltinPipePackets(Sema &S, CallExpr *Call) {
// \param Call A pointer to the builtin call.
// \return True if a semantic error has been found, false otherwise.
static bool OpenCLBuiltinToAddr(Sema &S, unsigned BuiltinID, CallExpr *Call) {
- if (checkArgCount(S, Call, 1))
+ if (S.checkArgCount(Call, 1))
return true;
auto RT = Call->getArg(0)->getType();
@@ -2087,7 +2087,7 @@ static bool checkPointerAuthValue(Sema &S, Expr *&Arg,
}
static ExprResult PointerAuthStrip(Sema &S, CallExpr *Call) {
- if (checkArgCount(S, Call, 2))
+ if (S.checkArgCount(Call, 2))
return ExprError();
if (checkPointerAuthEnabled(S, Call))
return ExprError();
@@ -2100,7 +2100,7 @@ static ExprResult PointerAuthStrip(Sema &S, CallExpr *Call) {
}
static ExprResult PointerAuthBlendDiscriminator(Sema &S, CallExpr *Call) {
- if (checkArgCount(S, Call, 2))
+ if (S.checkArgCount(Call, 2))
return ExprError();
if (checkPointerAuthEnabled(S, Call))
return ExprError();
@@ -2113,7 +2113,7 @@ static ExprResult PointerAuthBlendDiscriminator(Sema &S, CallExpr *Call) {
}
static ExprResult PointerAuthSignGenericData(Sema &S, CallExpr *Call) {
- if (checkArgCount(S, Call, 2))
+ if (S.checkArgCount(Call, 2))
return ExprError();
if (checkPointerAuthEnabled(S, Call))
return ExprError();
@@ -2127,7 +2127,7 @@ static ExprResult PointerAuthSignGenericData(Sema &S, CallExpr *Call) {
static ExprResult PointerAuthSignOrAuth(Sema &S, CallExpr *Call,
PointerAuthOpKind OpKind) {
- if (checkArgCount(S, Call, 3))
+ if (S.checkArgCount(Call, 3))
return ExprError();
if (checkPointerAuthEnabled(S, Call))
return ExprError();
@@ -2141,7 +2141,7 @@ static ExprResult PointerAuthSignOrAuth(Sema &S, CallExpr *Call,
}
static ExprResult PointerAuthAuthAndResign(Sema &S, CallExpr *Call) {
- if (checkArgCount(S, Call, 5))
+ if (S.checkArgCount(Call, 5))
return ExprError();
if (checkPointerAuthEnabled(S, Call))
return ExprError();
@@ -2157,7 +2157,7 @@ static ExprResult PointerAuthAuthAndResign(Sema &S, CallExpr *Call) {
}
static ExprResult BuiltinLaunder(Sema &S, CallExpr *TheCall) {
- if (checkArgCount(S, TheCall, 1))
+ if (S.checkArgCount(TheCall, 1))
return ExprError();
// Compute __builtin_launder's parameter type from the argument.
@@ -2278,7 +2278,7 @@ bool Sema::CheckTSBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
return CheckSystemZBuiltinFunctionCall(BuiltinID, TheCall);
case llvm::Triple::x86:
case llvm::Triple::x86_64:
- return CheckX86BuiltinFunctionCall(TI, BuiltinID, TheCall);
+ return X86().CheckBuiltinFunctionCall(TI, BuiltinID, TheCall);
case llvm::Triple::ppc:
case llvm::Triple::ppcle:
case llvm::Triple::ppc64:
@@ -2288,7 +2288,7 @@ bool Sema::CheckTSBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
return CheckAMDGCNBuiltinFunctionCall(BuiltinID, TheCall);
case llvm::Triple::riscv32:
case llvm::Triple::riscv64:
- return CheckRISCVBuiltinFunctionCall(TI, BuiltinID, TheCall);
+ return RISCV().CheckBuiltinFunctionCall(TI, BuiltinID, TheCall);
case llvm::Triple::loongarch32:
case llvm::Triple::loongarch64:
return CheckLoongArchBuiltinFunctionCall(TI, BuiltinID, TheCall);
@@ -2377,7 +2377,7 @@ static bool BuiltinCpu(Sema &S, const TargetInfo &TI, CallExpr *TheCall,
/// Checks that __builtin_popcountg was called with a single argument, which is
/// an unsigned integer.
static bool BuiltinPopcountg(Sema &S, CallExpr *TheCall) {
- if (checkArgCount(S, TheCall, 1))
+ if (S.checkArgCount(TheCall, 1))
return true;
ExprResult ArgRes = S.DefaultLvalueConversion(TheCall->getArg(0));
@@ -2401,7 +2401,7 @@ static bool BuiltinPopcountg(Sema &S, CallExpr *TheCall) {
/// an unsigned integer, and an optional second argument, which is promoted to
/// an 'int'.
static bool BuiltinCountZeroBitsGeneric(Sema &S, CallExpr *TheCall) {
- if (checkArgCountRange(S, TheCall, 1, 2))
+ if (S.checkArgCountRange(TheCall, 1, 2))
return true;
ExprResult Arg0Res = S.DefaultLvalueConversion(TheCall->getArg(0));
@@ -2625,7 +2625,8 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
return ExprError();
break;
case Builtin::BI__builtin_classify_type:
- if (checkArgCount(*this, TheCall, 1)) return true;
+ if (checkArgCount(TheCall, 1))
+ return true;
TheCall->setType(Context.IntTy);
break;
case Builtin::BI__builtin_complex:
@@ -2633,7 +2634,8 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
return ExprError();
break;
case Builtin::BI__builtin_constant_p: {
- if (checkArgCount(*this, TheCall, 1)) return true;
+ if (checkArgCount(TheCall, 1))
+ return true;
ExprResult Arg = DefaultFunctionArrayLvalueConversion(TheCall->getArg(0));
if (Arg.isInvalid()) return true;
TheCall->setArg(0, Arg.get());
@@ -2822,7 +2824,7 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
return BuiltinDumpStruct(*this, TheCall);
case Builtin::BI__builtin_expect_with_probability: {
// We first want to ensure we are called with 3 arguments
- if (checkArgCount(*this, TheCall, 3))
+ if (checkArgCount(TheCall, 3))
return ExprError();
// then check probability is constant float in range [0.0, 1.0]
const Expr *ProbArg = TheCall->getArg(2);
@@ -2870,7 +2872,7 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
return ExprError();
break;
case Builtin::BI__GetExceptionInfo:
- if (checkArgCount(*this, TheCall, 1))
+ if (checkArgCount(TheCall, 1))
return ExprError();
if (CheckCXXThrowOperand(
@@ -2891,7 +2893,7 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
// These are all expected to be of the form
// T &/&&/* f(U &/&&)
// where T and U only differ in qualification.
- if (checkArgCount(*this, TheCall, 1))
+ if (checkArgCount(TheCall, 1))
return ExprError();
QualType Param = FDecl->getParamDecl(0)->getType();
QualType Result = FDecl->getReturnType();
@@ -3129,7 +3131,7 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
}
case Builtin::BI__builtin_elementwise_copysign: {
- if (checkArgCount(*this, TheCall, 2))
+ if (checkArgCount(TheCall, 2))
return ExprError();
ExprResult Magnitude = UsualUnaryConversions(TheCall->getArg(0));
@@ -3806,7 +3808,7 @@ bool Sema::CheckARMBuiltinExclusiveCall(unsigned BuiltinID, CallExpr *TheCall,
DeclRefExpr *DRE = cast<DeclRefExpr>(TheCall->getCallee()->IgnoreParenCasts());
// Ensure that we have the proper number of arguments.
- if (checkArgCount(*this, TheCall, IsLdrex ? 1 : 2))
+ if (checkArgCount(TheCall, IsLdrex ? 1 : 2))
return true;
// Inspect the pointer argument of the atomic builtin. This should always be
@@ -4145,7 +4147,7 @@ bool Sema::CheckBPFBuiltinFunctionCall(unsigned BuiltinID,
BuiltinID == BPF::BI__builtin_preserve_enum_value) &&
"unexpected BPF builtin");
- if (checkArgCount(*this, TheCall, 2))
+ if (checkArgCount(TheCall, 2))
return true;
// The second argument needs to be a constant int
@@ -5589,12 +5591,12 @@ bool Sema::CheckHLSLBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
switch (BuiltinID) {
case Builtin::BI__builtin_hlsl_elementwise_all:
case Builtin::BI__builtin_hlsl_elementwise_any: {
- if (checkArgCount(*this, TheCall, 1))
+ if (checkArgCount(TheCall, 1))
return true;
break;
}
case Builtin::BI__builtin_hlsl_elementwise_clamp: {
- if (checkArgCount(*this, TheCall, 3))
+ if (checkArgCount(TheCall, 3))
return true;
if (CheckVectorElementCallArgs(this, TheCall))
return true;
@@ -5605,7 +5607,7 @@ bool Sema::CheckHLSLBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
break;
}
case Builtin::BI__builtin_hlsl_dot: {
- if (checkArgCount(*this, TheCall, 2))
+ if (checkArgCount(TheCall, 2))
return true;
if (CheckVectorElementCallArgs(this, TheCall))
return true;
@@ -5639,7 +5641,7 @@ bool Sema::CheckHLSLBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
break;
}
case Builtin::BI__builtin_hlsl_lerp: {
- if (checkArgCount(*this, TheCall, 3))
+ if (checkArgCount(TheCall, 3))
return true;
if (CheckVectorElementCallArgs(this, TheCall))
return true;
@@ -5650,7 +5652,7 @@ bool Sema::CheckHLSLBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
break;
}
case Builtin::BI__builtin_hlsl_mad: {
- if (checkArgCount(*this, TheCall, 3))
+ if (checkArgCount(TheCall, 3))
return true;
if (CheckVectorElementCallArgs(this, TheCall))
return true;
@@ -5694,6 +5696,28 @@ bool Sema::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
// position of memory order and scope arguments in the builtin
unsigned OrderIndex, ScopeIndex;
switch (BuiltinID) {
+ case AMDGPU::BI__builtin_amdgcn_global_load_lds: {
+ constexpr const int SizeIdx = 2;
+ llvm::APSInt Size;
+ Expr *ArgExpr = TheCall->getArg(SizeIdx);
+ ExprResult R = VerifyIntegerConstantExpression(ArgExpr, &Size);
+ if (R.isInvalid())
+ return true;
+ switch (Size.getSExtValue()) {
+ case 1:
+ case 2:
+ case 4:
+ return false;
+ default:
+ Diag(ArgExpr->getExprLoc(),
+ diag::err_amdgcn_global_load_lds_size_invalid_value)
+ << ArgExpr->getSourceRange();
+ Diag(ArgExpr->getExprLoc(),
+ diag::note_amdgcn_global_load_lds_size_valid_value)
+ << ArgExpr->getSourceRange();
+ return true;
+ }
+ }
case AMDGPU::BI__builtin_amdgcn_get_fpenv:
case AMDGPU::BI__builtin_amdgcn_set_fpenv:
return false;
@@ -5753,866 +5777,6 @@ bool Sema::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
return false;
}
-bool Sema::CheckRISCVLMUL(CallExpr *TheCall, unsigned ArgNum) {
- llvm::APSInt Result;
-
- // We can't check the value of a dependent argument.
- Expr *Arg = TheCall->getArg(ArgNum);
- if (Arg->isTypeDependent() || Arg->isValueDependent())
- return false;
-
- // Check constant-ness first.
- if (BuiltinConstantArg(TheCall, ArgNum, Result))
- return true;
-
- int64_t Val = Result.getSExtValue();
- if ((Val >= 0 && Val <= 3) || (Val >= 5 && Val <= 7))
- return false;
-
- return Diag(TheCall->getBeginLoc(), diag::err_riscv_builtin_invalid_lmul)
- << Arg->getSourceRange();
-}
-
-static bool CheckInvalidVLENandLMUL(const TargetInfo &TI, CallExpr *TheCall,
- Sema &S, QualType Type, int EGW) {
- assert((EGW == 128 || EGW == 256) && "EGW can only be 128 or 256 bits");
-
- // LMUL * VLEN >= EGW
- ASTContext::BuiltinVectorTypeInfo Info =
- S.Context.getBuiltinVectorTypeInfo(Type->castAs<BuiltinType>());
- unsigned ElemSize = S.Context.getTypeSize(Info.ElementType);
- unsigned MinElemCount = Info.EC.getKnownMinValue();
-
- unsigned EGS = EGW / ElemSize;
- // If EGS is less than or equal to the minimum number of elements, then the
- // type is valid.
- if (EGS <= MinElemCount)
- return false;
-
- // Otherwise, we need vscale to be at least EGS / MinElemCount.
- assert(EGS % MinElemCount == 0);
- unsigned VScaleFactor = EGS / MinElemCount;
- // Vscale is VLEN/RVVBitsPerBlock.
- unsigned MinRequiredVLEN = VScaleFactor * llvm::RISCV::RVVBitsPerBlock;
- std::string RequiredExt = "zvl" + std::to_string(MinRequiredVLEN) + "b";
- if (!TI.hasFeature(RequiredExt))
- return S.Diag(TheCall->getBeginLoc(),
- diag::err_riscv_type_requires_extension) << Type << RequiredExt;
-
- return false;
-}
-
-bool Sema::CheckRISCVBuiltinFunctionCall(const TargetInfo &TI,
- unsigned BuiltinID,
- CallExpr *TheCall) {
- // vmulh.vv, vmulh.vx, vmulhu.vv, vmulhu.vx, vmulhsu.vv, vmulhsu.vx,
- // vsmul.vv, vsmul.vx are not included for EEW=64 in Zve64*.
- switch (BuiltinID) {
- default:
- break;
- case RISCVVector::BI__builtin_rvv_vmulhsu_vv:
- case RISCVVector::BI__builtin_rvv_vmulhsu_vx:
- case RISCVVector::BI__builtin_rvv_vmulhsu_vv_tu:
- case RISCVVector::BI__builtin_rvv_vmulhsu_vx_tu:
- case RISCVVector::BI__builtin_rvv_vmulhsu_vv_m:
- case RISCVVector::BI__builtin_rvv_vmulhsu_vx_m:
- case RISCVVector::BI__builtin_rvv_vmulhsu_vv_mu:
- case RISCVVector::BI__builtin_rvv_vmulhsu_vx_mu:
- case RISCVVector::BI__builtin_rvv_vmulhsu_vv_tum:
- case RISCVVector::BI__builtin_rvv_vmulhsu_vx_tum:
- case RISCVVector::BI__builtin_rvv_vmulhsu_vv_tumu:
- case RISCVVector::BI__builtin_rvv_vmulhsu_vx_tumu:
- case RISCVVector::BI__builtin_rvv_vmulhu_vv:
- case RISCVVector::BI__builtin_rvv_vmulhu_vx:
- case RISCVVector::BI__builtin_rvv_vmulhu_vv_tu:
- case RISCVVector::BI__builtin_rvv_vmulhu_vx_tu:
- case RISCVVector::BI__builtin_rvv_vmulhu_vv_m:
- case RISCVVector::BI__builtin_rvv_vmulhu_vx_m:
- case RISCVVector::BI__builtin_rvv_vmulhu_vv_mu:
- case RISCVVector::BI__builtin_rvv_vmulhu_vx_mu:
- case RISCVVector::BI__builtin_rvv_vmulhu_vv_tum:
- case RISCVVector::BI__builtin_rvv_vmulhu_vx_tum:
- case RISCVVector::BI__builtin_rvv_vmulhu_vv_tumu:
- case RISCVVector::BI__builtin_rvv_vmulhu_vx_tumu:
- case RISCVVector::BI__builtin_rvv_vmulh_vv:
- case RISCVVector::BI__builtin_rvv_vmulh_vx:
- case RISCVVector::BI__builtin_rvv_vmulh_vv_tu:
- case RISCVVector::BI__builtin_rvv_vmulh_vx_tu:
- case RISCVVector::BI__builtin_rvv_vmulh_vv_m:
- case RISCVVector::BI__builtin_rvv_vmulh_vx_m:
- case RISCVVector::BI__builtin_rvv_vmulh_vv_mu:
- case RISCVVector::BI__builtin_rvv_vmulh_vx_mu:
- case RISCVVector::BI__builtin_rvv_vmulh_vv_tum:
- case RISCVVector::BI__builtin_rvv_vmulh_vx_tum:
- case RISCVVector::BI__builtin_rvv_vmulh_vv_tumu:
- case RISCVVector::BI__builtin_rvv_vmulh_vx_tumu:
- case RISCVVector::BI__builtin_rvv_vsmul_vv:
- case RISCVVector::BI__builtin_rvv_vsmul_vx:
- case RISCVVector::BI__builtin_rvv_vsmul_vv_tu:
- case RISCVVector::BI__builtin_rvv_vsmul_vx_tu:
- case RISCVVector::BI__builtin_rvv_vsmul_vv_m:
- case RISCVVector::BI__builtin_rvv_vsmul_vx_m:
- case RISCVVector::BI__builtin_rvv_vsmul_vv_mu:
- case RISCVVector::BI__builtin_rvv_vsmul_vx_mu:
- case RISCVVector::BI__builtin_rvv_vsmul_vv_tum:
- case RISCVVector::BI__builtin_rvv_vsmul_vx_tum:
- case RISCVVector::BI__builtin_rvv_vsmul_vv_tumu:
- case RISCVVector::BI__builtin_rvv_vsmul_vx_tumu: {
- ASTContext::BuiltinVectorTypeInfo Info = Context.getBuiltinVectorTypeInfo(
- TheCall->getType()->castAs<BuiltinType>());
-
- if (Context.getTypeSize(Info.ElementType) == 64 && !TI.hasFeature("v"))
- return Diag(TheCall->getBeginLoc(),
- diag::err_riscv_builtin_requires_extension)
- << /* IsExtension */ true << TheCall->getSourceRange() << "v";
-
- break;
- }
- }
-
- switch (BuiltinID) {
- case RISCVVector::BI__builtin_rvv_vsetvli:
- return BuiltinConstantArgRange(TheCall, 1, 0, 3) ||
- CheckRISCVLMUL(TheCall, 2);
- case RISCVVector::BI__builtin_rvv_vsetvlimax:
- return BuiltinConstantArgRange(TheCall, 0, 0, 3) ||
- CheckRISCVLMUL(TheCall, 1);
- case RISCVVector::BI__builtin_rvv_vget_v: {
- ASTContext::BuiltinVectorTypeInfo ResVecInfo =
- Context.getBuiltinVectorTypeInfo(cast<BuiltinType>(
- TheCall->getType().getCanonicalType().getTypePtr()));
- ASTContext::BuiltinVectorTypeInfo VecInfo =
- Context.getBuiltinVectorTypeInfo(cast<BuiltinType>(
- TheCall->getArg(0)->getType().getCanonicalType().getTypePtr()));
- unsigned MaxIndex;
- if (VecInfo.NumVectors != 1) // vget for tuple type
- MaxIndex = VecInfo.NumVectors;
- else // vget for non-tuple type
- MaxIndex = (VecInfo.EC.getKnownMinValue() * VecInfo.NumVectors) /
- (ResVecInfo.EC.getKnownMinValue() * ResVecInfo.NumVectors);
- return BuiltinConstantArgRange(TheCall, 1, 0, MaxIndex - 1);
- }
- case RISCVVector::BI__builtin_rvv_vset_v: {
- ASTContext::BuiltinVectorTypeInfo ResVecInfo =
- Context.getBuiltinVectorTypeInfo(cast<BuiltinType>(
- TheCall->getType().getCanonicalType().getTypePtr()));
- ASTContext::BuiltinVectorTypeInfo VecInfo =
- Context.getBuiltinVectorTypeInfo(cast<BuiltinType>(
- TheCall->getArg(2)->getType().getCanonicalType().getTypePtr()));
- unsigned MaxIndex;
- if (ResVecInfo.NumVectors != 1) // vset for tuple type
- MaxIndex = ResVecInfo.NumVectors;
- else // vset for non-tuple type
- MaxIndex = (ResVecInfo.EC.getKnownMinValue() * ResVecInfo.NumVectors) /
- (VecInfo.EC.getKnownMinValue() * VecInfo.NumVectors);
- return BuiltinConstantArgRange(TheCall, 1, 0, MaxIndex - 1);
- }
- // Vector Crypto
- case RISCVVector::BI__builtin_rvv_vaeskf1_vi_tu:
- case RISCVVector::BI__builtin_rvv_vaeskf2_vi_tu:
- case RISCVVector::BI__builtin_rvv_vaeskf2_vi:
- case RISCVVector::BI__builtin_rvv_vsm4k_vi_tu: {
- QualType Op1Type = TheCall->getArg(0)->getType();
- QualType Op2Type = TheCall->getArg(1)->getType();
- return CheckInvalidVLENandLMUL(TI, TheCall, *this, Op1Type, 128) ||
- CheckInvalidVLENandLMUL(TI, TheCall, *this, Op2Type, 128) ||
- BuiltinConstantArgRange(TheCall, 2, 0, 31);
- }
- case RISCVVector::BI__builtin_rvv_vsm3c_vi_tu:
- case RISCVVector::BI__builtin_rvv_vsm3c_vi: {
- QualType Op1Type = TheCall->getArg(0)->getType();
- return CheckInvalidVLENandLMUL(TI, TheCall, *this, Op1Type, 256) ||
- BuiltinConstantArgRange(TheCall, 2, 0, 31);
- }
- case RISCVVector::BI__builtin_rvv_vaeskf1_vi:
- case RISCVVector::BI__builtin_rvv_vsm4k_vi: {
- QualType Op1Type = TheCall->getArg(0)->getType();
- return CheckInvalidVLENandLMUL(TI, TheCall, *this, Op1Type, 128) ||
- BuiltinConstantArgRange(TheCall, 1, 0, 31);
- }
- case RISCVVector::BI__builtin_rvv_vaesdf_vv:
- case RISCVVector::BI__builtin_rvv_vaesdf_vs:
- case RISCVVector::BI__builtin_rvv_vaesdm_vv:
- case RISCVVector::BI__builtin_rvv_vaesdm_vs:
- case RISCVVector::BI__builtin_rvv_vaesef_vv:
- case RISCVVector::BI__builtin_rvv_vaesef_vs:
- case RISCVVector::BI__builtin_rvv_vaesem_vv:
- case RISCVVector::BI__builtin_rvv_vaesem_vs:
- case RISCVVector::BI__builtin_rvv_vaesz_vs:
- case RISCVVector::BI__builtin_rvv_vsm4r_vv:
- case RISCVVector::BI__builtin_rvv_vsm4r_vs:
- case RISCVVector::BI__builtin_rvv_vaesdf_vv_tu:
- case RISCVVector::BI__builtin_rvv_vaesdf_vs_tu:
- case RISCVVector::BI__builtin_rvv_vaesdm_vv_tu:
- case RISCVVector::BI__builtin_rvv_vaesdm_vs_tu:
- case RISCVVector::BI__builtin_rvv_vaesef_vv_tu:
- case RISCVVector::BI__builtin_rvv_vaesef_vs_tu:
- case RISCVVector::BI__builtin_rvv_vaesem_vv_tu:
- case RISCVVector::BI__builtin_rvv_vaesem_vs_tu:
- case RISCVVector::BI__builtin_rvv_vaesz_vs_tu:
- case RISCVVector::BI__builtin_rvv_vsm4r_vv_tu:
- case RISCVVector::BI__builtin_rvv_vsm4r_vs_tu: {
- QualType Op1Type = TheCall->getArg(0)->getType();
- QualType Op2Type = TheCall->getArg(1)->getType();
- return CheckInvalidVLENandLMUL(TI, TheCall, *this, Op1Type, 128) ||
- CheckInvalidVLENandLMUL(TI, TheCall, *this, Op2Type, 128);
- }
- case RISCVVector::BI__builtin_rvv_vsha2ch_vv:
- case RISCVVector::BI__builtin_rvv_vsha2cl_vv:
- case RISCVVector::BI__builtin_rvv_vsha2ms_vv:
- case RISCVVector::BI__builtin_rvv_vsha2ch_vv_tu:
- case RISCVVector::BI__builtin_rvv_vsha2cl_vv_tu:
- case RISCVVector::BI__builtin_rvv_vsha2ms_vv_tu: {
- QualType Op1Type = TheCall->getArg(0)->getType();
- QualType Op2Type = TheCall->getArg(1)->getType();
- QualType Op3Type = TheCall->getArg(2)->getType();
- ASTContext::BuiltinVectorTypeInfo Info =
- Context.getBuiltinVectorTypeInfo(Op1Type->castAs<BuiltinType>());
- uint64_t ElemSize = Context.getTypeSize(Info.ElementType);
- if (ElemSize == 64 && !TI.hasFeature("zvknhb"))
- return Diag(TheCall->getBeginLoc(),
- diag::err_riscv_builtin_requires_extension)
- << /* IsExtension */ true << TheCall->getSourceRange() << "zvknb";
-
- return CheckInvalidVLENandLMUL(TI, TheCall, *this, Op1Type, ElemSize * 4) ||
- CheckInvalidVLENandLMUL(TI, TheCall, *this, Op2Type, ElemSize * 4) ||
- CheckInvalidVLENandLMUL(TI, TheCall, *this, Op3Type, ElemSize * 4);
- }
-
- case RISCVVector::BI__builtin_rvv_sf_vc_i_se:
- // bit_27_26, bit_24_20, bit_11_7, simm5, sew, log2lmul
- return BuiltinConstantArgRange(TheCall, 0, 0, 3) ||
- BuiltinConstantArgRange(TheCall, 1, 0, 31) ||
- BuiltinConstantArgRange(TheCall, 2, 0, 31) ||
- BuiltinConstantArgRange(TheCall, 3, -16, 15) ||
- CheckRISCVLMUL(TheCall, 5);
- case RISCVVector::BI__builtin_rvv_sf_vc_iv_se:
- // bit_27_26, bit_11_7, vs2, simm5
- return BuiltinConstantArgRange(TheCall, 0, 0, 3) ||
- BuiltinConstantArgRange(TheCall, 1, 0, 31) ||
- BuiltinConstantArgRange(TheCall, 3, -16, 15);
- case RISCVVector::BI__builtin_rvv_sf_vc_v_i:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_i_se:
- // bit_27_26, bit_24_20, simm5
- return BuiltinConstantArgRange(TheCall, 0, 0, 3) ||
- BuiltinConstantArgRange(TheCall, 1, 0, 31) ||
- BuiltinConstantArgRange(TheCall, 2, -16, 15);
- case RISCVVector::BI__builtin_rvv_sf_vc_v_iv:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_iv_se:
- // bit_27_26, vs2, simm5
- return BuiltinConstantArgRange(TheCall, 0, 0, 3) ||
- BuiltinConstantArgRange(TheCall, 2, -16, 15);
- case RISCVVector::BI__builtin_rvv_sf_vc_ivv_se:
- case RISCVVector::BI__builtin_rvv_sf_vc_ivw_se:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_ivv:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_ivw:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_ivv_se:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_ivw_se:
- // bit_27_26, vd, vs2, simm5
- return BuiltinConstantArgRange(TheCall, 0, 0, 3) ||
- BuiltinConstantArgRange(TheCall, 3, -16, 15);
- case RISCVVector::BI__builtin_rvv_sf_vc_x_se:
- // bit_27_26, bit_24_20, bit_11_7, xs1, sew, log2lmul
- return BuiltinConstantArgRange(TheCall, 0, 0, 3) ||
- BuiltinConstantArgRange(TheCall, 1, 0, 31) ||
- BuiltinConstantArgRange(TheCall, 2, 0, 31) ||
- CheckRISCVLMUL(TheCall, 5);
- case RISCVVector::BI__builtin_rvv_sf_vc_xv_se:
- case RISCVVector::BI__builtin_rvv_sf_vc_vv_se:
- // bit_27_26, bit_11_7, vs2, xs1/vs1
- case RISCVVector::BI__builtin_rvv_sf_vc_v_x:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_x_se:
- // bit_27_26, bit_24-20, xs1
- return BuiltinConstantArgRange(TheCall, 0, 0, 3) ||
- BuiltinConstantArgRange(TheCall, 1, 0, 31);
- case RISCVVector::BI__builtin_rvv_sf_vc_vvv_se:
- case RISCVVector::BI__builtin_rvv_sf_vc_xvv_se:
- case RISCVVector::BI__builtin_rvv_sf_vc_vvw_se:
- case RISCVVector::BI__builtin_rvv_sf_vc_xvw_se:
- // bit_27_26, vd, vs2, xs1
- case RISCVVector::BI__builtin_rvv_sf_vc_v_xv:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_vv:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_xv_se:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_vv_se:
- // bit_27_26, vs2, xs1/vs1
- case RISCVVector::BI__builtin_rvv_sf_vc_v_xvv:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_vvv:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_xvw:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_vvw:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_xvv_se:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_vvv_se:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_xvw_se:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_vvw_se:
- // bit_27_26, vd, vs2, xs1/vs1
- return BuiltinConstantArgRange(TheCall, 0, 0, 3);
- case RISCVVector::BI__builtin_rvv_sf_vc_fv_se:
- // bit_26, bit_11_7, vs2, fs1
- return BuiltinConstantArgRange(TheCall, 0, 0, 1) ||
- BuiltinConstantArgRange(TheCall, 1, 0, 31);
- case RISCVVector::BI__builtin_rvv_sf_vc_fvv_se:
- case RISCVVector::BI__builtin_rvv_sf_vc_fvw_se:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_fvv:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_fvw:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_fvv_se:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_fvw_se:
- // bit_26, vd, vs2, fs1
- case RISCVVector::BI__builtin_rvv_sf_vc_v_fv:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_fv_se:
- // bit_26, vs2, fs1
- return BuiltinConstantArgRange(TheCall, 0, 0, 1);
- // Check if byteselect is in [0, 3]
- case RISCV::BI__builtin_riscv_aes32dsi:
- case RISCV::BI__builtin_riscv_aes32dsmi:
- case RISCV::BI__builtin_riscv_aes32esi:
- case RISCV::BI__builtin_riscv_aes32esmi:
- case RISCV::BI__builtin_riscv_sm4ks:
- case RISCV::BI__builtin_riscv_sm4ed:
- return BuiltinConstantArgRange(TheCall, 2, 0, 3);
- // Check if rnum is in [0, 10]
- case RISCV::BI__builtin_riscv_aes64ks1i:
- return BuiltinConstantArgRange(TheCall, 1, 0, 10);
- // Check if value range for vxrm is in [0, 3]
- case RISCVVector::BI__builtin_rvv_vaaddu_vv:
- case RISCVVector::BI__builtin_rvv_vaaddu_vx:
- case RISCVVector::BI__builtin_rvv_vaadd_vv:
- case RISCVVector::BI__builtin_rvv_vaadd_vx:
- case RISCVVector::BI__builtin_rvv_vasubu_vv:
- case RISCVVector::BI__builtin_rvv_vasubu_vx:
- case RISCVVector::BI__builtin_rvv_vasub_vv:
- case RISCVVector::BI__builtin_rvv_vasub_vx:
- case RISCVVector::BI__builtin_rvv_vsmul_vv:
- case RISCVVector::BI__builtin_rvv_vsmul_vx:
- case RISCVVector::BI__builtin_rvv_vssra_vv:
- case RISCVVector::BI__builtin_rvv_vssra_vx:
- case RISCVVector::BI__builtin_rvv_vssrl_vv:
- case RISCVVector::BI__builtin_rvv_vssrl_vx:
- case RISCVVector::BI__builtin_rvv_vnclip_wv:
- case RISCVVector::BI__builtin_rvv_vnclip_wx:
- case RISCVVector::BI__builtin_rvv_vnclipu_wv:
- case RISCVVector::BI__builtin_rvv_vnclipu_wx:
- return BuiltinConstantArgRange(TheCall, 2, 0, 3);
- case RISCVVector::BI__builtin_rvv_vaaddu_vv_tu:
- case RISCVVector::BI__builtin_rvv_vaaddu_vx_tu:
- case RISCVVector::BI__builtin_rvv_vaadd_vv_tu:
- case RISCVVector::BI__builtin_rvv_vaadd_vx_tu:
- case RISCVVector::BI__builtin_rvv_vasubu_vv_tu:
- case RISCVVector::BI__builtin_rvv_vasubu_vx_tu:
- case RISCVVector::BI__builtin_rvv_vasub_vv_tu:
- case RISCVVector::BI__builtin_rvv_vasub_vx_tu:
- case RISCVVector::BI__builtin_rvv_vsmul_vv_tu:
- case RISCVVector::BI__builtin_rvv_vsmul_vx_tu:
- case RISCVVector::BI__builtin_rvv_vssra_vv_tu:
- case RISCVVector::BI__builtin_rvv_vssra_vx_tu:
- case RISCVVector::BI__builtin_rvv_vssrl_vv_tu:
- case RISCVVector::BI__builtin_rvv_vssrl_vx_tu:
- case RISCVVector::BI__builtin_rvv_vnclip_wv_tu:
- case RISCVVector::BI__builtin_rvv_vnclip_wx_tu:
- case RISCVVector::BI__builtin_rvv_vnclipu_wv_tu:
- case RISCVVector::BI__builtin_rvv_vnclipu_wx_tu:
- case RISCVVector::BI__builtin_rvv_vaaddu_vv_m:
- case RISCVVector::BI__builtin_rvv_vaaddu_vx_m:
- case RISCVVector::BI__builtin_rvv_vaadd_vv_m:
- case RISCVVector::BI__builtin_rvv_vaadd_vx_m:
- case RISCVVector::BI__builtin_rvv_vasubu_vv_m:
- case RISCVVector::BI__builtin_rvv_vasubu_vx_m:
- case RISCVVector::BI__builtin_rvv_vasub_vv_m:
- case RISCVVector::BI__builtin_rvv_vasub_vx_m:
- case RISCVVector::BI__builtin_rvv_vsmul_vv_m:
- case RISCVVector::BI__builtin_rvv_vsmul_vx_m:
- case RISCVVector::BI__builtin_rvv_vssra_vv_m:
- case RISCVVector::BI__builtin_rvv_vssra_vx_m:
- case RISCVVector::BI__builtin_rvv_vssrl_vv_m:
- case RISCVVector::BI__builtin_rvv_vssrl_vx_m:
- case RISCVVector::BI__builtin_rvv_vnclip_wv_m:
- case RISCVVector::BI__builtin_rvv_vnclip_wx_m:
- case RISCVVector::BI__builtin_rvv_vnclipu_wv_m:
- case RISCVVector::BI__builtin_rvv_vnclipu_wx_m:
- return BuiltinConstantArgRange(TheCall, 3, 0, 3);
- case RISCVVector::BI__builtin_rvv_vaaddu_vv_tum:
- case RISCVVector::BI__builtin_rvv_vaaddu_vv_tumu:
- case RISCVVector::BI__builtin_rvv_vaaddu_vv_mu:
- case RISCVVector::BI__builtin_rvv_vaaddu_vx_tum:
- case RISCVVector::BI__builtin_rvv_vaaddu_vx_tumu:
- case RISCVVector::BI__builtin_rvv_vaaddu_vx_mu:
- case RISCVVector::BI__builtin_rvv_vaadd_vv_tum:
- case RISCVVector::BI__builtin_rvv_vaadd_vv_tumu:
- case RISCVVector::BI__builtin_rvv_vaadd_vv_mu:
- case RISCVVector::BI__builtin_rvv_vaadd_vx_tum:
- case RISCVVector::BI__builtin_rvv_vaadd_vx_tumu:
- case RISCVVector::BI__builtin_rvv_vaadd_vx_mu:
- case RISCVVector::BI__builtin_rvv_vasubu_vv_tum:
- case RISCVVector::BI__builtin_rvv_vasubu_vv_tumu:
- case RISCVVector::BI__builtin_rvv_vasubu_vv_mu:
- case RISCVVector::BI__builtin_rvv_vasubu_vx_tum:
- case RISCVVector::BI__builtin_rvv_vasubu_vx_tumu:
- case RISCVVector::BI__builtin_rvv_vasubu_vx_mu:
- case RISCVVector::BI__builtin_rvv_vasub_vv_tum:
- case RISCVVector::BI__builtin_rvv_vasub_vv_tumu:
- case RISCVVector::BI__builtin_rvv_vasub_vv_mu:
- case RISCVVector::BI__builtin_rvv_vasub_vx_tum:
- case RISCVVector::BI__builtin_rvv_vasub_vx_tumu:
- case RISCVVector::BI__builtin_rvv_vasub_vx_mu:
- case RISCVVector::BI__builtin_rvv_vsmul_vv_mu:
- case RISCVVector::BI__builtin_rvv_vsmul_vx_mu:
- case RISCVVector::BI__builtin_rvv_vssra_vv_mu:
- case RISCVVector::BI__builtin_rvv_vssra_vx_mu:
- case RISCVVector::BI__builtin_rvv_vssrl_vv_mu:
- case RISCVVector::BI__builtin_rvv_vssrl_vx_mu:
- case RISCVVector::BI__builtin_rvv_vnclip_wv_mu:
- case RISCVVector::BI__builtin_rvv_vnclip_wx_mu:
- case RISCVVector::BI__builtin_rvv_vnclipu_wv_mu:
- case RISCVVector::BI__builtin_rvv_vnclipu_wx_mu:
- case RISCVVector::BI__builtin_rvv_vsmul_vv_tum:
- case RISCVVector::BI__builtin_rvv_vsmul_vx_tum:
- case RISCVVector::BI__builtin_rvv_vssra_vv_tum:
- case RISCVVector::BI__builtin_rvv_vssra_vx_tum:
- case RISCVVector::BI__builtin_rvv_vssrl_vv_tum:
- case RISCVVector::BI__builtin_rvv_vssrl_vx_tum:
- case RISCVVector::BI__builtin_rvv_vnclip_wv_tum:
- case RISCVVector::BI__builtin_rvv_vnclip_wx_tum:
- case RISCVVector::BI__builtin_rvv_vnclipu_wv_tum:
- case RISCVVector::BI__builtin_rvv_vnclipu_wx_tum:
- case RISCVVector::BI__builtin_rvv_vsmul_vv_tumu:
- case RISCVVector::BI__builtin_rvv_vsmul_vx_tumu:
- case RISCVVector::BI__builtin_rvv_vssra_vv_tumu:
- case RISCVVector::BI__builtin_rvv_vssra_vx_tumu:
- case RISCVVector::BI__builtin_rvv_vssrl_vv_tumu:
- case RISCVVector::BI__builtin_rvv_vssrl_vx_tumu:
- case RISCVVector::BI__builtin_rvv_vnclip_wv_tumu:
- case RISCVVector::BI__builtin_rvv_vnclip_wx_tumu:
- case RISCVVector::BI__builtin_rvv_vnclipu_wv_tumu:
- case RISCVVector::BI__builtin_rvv_vnclipu_wx_tumu:
- return BuiltinConstantArgRange(TheCall, 4, 0, 3);
- case RISCVVector::BI__builtin_rvv_vfsqrt_v_rm:
- case RISCVVector::BI__builtin_rvv_vfrec7_v_rm:
- case RISCVVector::BI__builtin_rvv_vfcvt_x_f_v_rm:
- case RISCVVector::BI__builtin_rvv_vfcvt_xu_f_v_rm:
- case RISCVVector::BI__builtin_rvv_vfcvt_f_x_v_rm:
- case RISCVVector::BI__builtin_rvv_vfcvt_f_xu_v_rm:
- case RISCVVector::BI__builtin_rvv_vfwcvt_x_f_v_rm:
- case RISCVVector::BI__builtin_rvv_vfwcvt_xu_f_v_rm:
- case RISCVVector::BI__builtin_rvv_vfncvt_x_f_w_rm:
- case RISCVVector::BI__builtin_rvv_vfncvt_xu_f_w_rm:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_x_w_rm:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_xu_w_rm:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_f_w_rm:
- return BuiltinConstantArgRange(TheCall, 1, 0, 4);
- case RISCVVector::BI__builtin_rvv_vfadd_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfadd_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfsub_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfsub_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfrsub_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfwadd_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfwadd_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfwsub_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfwsub_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfwadd_wv_rm:
- case RISCVVector::BI__builtin_rvv_vfwadd_wf_rm:
- case RISCVVector::BI__builtin_rvv_vfwsub_wv_rm:
- case RISCVVector::BI__builtin_rvv_vfwsub_wf_rm:
- case RISCVVector::BI__builtin_rvv_vfmul_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfmul_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfdiv_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfdiv_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfrdiv_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfwmul_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfwmul_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfredosum_vs_rm:
- case RISCVVector::BI__builtin_rvv_vfredusum_vs_rm:
- case RISCVVector::BI__builtin_rvv_vfwredosum_vs_rm:
- case RISCVVector::BI__builtin_rvv_vfwredusum_vs_rm:
- case RISCVVector::BI__builtin_rvv_vfsqrt_v_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfrec7_v_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfcvt_x_f_v_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfcvt_xu_f_v_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfcvt_f_x_v_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfcvt_f_xu_v_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwcvt_x_f_v_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwcvt_xu_f_v_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfncvt_x_f_w_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfncvt_xu_f_w_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_x_w_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_xu_w_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_f_w_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfsqrt_v_rm_m:
- case RISCVVector::BI__builtin_rvv_vfrec7_v_rm_m:
- case RISCVVector::BI__builtin_rvv_vfcvt_x_f_v_rm_m:
- case RISCVVector::BI__builtin_rvv_vfcvt_xu_f_v_rm_m:
- case RISCVVector::BI__builtin_rvv_vfcvt_f_x_v_rm_m:
- case RISCVVector::BI__builtin_rvv_vfcvt_f_xu_v_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwcvt_x_f_v_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwcvt_xu_f_v_rm_m:
- case RISCVVector::BI__builtin_rvv_vfncvt_x_f_w_rm_m:
- case RISCVVector::BI__builtin_rvv_vfncvt_xu_f_w_rm_m:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_x_w_rm_m:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_xu_w_rm_m:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_f_w_rm_m:
- return BuiltinConstantArgRange(TheCall, 2, 0, 4);
- case RISCVVector::BI__builtin_rvv_vfadd_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfadd_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfsub_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfsub_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfrsub_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwadd_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwadd_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwsub_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwsub_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwadd_wv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwadd_wf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwsub_wv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwsub_wf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfmul_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfmul_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfdiv_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfdiv_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfrdiv_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwmul_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwmul_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfredosum_vs_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfredusum_vs_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwredosum_vs_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwredusum_vs_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfmacc_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfmacc_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfnmacc_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfnmacc_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfmsac_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfmsac_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfnmsac_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfnmsac_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfmadd_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfmadd_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfnmadd_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfnmadd_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfmsub_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfmsub_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfnmsub_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfnmsub_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfwmacc_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfwmacc_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfwnmacc_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfwnmacc_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfwmsac_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfwmsac_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfwnmsac_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfwnmsac_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfmacc_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfmacc_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfnmacc_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfnmacc_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfmsac_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfmsac_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfnmsac_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfnmsac_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfmadd_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfmadd_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfnmadd_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfnmadd_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfmsub_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfmsub_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfnmsub_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfnmsub_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwmacc_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwmacc_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwnmacc_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwnmacc_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwmsac_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwmsac_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwnmsac_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwnmsac_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfadd_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfadd_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfsub_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfsub_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfrsub_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwadd_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwadd_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwsub_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwsub_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwadd_wv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwadd_wf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwsub_wv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwsub_wf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfmul_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfmul_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfdiv_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfdiv_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfrdiv_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwmul_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwmul_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfredosum_vs_rm_m:
- case RISCVVector::BI__builtin_rvv_vfredusum_vs_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwredosum_vs_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwredusum_vs_rm_m:
- case RISCVVector::BI__builtin_rvv_vfsqrt_v_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfrec7_v_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfcvt_x_f_v_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfcvt_xu_f_v_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfcvt_f_x_v_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfcvt_f_xu_v_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwcvt_x_f_v_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwcvt_xu_f_v_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfncvt_x_f_w_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfncvt_xu_f_w_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_x_w_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_xu_w_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_f_w_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfsqrt_v_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfrec7_v_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfcvt_x_f_v_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfcvt_xu_f_v_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfcvt_f_x_v_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfcvt_f_xu_v_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwcvt_x_f_v_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwcvt_xu_f_v_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfncvt_x_f_w_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfncvt_xu_f_w_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_x_w_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_xu_w_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_f_w_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfsqrt_v_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfrec7_v_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfcvt_x_f_v_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfcvt_xu_f_v_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfcvt_f_x_v_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfcvt_f_xu_v_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwcvt_x_f_v_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwcvt_xu_f_v_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfncvt_x_f_w_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfncvt_xu_f_w_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_x_w_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_xu_w_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_f_w_rm_mu:
- return BuiltinConstantArgRange(TheCall, 3, 0, 4);
- case RISCVVector::BI__builtin_rvv_vfmacc_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfmacc_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfnmacc_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfnmacc_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfmsac_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfmsac_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfnmsac_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfnmsac_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfmadd_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfmadd_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfnmadd_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfnmadd_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfmsub_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfmsub_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfnmsub_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfnmsub_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwmacc_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwmacc_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwnmacc_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwnmacc_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwmsac_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwmsac_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwnmsac_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwnmsac_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfadd_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfadd_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfsub_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfsub_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfrsub_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwadd_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwadd_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwsub_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwsub_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwadd_wv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwadd_wf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwsub_wv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwsub_wf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfmul_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfmul_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfdiv_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfdiv_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfrdiv_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwmul_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwmul_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfmacc_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfmacc_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfnmacc_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfnmacc_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfmsac_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfmsac_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfnmsac_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfnmsac_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfmadd_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfmadd_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfnmadd_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfnmadd_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfmsub_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfmsub_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfnmsub_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfnmsub_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwmacc_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwmacc_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwnmacc_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwnmacc_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwmsac_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwmsac_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwnmsac_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwnmsac_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfredosum_vs_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfredusum_vs_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwredosum_vs_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwredusum_vs_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfadd_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfadd_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfsub_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfsub_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfrsub_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwadd_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwadd_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwsub_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwsub_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwadd_wv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwadd_wf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwsub_wv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwsub_wf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfmul_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfmul_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfdiv_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfdiv_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfrdiv_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwmul_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwmul_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfmacc_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfmacc_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfnmacc_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfnmacc_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfmsac_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfmsac_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfnmsac_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfnmsac_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfmadd_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfmadd_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfnmadd_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfnmadd_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfmsub_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfmsub_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfnmsub_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfnmsub_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwmacc_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwmacc_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwnmacc_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwnmacc_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwmsac_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwmsac_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwnmsac_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwnmsac_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfadd_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfadd_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfsub_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfsub_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfrsub_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwadd_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwadd_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwsub_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwsub_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwadd_wv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwadd_wf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwsub_wv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwsub_wf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfmul_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfmul_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfdiv_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfdiv_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfrdiv_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwmul_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwmul_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfmacc_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfmacc_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfnmacc_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfnmacc_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfmsac_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfmsac_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfnmsac_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfnmsac_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfmadd_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfmadd_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfnmadd_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfnmadd_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfmsub_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfmsub_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfnmsub_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfnmsub_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwmacc_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwmacc_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwnmacc_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwnmacc_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwmsac_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwmsac_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwnmsac_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwnmsac_vf_rm_mu:
- return BuiltinConstantArgRange(TheCall, 4, 0, 4);
- case RISCV::BI__builtin_riscv_ntl_load:
- case RISCV::BI__builtin_riscv_ntl_store:
- DeclRefExpr *DRE =
- cast<DeclRefExpr>(TheCall->getCallee()->IgnoreParenCasts());
- assert((BuiltinID == RISCV::BI__builtin_riscv_ntl_store ||
- BuiltinID == RISCV::BI__builtin_riscv_ntl_load) &&
- "Unexpected RISC-V nontemporal load/store builtin!");
- bool IsStore = BuiltinID == RISCV::BI__builtin_riscv_ntl_store;
- unsigned NumArgs = IsStore ? 3 : 2;
-
- if (checkArgCountAtLeast(*this, TheCall, NumArgs - 1))
- return true;
-
- if (checkArgCountAtMost(*this, TheCall, NumArgs))
- return true;
-
- // Domain value should be compile-time constant.
- // 2 <= domain <= 5
- if (TheCall->getNumArgs() == NumArgs &&
- BuiltinConstantArgRange(TheCall, NumArgs - 1, 2, 5))
- return true;
-
- Expr *PointerArg = TheCall->getArg(0);
- ExprResult PointerArgResult =
- DefaultFunctionArrayLvalueConversion(PointerArg);
-
- if (PointerArgResult.isInvalid())
- return true;
- PointerArg = PointerArgResult.get();
-
- const PointerType *PtrType = PointerArg->getType()->getAs<PointerType>();
- if (!PtrType) {
- Diag(DRE->getBeginLoc(), diag::err_nontemporal_builtin_must_be_pointer)
- << PointerArg->getType() << PointerArg->getSourceRange();
- return true;
- }
-
- QualType ValType = PtrType->getPointeeType();
- ValType = ValType.getUnqualifiedType();
- if (!ValType->isIntegerType() && !ValType->isAnyPointerType() &&
- !ValType->isBlockPointerType() && !ValType->isFloatingType() &&
- !ValType->isVectorType() && !ValType->isRVVSizelessBuiltinType()) {
- Diag(DRE->getBeginLoc(),
- diag::err_nontemporal_builtin_must_be_pointer_intfltptr_or_vector)
- << PointerArg->getType() << PointerArg->getSourceRange();
- return true;
- }
-
- if (!IsStore) {
- TheCall->setType(ValType);
- return false;
- }
-
- ExprResult ValArg = TheCall->getArg(1);
- InitializedEntity Entity = InitializedEntity::InitializeParameter(
- Context, ValType, /*consume*/ false);
- ValArg = PerformCopyInitialization(Entity, SourceLocation(), ValArg);
- if (ValArg.isInvalid())
- return true;
-
- TheCall->setArg(1, ValArg.get());
- TheCall->setType(Context.VoidTy);
- return false;
- }
-
- return false;
-}
-
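The RISC-V block removed above enforces two properties: the vector floating-point builtins with an explicit rounding-mode operand must pass a constant frm value in [0, 4] (at argument index 3 or 4, depending on the variant), and the nontemporal load/store builtins take an optional trailing domain constant in [2, 5]. The sketch below is illustrative only, assuming a RISC-V target with the Zihintntl extension enabled (e.g. --target=riscv64 -march=rv64gc_zihintntl); the function name is invented.

    // Illustration of what the removed domain check accepts and rejects.
    int ntl_example(int *p, int v) {
      int a = __builtin_riscv_ntl_load(p);      // domain operand may be omitted
      int b = __builtin_riscv_ntl_load(p, 2);   // 2 <= domain <= 5: accepted
      __builtin_riscv_ntl_store(p, v, 5);       // upper end of the range
      // __builtin_riscv_ntl_store(p, v, 7);    // rejected: domain outside [2, 5]
      return a + b;
    }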
bool Sema::CheckSystemZBuiltinFunctionCall(unsigned BuiltinID,
CallExpr *TheCall) {
if (BuiltinID == SystemZ::BI__builtin_tabort) {
@@ -6708,38 +5872,6 @@ bool Sema::CheckWebAssemblyBuiltinFunctionCall(const TargetInfo &TI,
return false;
}
-void Sema::checkRVVTypeSupport(QualType Ty, SourceLocation Loc, Decl *D,
- const llvm::StringMap<bool> &FeatureMap) {
- ASTContext::BuiltinVectorTypeInfo Info =
- Context.getBuiltinVectorTypeInfo(Ty->castAs<BuiltinType>());
- unsigned EltSize = Context.getTypeSize(Info.ElementType);
- unsigned MinElts = Info.EC.getKnownMinValue();
-
- if (Info.ElementType->isSpecificBuiltinType(BuiltinType::Double) &&
- !FeatureMap.lookup("zve64d"))
- Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zve64d";
- // (ELEN, LMUL) pairs of (8, mf8), (16, mf4), (32, mf2), and (64, m1) require
- // at least zve64x.
- else if (((EltSize == 64 && Info.ElementType->isIntegerType()) ||
- MinElts == 1) &&
- !FeatureMap.lookup("zve64x"))
- Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zve64x";
- else if (Info.ElementType->isFloat16Type() && !FeatureMap.lookup("zvfh") &&
- !FeatureMap.lookup("zvfhmin"))
- Diag(Loc, diag::err_riscv_type_requires_extension, D)
- << Ty << "zvfh or zvfhmin";
- else if (Info.ElementType->isBFloat16Type() &&
- !FeatureMap.lookup("experimental-zvfbfmin"))
- Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zvfbfmin";
- else if (Info.ElementType->isSpecificBuiltinType(BuiltinType::Float) &&
- !FeatureMap.lookup("zve32f"))
- Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zve32f";
- // The caller already checked isRVVType() before calling this function, so if
- // zve32x is not supported we need to emit an error.
- else if (!FeatureMap.lookup("zve32x"))
- Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zve32x";
-}
-
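checkRVVTypeSupport, removed here as part of the move into SemaRISCV, walks a ladder of extension requirements: double elements need zve64d; 64-bit integer elements, or shapes whose known minimum element count is 1 (the (8, mf8) through (64, m1) pairs), need zve64x; _Float16 needs zvfh or zvfhmin; __bf16 needs zvfbfmin; float needs zve32f; everything else still needs at least zve32x. The following is a standalone mirror of that ladder for illustration, not the clang API: it uses std::map instead of llvm::StringMap, and the parameter names are invented.

    #include <map>
    #include <string>

    // Returns the extension name the type requires, or "" if zve32x suffices.
    std::string requiredExtension(const std::map<std::string, bool> &Features,
                                  unsigned EltSize, unsigned MinElts,
                                  bool IsDouble, bool IsFloat, bool IsFloat16,
                                  bool IsBFloat16, bool IsInteger) {
      auto has = [&](const char *F) {
        auto It = Features.find(F);
        return It != Features.end() && It->second;
      };
      if (IsDouble && !has("zve64d"))
        return "zve64d";
      // 64-bit integer elements, or the (ELEN, LMUL) pairs whose known minimum
      // element count is 1, require at least zve64x.
      if (((EltSize == 64 && IsInteger) || MinElts == 1) && !has("zve64x"))
        return "zve64x";
      if (IsFloat16 && !has("zvfh") && !has("zvfhmin"))
        return "zvfh or zvfhmin";
      if (IsBFloat16 && !has("experimental-zvfbfmin"))
        return "zvfbfmin";
      if (IsFloat && !has("zve32f"))
        return "zve32f";
      if (!has("zve32x"))
        return "zve32x";
      return "";
    }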
bool Sema::CheckNVPTXBuiltinFunctionCall(const TargetInfo &TI,
unsigned BuiltinID,
CallExpr *TheCall) {
@@ -6748,862 +5880,12 @@ bool Sema::CheckNVPTXBuiltinFunctionCall(const TargetInfo &TI,
case NVPTX::BI__nvvm_cp_async_ca_shared_global_8:
case NVPTX::BI__nvvm_cp_async_ca_shared_global_16:
case NVPTX::BI__nvvm_cp_async_cg_shared_global_16:
- return checkArgCountAtMost(*this, TheCall, 3);
- }
-
- return false;
-}
-
-// Check if the rounding mode is legal.
-bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) {
- // Indicates if this instruction has rounding control or just SAE.
- bool HasRC = false;
-
- unsigned ArgNum = 0;
- switch (BuiltinID) {
- default:
- return false;
- case X86::BI__builtin_ia32_vcvttsd2si32:
- case X86::BI__builtin_ia32_vcvttsd2si64:
- case X86::BI__builtin_ia32_vcvttsd2usi32:
- case X86::BI__builtin_ia32_vcvttsd2usi64:
- case X86::BI__builtin_ia32_vcvttss2si32:
- case X86::BI__builtin_ia32_vcvttss2si64:
- case X86::BI__builtin_ia32_vcvttss2usi32:
- case X86::BI__builtin_ia32_vcvttss2usi64:
- case X86::BI__builtin_ia32_vcvttsh2si32:
- case X86::BI__builtin_ia32_vcvttsh2si64:
- case X86::BI__builtin_ia32_vcvttsh2usi32:
- case X86::BI__builtin_ia32_vcvttsh2usi64:
- ArgNum = 1;
- break;
- case X86::BI__builtin_ia32_maxpd512:
- case X86::BI__builtin_ia32_maxps512:
- case X86::BI__builtin_ia32_minpd512:
- case X86::BI__builtin_ia32_minps512:
- case X86::BI__builtin_ia32_maxph512:
- case X86::BI__builtin_ia32_minph512:
- ArgNum = 2;
- break;
- case X86::BI__builtin_ia32_vcvtph2pd512_mask:
- case X86::BI__builtin_ia32_vcvtph2psx512_mask:
- case X86::BI__builtin_ia32_cvtps2pd512_mask:
- case X86::BI__builtin_ia32_cvttpd2dq512_mask:
- case X86::BI__builtin_ia32_cvttpd2qq512_mask:
- case X86::BI__builtin_ia32_cvttpd2udq512_mask:
- case X86::BI__builtin_ia32_cvttpd2uqq512_mask:
- case X86::BI__builtin_ia32_cvttps2dq512_mask:
- case X86::BI__builtin_ia32_cvttps2qq512_mask:
- case X86::BI__builtin_ia32_cvttps2udq512_mask:
- case X86::BI__builtin_ia32_cvttps2uqq512_mask:
- case X86::BI__builtin_ia32_vcvttph2w512_mask:
- case X86::BI__builtin_ia32_vcvttph2uw512_mask:
- case X86::BI__builtin_ia32_vcvttph2dq512_mask:
- case X86::BI__builtin_ia32_vcvttph2udq512_mask:
- case X86::BI__builtin_ia32_vcvttph2qq512_mask:
- case X86::BI__builtin_ia32_vcvttph2uqq512_mask:
- case X86::BI__builtin_ia32_exp2pd_mask:
- case X86::BI__builtin_ia32_exp2ps_mask:
- case X86::BI__builtin_ia32_getexppd512_mask:
- case X86::BI__builtin_ia32_getexpps512_mask:
- case X86::BI__builtin_ia32_getexpph512_mask:
- case X86::BI__builtin_ia32_rcp28pd_mask:
- case X86::BI__builtin_ia32_rcp28ps_mask:
- case X86::BI__builtin_ia32_rsqrt28pd_mask:
- case X86::BI__builtin_ia32_rsqrt28ps_mask:
- case X86::BI__builtin_ia32_vcomisd:
- case X86::BI__builtin_ia32_vcomiss:
- case X86::BI__builtin_ia32_vcomish:
- case X86::BI__builtin_ia32_vcvtph2ps512_mask:
- ArgNum = 3;
- break;
- case X86::BI__builtin_ia32_cmppd512_mask:
- case X86::BI__builtin_ia32_cmpps512_mask:
- case X86::BI__builtin_ia32_cmpsd_mask:
- case X86::BI__builtin_ia32_cmpss_mask:
- case X86::BI__builtin_ia32_cmpsh_mask:
- case X86::BI__builtin_ia32_vcvtsh2sd_round_mask:
- case X86::BI__builtin_ia32_vcvtsh2ss_round_mask:
- case X86::BI__builtin_ia32_cvtss2sd_round_mask:
- case X86::BI__builtin_ia32_getexpsd128_round_mask:
- case X86::BI__builtin_ia32_getexpss128_round_mask:
- case X86::BI__builtin_ia32_getexpsh128_round_mask:
- case X86::BI__builtin_ia32_getmantpd512_mask:
- case X86::BI__builtin_ia32_getmantps512_mask:
- case X86::BI__builtin_ia32_getmantph512_mask:
- case X86::BI__builtin_ia32_maxsd_round_mask:
- case X86::BI__builtin_ia32_maxss_round_mask:
- case X86::BI__builtin_ia32_maxsh_round_mask:
- case X86::BI__builtin_ia32_minsd_round_mask:
- case X86::BI__builtin_ia32_minss_round_mask:
- case X86::BI__builtin_ia32_minsh_round_mask:
- case X86::BI__builtin_ia32_rcp28sd_round_mask:
- case X86::BI__builtin_ia32_rcp28ss_round_mask:
- case X86::BI__builtin_ia32_reducepd512_mask:
- case X86::BI__builtin_ia32_reduceps512_mask:
- case X86::BI__builtin_ia32_reduceph512_mask:
- case X86::BI__builtin_ia32_rndscalepd_mask:
- case X86::BI__builtin_ia32_rndscaleps_mask:
- case X86::BI__builtin_ia32_rndscaleph_mask:
- case X86::BI__builtin_ia32_rsqrt28sd_round_mask:
- case X86::BI__builtin_ia32_rsqrt28ss_round_mask:
- ArgNum = 4;
- break;
- case X86::BI__builtin_ia32_fixupimmpd512_mask:
- case X86::BI__builtin_ia32_fixupimmpd512_maskz:
- case X86::BI__builtin_ia32_fixupimmps512_mask:
- case X86::BI__builtin_ia32_fixupimmps512_maskz:
- case X86::BI__builtin_ia32_fixupimmsd_mask:
- case X86::BI__builtin_ia32_fixupimmsd_maskz:
- case X86::BI__builtin_ia32_fixupimmss_mask:
- case X86::BI__builtin_ia32_fixupimmss_maskz:
- case X86::BI__builtin_ia32_getmantsd_round_mask:
- case X86::BI__builtin_ia32_getmantss_round_mask:
- case X86::BI__builtin_ia32_getmantsh_round_mask:
- case X86::BI__builtin_ia32_rangepd512_mask:
- case X86::BI__builtin_ia32_rangeps512_mask:
- case X86::BI__builtin_ia32_rangesd128_round_mask:
- case X86::BI__builtin_ia32_rangess128_round_mask:
- case X86::BI__builtin_ia32_reducesd_mask:
- case X86::BI__builtin_ia32_reducess_mask:
- case X86::BI__builtin_ia32_reducesh_mask:
- case X86::BI__builtin_ia32_rndscalesd_round_mask:
- case X86::BI__builtin_ia32_rndscaless_round_mask:
- case X86::BI__builtin_ia32_rndscalesh_round_mask:
- ArgNum = 5;
- break;
- case X86::BI__builtin_ia32_vcvtsd2si64:
- case X86::BI__builtin_ia32_vcvtsd2si32:
- case X86::BI__builtin_ia32_vcvtsd2usi32:
- case X86::BI__builtin_ia32_vcvtsd2usi64:
- case X86::BI__builtin_ia32_vcvtss2si32:
- case X86::BI__builtin_ia32_vcvtss2si64:
- case X86::BI__builtin_ia32_vcvtss2usi32:
- case X86::BI__builtin_ia32_vcvtss2usi64:
- case X86::BI__builtin_ia32_vcvtsh2si32:
- case X86::BI__builtin_ia32_vcvtsh2si64:
- case X86::BI__builtin_ia32_vcvtsh2usi32:
- case X86::BI__builtin_ia32_vcvtsh2usi64:
- case X86::BI__builtin_ia32_sqrtpd512:
- case X86::BI__builtin_ia32_sqrtps512:
- case X86::BI__builtin_ia32_sqrtph512:
- ArgNum = 1;
- HasRC = true;
- break;
- case X86::BI__builtin_ia32_addph512:
- case X86::BI__builtin_ia32_divph512:
- case X86::BI__builtin_ia32_mulph512:
- case X86::BI__builtin_ia32_subph512:
- case X86::BI__builtin_ia32_addpd512:
- case X86::BI__builtin_ia32_addps512:
- case X86::BI__builtin_ia32_divpd512:
- case X86::BI__builtin_ia32_divps512:
- case X86::BI__builtin_ia32_mulpd512:
- case X86::BI__builtin_ia32_mulps512:
- case X86::BI__builtin_ia32_subpd512:
- case X86::BI__builtin_ia32_subps512:
- case X86::BI__builtin_ia32_cvtsi2sd64:
- case X86::BI__builtin_ia32_cvtsi2ss32:
- case X86::BI__builtin_ia32_cvtsi2ss64:
- case X86::BI__builtin_ia32_cvtusi2sd64:
- case X86::BI__builtin_ia32_cvtusi2ss32:
- case X86::BI__builtin_ia32_cvtusi2ss64:
- case X86::BI__builtin_ia32_vcvtusi2sh:
- case X86::BI__builtin_ia32_vcvtusi642sh:
- case X86::BI__builtin_ia32_vcvtsi2sh:
- case X86::BI__builtin_ia32_vcvtsi642sh:
- ArgNum = 2;
- HasRC = true;
- break;
- case X86::BI__builtin_ia32_cvtdq2ps512_mask:
- case X86::BI__builtin_ia32_cvtudq2ps512_mask:
- case X86::BI__builtin_ia32_vcvtpd2ph512_mask:
- case X86::BI__builtin_ia32_vcvtps2phx512_mask:
- case X86::BI__builtin_ia32_cvtpd2ps512_mask:
- case X86::BI__builtin_ia32_cvtpd2dq512_mask:
- case X86::BI__builtin_ia32_cvtpd2qq512_mask:
- case X86::BI__builtin_ia32_cvtpd2udq512_mask:
- case X86::BI__builtin_ia32_cvtpd2uqq512_mask:
- case X86::BI__builtin_ia32_cvtps2dq512_mask:
- case X86::BI__builtin_ia32_cvtps2qq512_mask:
- case X86::BI__builtin_ia32_cvtps2udq512_mask:
- case X86::BI__builtin_ia32_cvtps2uqq512_mask:
- case X86::BI__builtin_ia32_cvtqq2pd512_mask:
- case X86::BI__builtin_ia32_cvtqq2ps512_mask:
- case X86::BI__builtin_ia32_cvtuqq2pd512_mask:
- case X86::BI__builtin_ia32_cvtuqq2ps512_mask:
- case X86::BI__builtin_ia32_vcvtdq2ph512_mask:
- case X86::BI__builtin_ia32_vcvtudq2ph512_mask:
- case X86::BI__builtin_ia32_vcvtw2ph512_mask:
- case X86::BI__builtin_ia32_vcvtuw2ph512_mask:
- case X86::BI__builtin_ia32_vcvtph2w512_mask:
- case X86::BI__builtin_ia32_vcvtph2uw512_mask:
- case X86::BI__builtin_ia32_vcvtph2dq512_mask:
- case X86::BI__builtin_ia32_vcvtph2udq512_mask:
- case X86::BI__builtin_ia32_vcvtph2qq512_mask:
- case X86::BI__builtin_ia32_vcvtph2uqq512_mask:
- case X86::BI__builtin_ia32_vcvtqq2ph512_mask:
- case X86::BI__builtin_ia32_vcvtuqq2ph512_mask:
- ArgNum = 3;
- HasRC = true;
- break;
- case X86::BI__builtin_ia32_addsh_round_mask:
- case X86::BI__builtin_ia32_addss_round_mask:
- case X86::BI__builtin_ia32_addsd_round_mask:
- case X86::BI__builtin_ia32_divsh_round_mask:
- case X86::BI__builtin_ia32_divss_round_mask:
- case X86::BI__builtin_ia32_divsd_round_mask:
- case X86::BI__builtin_ia32_mulsh_round_mask:
- case X86::BI__builtin_ia32_mulss_round_mask:
- case X86::BI__builtin_ia32_mulsd_round_mask:
- case X86::BI__builtin_ia32_subsh_round_mask:
- case X86::BI__builtin_ia32_subss_round_mask:
- case X86::BI__builtin_ia32_subsd_round_mask:
- case X86::BI__builtin_ia32_scalefph512_mask:
- case X86::BI__builtin_ia32_scalefpd512_mask:
- case X86::BI__builtin_ia32_scalefps512_mask:
- case X86::BI__builtin_ia32_scalefsd_round_mask:
- case X86::BI__builtin_ia32_scalefss_round_mask:
- case X86::BI__builtin_ia32_scalefsh_round_mask:
- case X86::BI__builtin_ia32_cvtsd2ss_round_mask:
- case X86::BI__builtin_ia32_vcvtss2sh_round_mask:
- case X86::BI__builtin_ia32_vcvtsd2sh_round_mask:
- case X86::BI__builtin_ia32_sqrtsd_round_mask:
- case X86::BI__builtin_ia32_sqrtss_round_mask:
- case X86::BI__builtin_ia32_sqrtsh_round_mask:
- case X86::BI__builtin_ia32_vfmaddsd3_mask:
- case X86::BI__builtin_ia32_vfmaddsd3_maskz:
- case X86::BI__builtin_ia32_vfmaddsd3_mask3:
- case X86::BI__builtin_ia32_vfmaddss3_mask:
- case X86::BI__builtin_ia32_vfmaddss3_maskz:
- case X86::BI__builtin_ia32_vfmaddss3_mask3:
- case X86::BI__builtin_ia32_vfmaddsh3_mask:
- case X86::BI__builtin_ia32_vfmaddsh3_maskz:
- case X86::BI__builtin_ia32_vfmaddsh3_mask3:
- case X86::BI__builtin_ia32_vfmaddpd512_mask:
- case X86::BI__builtin_ia32_vfmaddpd512_maskz:
- case X86::BI__builtin_ia32_vfmaddpd512_mask3:
- case X86::BI__builtin_ia32_vfmsubpd512_mask3:
- case X86::BI__builtin_ia32_vfmaddps512_mask:
- case X86::BI__builtin_ia32_vfmaddps512_maskz:
- case X86::BI__builtin_ia32_vfmaddps512_mask3:
- case X86::BI__builtin_ia32_vfmsubps512_mask3:
- case X86::BI__builtin_ia32_vfmaddph512_mask:
- case X86::BI__builtin_ia32_vfmaddph512_maskz:
- case X86::BI__builtin_ia32_vfmaddph512_mask3:
- case X86::BI__builtin_ia32_vfmsubph512_mask3:
- case X86::BI__builtin_ia32_vfmaddsubpd512_mask:
- case X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
- case X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
- case X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
- case X86::BI__builtin_ia32_vfmaddsubps512_mask:
- case X86::BI__builtin_ia32_vfmaddsubps512_maskz:
- case X86::BI__builtin_ia32_vfmaddsubps512_mask3:
- case X86::BI__builtin_ia32_vfmsubaddps512_mask3:
- case X86::BI__builtin_ia32_vfmaddsubph512_mask:
- case X86::BI__builtin_ia32_vfmaddsubph512_maskz:
- case X86::BI__builtin_ia32_vfmaddsubph512_mask3:
- case X86::BI__builtin_ia32_vfmsubaddph512_mask3:
- case X86::BI__builtin_ia32_vfmaddcsh_mask:
- case X86::BI__builtin_ia32_vfmaddcsh_round_mask:
- case X86::BI__builtin_ia32_vfmaddcsh_round_mask3:
- case X86::BI__builtin_ia32_vfmaddcph512_mask:
- case X86::BI__builtin_ia32_vfmaddcph512_maskz:
- case X86::BI__builtin_ia32_vfmaddcph512_mask3:
- case X86::BI__builtin_ia32_vfcmaddcsh_mask:
- case X86::BI__builtin_ia32_vfcmaddcsh_round_mask:
- case X86::BI__builtin_ia32_vfcmaddcsh_round_mask3:
- case X86::BI__builtin_ia32_vfcmaddcph512_mask:
- case X86::BI__builtin_ia32_vfcmaddcph512_maskz:
- case X86::BI__builtin_ia32_vfcmaddcph512_mask3:
- case X86::BI__builtin_ia32_vfmulcsh_mask:
- case X86::BI__builtin_ia32_vfmulcph512_mask:
- case X86::BI__builtin_ia32_vfcmulcsh_mask:
- case X86::BI__builtin_ia32_vfcmulcph512_mask:
- ArgNum = 4;
- HasRC = true;
- break;
- }
-
- llvm::APSInt Result;
-
- // We can't check the value of a dependent argument.
- Expr *Arg = TheCall->getArg(ArgNum);
- if (Arg->isTypeDependent() || Arg->isValueDependent())
- return false;
-
- // Check constant-ness first.
- if (BuiltinConstantArg(TheCall, ArgNum, Result))
- return true;
-
- // Make sure the rounding mode is either ROUND_CUR_DIRECTION or has the
- // ROUND_NO_EXC bit set. If the intrinsic has rounding control (bits 1:0),
- // make sure it is only combined with ROUND_NO_EXC. If the intrinsic does not
- // have rounding control, allow ROUND_NO_EXC and ROUND_CUR_DIRECTION together.
- if (Result == 4/*ROUND_CUR_DIRECTION*/ ||
- Result == 8/*ROUND_NO_EXC*/ ||
- (!HasRC && Result == 12/*ROUND_CUR_DIRECTION|ROUND_NO_EXC*/) ||
- (HasRC && Result.getZExtValue() >= 8 && Result.getZExtValue() <= 11))
- return false;
-
- return Diag(TheCall->getBeginLoc(), diag::err_x86_builtin_invalid_rounding)
- << Arg->getSourceRange();
-}
-
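CheckX86BuiltinRoundingOrSAE, removed above, validates the immediate rounding/SAE operand of the AVX-512 *_round_* builtins: SAE-only intrinsics accept _MM_FROUND_CUR_DIRECTION (4), _MM_FROUND_NO_EXC (8), or both (12), while intrinsics with full rounding control accept 4 or a rounding mode ORed with _MM_FROUND_NO_EXC (values 8 through 11). An illustrative sketch, assuming AVX-512F is enabled (-mavx512f); the wrapper function names are invented, the intrinsics are the standard ones.

    #include <immintrin.h>

    __m512d add_rounded(__m512d a, __m512d b) {
      // Rounding-control intrinsic: the RC bits must be combined with
      // _MM_FROUND_NO_EXC, giving an immediate in [8, 11].
      return _mm512_add_round_pd(a, b,
                                 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    }

    __m512d max_sae(__m512d a, __m512d b) {
      // SAE-only intrinsic: 4, 8, or 12 are accepted.
      return _mm512_max_round_pd(a, b, _MM_FROUND_NO_EXC);
    }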
-// Check if the gather/scatter scale is legal.
-bool Sema::CheckX86BuiltinGatherScatterScale(unsigned BuiltinID,
- CallExpr *TheCall) {
- unsigned ArgNum = 0;
- switch (BuiltinID) {
- default:
- return false;
- case X86::BI__builtin_ia32_gatherpfdpd:
- case X86::BI__builtin_ia32_gatherpfdps:
- case X86::BI__builtin_ia32_gatherpfqpd:
- case X86::BI__builtin_ia32_gatherpfqps:
- case X86::BI__builtin_ia32_scatterpfdpd:
- case X86::BI__builtin_ia32_scatterpfdps:
- case X86::BI__builtin_ia32_scatterpfqpd:
- case X86::BI__builtin_ia32_scatterpfqps:
- ArgNum = 3;
- break;
- case X86::BI__builtin_ia32_gatherd_pd:
- case X86::BI__builtin_ia32_gatherd_pd256:
- case X86::BI__builtin_ia32_gatherq_pd:
- case X86::BI__builtin_ia32_gatherq_pd256:
- case X86::BI__builtin_ia32_gatherd_ps:
- case X86::BI__builtin_ia32_gatherd_ps256:
- case X86::BI__builtin_ia32_gatherq_ps:
- case X86::BI__builtin_ia32_gatherq_ps256:
- case X86::BI__builtin_ia32_gatherd_q:
- case X86::BI__builtin_ia32_gatherd_q256:
- case X86::BI__builtin_ia32_gatherq_q:
- case X86::BI__builtin_ia32_gatherq_q256:
- case X86::BI__builtin_ia32_gatherd_d:
- case X86::BI__builtin_ia32_gatherd_d256:
- case X86::BI__builtin_ia32_gatherq_d:
- case X86::BI__builtin_ia32_gatherq_d256:
- case X86::BI__builtin_ia32_gather3div2df:
- case X86::BI__builtin_ia32_gather3div2di:
- case X86::BI__builtin_ia32_gather3div4df:
- case X86::BI__builtin_ia32_gather3div4di:
- case X86::BI__builtin_ia32_gather3div4sf:
- case X86::BI__builtin_ia32_gather3div4si:
- case X86::BI__builtin_ia32_gather3div8sf:
- case X86::BI__builtin_ia32_gather3div8si:
- case X86::BI__builtin_ia32_gather3siv2df:
- case X86::BI__builtin_ia32_gather3siv2di:
- case X86::BI__builtin_ia32_gather3siv4df:
- case X86::BI__builtin_ia32_gather3siv4di:
- case X86::BI__builtin_ia32_gather3siv4sf:
- case X86::BI__builtin_ia32_gather3siv4si:
- case X86::BI__builtin_ia32_gather3siv8sf:
- case X86::BI__builtin_ia32_gather3siv8si:
- case X86::BI__builtin_ia32_gathersiv8df:
- case X86::BI__builtin_ia32_gathersiv16sf:
- case X86::BI__builtin_ia32_gatherdiv8df:
- case X86::BI__builtin_ia32_gatherdiv16sf:
- case X86::BI__builtin_ia32_gathersiv8di:
- case X86::BI__builtin_ia32_gathersiv16si:
- case X86::BI__builtin_ia32_gatherdiv8di:
- case X86::BI__builtin_ia32_gatherdiv16si:
- case X86::BI__builtin_ia32_scatterdiv2df:
- case X86::BI__builtin_ia32_scatterdiv2di:
- case X86::BI__builtin_ia32_scatterdiv4df:
- case X86::BI__builtin_ia32_scatterdiv4di:
- case X86::BI__builtin_ia32_scatterdiv4sf:
- case X86::BI__builtin_ia32_scatterdiv4si:
- case X86::BI__builtin_ia32_scatterdiv8sf:
- case X86::BI__builtin_ia32_scatterdiv8si:
- case X86::BI__builtin_ia32_scattersiv2df:
- case X86::BI__builtin_ia32_scattersiv2di:
- case X86::BI__builtin_ia32_scattersiv4df:
- case X86::BI__builtin_ia32_scattersiv4di:
- case X86::BI__builtin_ia32_scattersiv4sf:
- case X86::BI__builtin_ia32_scattersiv4si:
- case X86::BI__builtin_ia32_scattersiv8sf:
- case X86::BI__builtin_ia32_scattersiv8si:
- case X86::BI__builtin_ia32_scattersiv8df:
- case X86::BI__builtin_ia32_scattersiv16sf:
- case X86::BI__builtin_ia32_scatterdiv8df:
- case X86::BI__builtin_ia32_scatterdiv16sf:
- case X86::BI__builtin_ia32_scattersiv8di:
- case X86::BI__builtin_ia32_scattersiv16si:
- case X86::BI__builtin_ia32_scatterdiv8di:
- case X86::BI__builtin_ia32_scatterdiv16si:
- ArgNum = 4;
- break;
- }
-
- llvm::APSInt Result;
-
- // We can't check the value of a dependent argument.
- Expr *Arg = TheCall->getArg(ArgNum);
- if (Arg->isTypeDependent() || Arg->isValueDependent())
- return false;
-
- // Check constant-ness first.
- if (BuiltinConstantArg(TheCall, ArgNum, Result))
- return true;
-
- if (Result == 1 || Result == 2 || Result == 4 || Result == 8)
- return false;
-
- return Diag(TheCall->getBeginLoc(), diag::err_x86_builtin_invalid_scale)
- << Arg->getSourceRange();
-}
-
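The gather/scatter scale check removed above accepts only byte scales of 1, 2, 4, or 8. A short illustrative example (AVX-512F assumed; the wrapper name is invented):

    #include <immintrin.h>

    __m512 gather_floats(const float *base, __m512i idx) {
      return _mm512_i32gather_ps(idx, base, 4);    // accepted: 4-byte scale
      // return _mm512_i32gather_ps(idx, base, 3); // rejected: invalid scale
    }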
-enum { TileRegLow = 0, TileRegHigh = 7 };
-
-bool Sema::CheckX86BuiltinTileArgumentsRange(CallExpr *TheCall,
- ArrayRef<int> ArgNums) {
- for (int ArgNum : ArgNums) {
- if (BuiltinConstantArgRange(TheCall, ArgNum, TileRegLow, TileRegHigh))
- return true;
- }
- return false;
-}
-
-bool Sema::CheckX86BuiltinTileDuplicate(CallExpr *TheCall,
- ArrayRef<int> ArgNums) {
- // Because the maximum number of tile registers is TileRegHigh + 1, use one
- // bit per register in the bitset to track which registers are used.
- std::bitset<TileRegHigh + 1> ArgValues;
- for (int ArgNum : ArgNums) {
- Expr *Arg = TheCall->getArg(ArgNum);
- if (Arg->isTypeDependent() || Arg->isValueDependent())
- continue;
-
- llvm::APSInt Result;
- if (BuiltinConstantArg(TheCall, ArgNum, Result))
- return true;
- int ArgExtValue = Result.getExtValue();
- assert((ArgExtValue >= TileRegLow && ArgExtValue <= TileRegHigh) &&
- "Incorrect tile register num.");
- if (ArgValues.test(ArgExtValue))
- return Diag(TheCall->getBeginLoc(),
- diag::err_x86_builtin_tile_arg_duplicate)
- << TheCall->getArg(ArgNum)->getSourceRange();
- ArgValues.set(ArgExtValue);
- }
- return false;
-}
-
-bool Sema::CheckX86BuiltinTileRangeAndDuplicate(CallExpr *TheCall,
- ArrayRef<int> ArgNums) {
- return CheckX86BuiltinTileArgumentsRange(TheCall, ArgNums) ||
- CheckX86BuiltinTileDuplicate(TheCall, ArgNums);
-}
-
-bool Sema::CheckX86BuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) {
- switch (BuiltinID) {
- default:
- return false;
- case X86::BI__builtin_ia32_tileloadd64:
- case X86::BI__builtin_ia32_tileloaddt164:
- case X86::BI__builtin_ia32_tilestored64:
- case X86::BI__builtin_ia32_tilezero:
- return CheckX86BuiltinTileArgumentsRange(TheCall, 0);
- case X86::BI__builtin_ia32_tdpbssd:
- case X86::BI__builtin_ia32_tdpbsud:
- case X86::BI__builtin_ia32_tdpbusd:
- case X86::BI__builtin_ia32_tdpbuud:
- case X86::BI__builtin_ia32_tdpbf16ps:
- case X86::BI__builtin_ia32_tdpfp16ps:
- case X86::BI__builtin_ia32_tcmmimfp16ps:
- case X86::BI__builtin_ia32_tcmmrlfp16ps:
- return CheckX86BuiltinTileRangeAndDuplicate(TheCall, {0, 1, 2});
- }
-}
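The AMX tile checks removed above require every tile operand to be a constant in [0, 7], and the dot-product style builtins additionally reject duplicate tile registers. The snippet below is illustrative only, assuming -mamx-tile and -mamx-int8; run-time tile configuration is deliberately omitted and the function name is invented.

    #include <immintrin.h>

    void tile_example() {
      _tile_zero(0);            // tile number within [0, 7]
      _tile_dpbssd(0, 1, 2);    // accepted: three distinct tile registers
      // _tile_dpbssd(0, 0, 1); // rejected: duplicate tile operand
      // _tile_zero(8);         // rejected: tile number out of range
    }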
-static bool isX86_32Builtin(unsigned BuiltinID) {
- // These builtins only work on x86-32 targets.
- switch (BuiltinID) {
- case X86::BI__builtin_ia32_readeflags_u32:
- case X86::BI__builtin_ia32_writeeflags_u32:
- return true;
+ return checkArgCountAtMost(TheCall, 3);
}
return false;
}
-bool Sema::CheckX86BuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
- CallExpr *TheCall) {
- // Check for 32-bit only builtins on a 64-bit target.
- const llvm::Triple &TT = TI.getTriple();
- if (TT.getArch() != llvm::Triple::x86 && isX86_32Builtin(BuiltinID))
- return Diag(TheCall->getCallee()->getBeginLoc(),
- diag::err_32_bit_builtin_64_bit_tgt);
-
- // If the intrinsic has rounding or SAE, make sure it is valid.
- if (CheckX86BuiltinRoundingOrSAE(BuiltinID, TheCall))
- return true;
-
- // If the intrinsic has a gather/scatter scale immediate, make sure it is valid.
- if (CheckX86BuiltinGatherScatterScale(BuiltinID, TheCall))
- return true;
-
- // If the intrinsic has tile arguments, make sure they are valid.
- if (CheckX86BuiltinTileArguments(BuiltinID, TheCall))
- return true;
-
- // For intrinsics which take an immediate value as part of the instruction,
- // range check them here.
- int i = 0, l = 0, u = 0;
- switch (BuiltinID) {
- default:
- return false;
- case X86::BI__builtin_ia32_vec_ext_v2si:
- case X86::BI__builtin_ia32_vec_ext_v2di:
- case X86::BI__builtin_ia32_vextractf128_pd256:
- case X86::BI__builtin_ia32_vextractf128_ps256:
- case X86::BI__builtin_ia32_vextractf128_si256:
- case X86::BI__builtin_ia32_extract128i256:
- case X86::BI__builtin_ia32_extractf64x4_mask:
- case X86::BI__builtin_ia32_extracti64x4_mask:
- case X86::BI__builtin_ia32_extractf32x8_mask:
- case X86::BI__builtin_ia32_extracti32x8_mask:
- case X86::BI__builtin_ia32_extractf64x2_256_mask:
- case X86::BI__builtin_ia32_extracti64x2_256_mask:
- case X86::BI__builtin_ia32_extractf32x4_256_mask:
- case X86::BI__builtin_ia32_extracti32x4_256_mask:
- i = 1; l = 0; u = 1;
- break;
- case X86::BI__builtin_ia32_vec_set_v2di:
- case X86::BI__builtin_ia32_vinsertf128_pd256:
- case X86::BI__builtin_ia32_vinsertf128_ps256:
- case X86::BI__builtin_ia32_vinsertf128_si256:
- case X86::BI__builtin_ia32_insert128i256:
- case X86::BI__builtin_ia32_insertf32x8:
- case X86::BI__builtin_ia32_inserti32x8:
- case X86::BI__builtin_ia32_insertf64x4:
- case X86::BI__builtin_ia32_inserti64x4:
- case X86::BI__builtin_ia32_insertf64x2_256:
- case X86::BI__builtin_ia32_inserti64x2_256:
- case X86::BI__builtin_ia32_insertf32x4_256:
- case X86::BI__builtin_ia32_inserti32x4_256:
- i = 2; l = 0; u = 1;
- break;
- case X86::BI__builtin_ia32_vpermilpd:
- case X86::BI__builtin_ia32_vec_ext_v4hi:
- case X86::BI__builtin_ia32_vec_ext_v4si:
- case X86::BI__builtin_ia32_vec_ext_v4sf:
- case X86::BI__builtin_ia32_vec_ext_v4di:
- case X86::BI__builtin_ia32_extractf32x4_mask:
- case X86::BI__builtin_ia32_extracti32x4_mask:
- case X86::BI__builtin_ia32_extractf64x2_512_mask:
- case X86::BI__builtin_ia32_extracti64x2_512_mask:
- i = 1; l = 0; u = 3;
- break;
- case X86::BI_mm_prefetch:
- case X86::BI__builtin_ia32_vec_ext_v8hi:
- case X86::BI__builtin_ia32_vec_ext_v8si:
- i = 1; l = 0; u = 7;
- break;
- case X86::BI__builtin_ia32_sha1rnds4:
- case X86::BI__builtin_ia32_blendpd:
- case X86::BI__builtin_ia32_shufpd:
- case X86::BI__builtin_ia32_vec_set_v4hi:
- case X86::BI__builtin_ia32_vec_set_v4si:
- case X86::BI__builtin_ia32_vec_set_v4di:
- case X86::BI__builtin_ia32_shuf_f32x4_256:
- case X86::BI__builtin_ia32_shuf_f64x2_256:
- case X86::BI__builtin_ia32_shuf_i32x4_256:
- case X86::BI__builtin_ia32_shuf_i64x2_256:
- case X86::BI__builtin_ia32_insertf64x2_512:
- case X86::BI__builtin_ia32_inserti64x2_512:
- case X86::BI__builtin_ia32_insertf32x4:
- case X86::BI__builtin_ia32_inserti32x4:
- i = 2; l = 0; u = 3;
- break;
- case X86::BI__builtin_ia32_vpermil2pd:
- case X86::BI__builtin_ia32_vpermil2pd256:
- case X86::BI__builtin_ia32_vpermil2ps:
- case X86::BI__builtin_ia32_vpermil2ps256:
- i = 3; l = 0; u = 3;
- break;
- case X86::BI__builtin_ia32_cmpb128_mask:
- case X86::BI__builtin_ia32_cmpw128_mask:
- case X86::BI__builtin_ia32_cmpd128_mask:
- case X86::BI__builtin_ia32_cmpq128_mask:
- case X86::BI__builtin_ia32_cmpb256_mask:
- case X86::BI__builtin_ia32_cmpw256_mask:
- case X86::BI__builtin_ia32_cmpd256_mask:
- case X86::BI__builtin_ia32_cmpq256_mask:
- case X86::BI__builtin_ia32_cmpb512_mask:
- case X86::BI__builtin_ia32_cmpw512_mask:
- case X86::BI__builtin_ia32_cmpd512_mask:
- case X86::BI__builtin_ia32_cmpq512_mask:
- case X86::BI__builtin_ia32_ucmpb128_mask:
- case X86::BI__builtin_ia32_ucmpw128_mask:
- case X86::BI__builtin_ia32_ucmpd128_mask:
- case X86::BI__builtin_ia32_ucmpq128_mask:
- case X86::BI__builtin_ia32_ucmpb256_mask:
- case X86::BI__builtin_ia32_ucmpw256_mask:
- case X86::BI__builtin_ia32_ucmpd256_mask:
- case X86::BI__builtin_ia32_ucmpq256_mask:
- case X86::BI__builtin_ia32_ucmpb512_mask:
- case X86::BI__builtin_ia32_ucmpw512_mask:
- case X86::BI__builtin_ia32_ucmpd512_mask:
- case X86::BI__builtin_ia32_ucmpq512_mask:
- case X86::BI__builtin_ia32_vpcomub:
- case X86::BI__builtin_ia32_vpcomuw:
- case X86::BI__builtin_ia32_vpcomud:
- case X86::BI__builtin_ia32_vpcomuq:
- case X86::BI__builtin_ia32_vpcomb:
- case X86::BI__builtin_ia32_vpcomw:
- case X86::BI__builtin_ia32_vpcomd:
- case X86::BI__builtin_ia32_vpcomq:
- case X86::BI__builtin_ia32_vec_set_v8hi:
- case X86::BI__builtin_ia32_vec_set_v8si:
- i = 2; l = 0; u = 7;
- break;
- case X86::BI__builtin_ia32_vpermilpd256:
- case X86::BI__builtin_ia32_roundps:
- case X86::BI__builtin_ia32_roundpd:
- case X86::BI__builtin_ia32_roundps256:
- case X86::BI__builtin_ia32_roundpd256:
- case X86::BI__builtin_ia32_getmantpd128_mask:
- case X86::BI__builtin_ia32_getmantpd256_mask:
- case X86::BI__builtin_ia32_getmantps128_mask:
- case X86::BI__builtin_ia32_getmantps256_mask:
- case X86::BI__builtin_ia32_getmantpd512_mask:
- case X86::BI__builtin_ia32_getmantps512_mask:
- case X86::BI__builtin_ia32_getmantph128_mask:
- case X86::BI__builtin_ia32_getmantph256_mask:
- case X86::BI__builtin_ia32_getmantph512_mask:
- case X86::BI__builtin_ia32_vec_ext_v16qi:
- case X86::BI__builtin_ia32_vec_ext_v16hi:
- i = 1; l = 0; u = 15;
- break;
- case X86::BI__builtin_ia32_pblendd128:
- case X86::BI__builtin_ia32_blendps:
- case X86::BI__builtin_ia32_blendpd256:
- case X86::BI__builtin_ia32_shufpd256:
- case X86::BI__builtin_ia32_roundss:
- case X86::BI__builtin_ia32_roundsd:
- case X86::BI__builtin_ia32_rangepd128_mask:
- case X86::BI__builtin_ia32_rangepd256_mask:
- case X86::BI__builtin_ia32_rangepd512_mask:
- case X86::BI__builtin_ia32_rangeps128_mask:
- case X86::BI__builtin_ia32_rangeps256_mask:
- case X86::BI__builtin_ia32_rangeps512_mask:
- case X86::BI__builtin_ia32_getmantsd_round_mask:
- case X86::BI__builtin_ia32_getmantss_round_mask:
- case X86::BI__builtin_ia32_getmantsh_round_mask:
- case X86::BI__builtin_ia32_vec_set_v16qi:
- case X86::BI__builtin_ia32_vec_set_v16hi:
- i = 2; l = 0; u = 15;
- break;
- case X86::BI__builtin_ia32_vec_ext_v32qi:
- i = 1; l = 0; u = 31;
- break;
- case X86::BI__builtin_ia32_cmpps:
- case X86::BI__builtin_ia32_cmpss:
- case X86::BI__builtin_ia32_cmppd:
- case X86::BI__builtin_ia32_cmpsd:
- case X86::BI__builtin_ia32_cmpps256:
- case X86::BI__builtin_ia32_cmppd256:
- case X86::BI__builtin_ia32_cmpps128_mask:
- case X86::BI__builtin_ia32_cmppd128_mask:
- case X86::BI__builtin_ia32_cmpps256_mask:
- case X86::BI__builtin_ia32_cmppd256_mask:
- case X86::BI__builtin_ia32_cmpps512_mask:
- case X86::BI__builtin_ia32_cmppd512_mask:
- case X86::BI__builtin_ia32_cmpsd_mask:
- case X86::BI__builtin_ia32_cmpss_mask:
- case X86::BI__builtin_ia32_vec_set_v32qi:
- i = 2; l = 0; u = 31;
- break;
- case X86::BI__builtin_ia32_permdf256:
- case X86::BI__builtin_ia32_permdi256:
- case X86::BI__builtin_ia32_permdf512:
- case X86::BI__builtin_ia32_permdi512:
- case X86::BI__builtin_ia32_vpermilps:
- case X86::BI__builtin_ia32_vpermilps256:
- case X86::BI__builtin_ia32_vpermilpd512:
- case X86::BI__builtin_ia32_vpermilps512:
- case X86::BI__builtin_ia32_pshufd:
- case X86::BI__builtin_ia32_pshufd256:
- case X86::BI__builtin_ia32_pshufd512:
- case X86::BI__builtin_ia32_pshufhw:
- case X86::BI__builtin_ia32_pshufhw256:
- case X86::BI__builtin_ia32_pshufhw512:
- case X86::BI__builtin_ia32_pshuflw:
- case X86::BI__builtin_ia32_pshuflw256:
- case X86::BI__builtin_ia32_pshuflw512:
- case X86::BI__builtin_ia32_vcvtps2ph:
- case X86::BI__builtin_ia32_vcvtps2ph_mask:
- case X86::BI__builtin_ia32_vcvtps2ph256:
- case X86::BI__builtin_ia32_vcvtps2ph256_mask:
- case X86::BI__builtin_ia32_vcvtps2ph512_mask:
- case X86::BI__builtin_ia32_rndscaleps_128_mask:
- case X86::BI__builtin_ia32_rndscalepd_128_mask:
- case X86::BI__builtin_ia32_rndscaleps_256_mask:
- case X86::BI__builtin_ia32_rndscalepd_256_mask:
- case X86::BI__builtin_ia32_rndscaleps_mask:
- case X86::BI__builtin_ia32_rndscalepd_mask:
- case X86::BI__builtin_ia32_rndscaleph_mask:
- case X86::BI__builtin_ia32_reducepd128_mask:
- case X86::BI__builtin_ia32_reducepd256_mask:
- case X86::BI__builtin_ia32_reducepd512_mask:
- case X86::BI__builtin_ia32_reduceps128_mask:
- case X86::BI__builtin_ia32_reduceps256_mask:
- case X86::BI__builtin_ia32_reduceps512_mask:
- case X86::BI__builtin_ia32_reduceph128_mask:
- case X86::BI__builtin_ia32_reduceph256_mask:
- case X86::BI__builtin_ia32_reduceph512_mask:
- case X86::BI__builtin_ia32_prold512:
- case X86::BI__builtin_ia32_prolq512:
- case X86::BI__builtin_ia32_prold128:
- case X86::BI__builtin_ia32_prold256:
- case X86::BI__builtin_ia32_prolq128:
- case X86::BI__builtin_ia32_prolq256:
- case X86::BI__builtin_ia32_prord512:
- case X86::BI__builtin_ia32_prorq512:
- case X86::BI__builtin_ia32_prord128:
- case X86::BI__builtin_ia32_prord256:
- case X86::BI__builtin_ia32_prorq128:
- case X86::BI__builtin_ia32_prorq256:
- case X86::BI__builtin_ia32_fpclasspd128_mask:
- case X86::BI__builtin_ia32_fpclasspd256_mask:
- case X86::BI__builtin_ia32_fpclassps128_mask:
- case X86::BI__builtin_ia32_fpclassps256_mask:
- case X86::BI__builtin_ia32_fpclassps512_mask:
- case X86::BI__builtin_ia32_fpclasspd512_mask:
- case X86::BI__builtin_ia32_fpclassph128_mask:
- case X86::BI__builtin_ia32_fpclassph256_mask:
- case X86::BI__builtin_ia32_fpclassph512_mask:
- case X86::BI__builtin_ia32_fpclasssd_mask:
- case X86::BI__builtin_ia32_fpclassss_mask:
- case X86::BI__builtin_ia32_fpclasssh_mask:
- case X86::BI__builtin_ia32_pslldqi128_byteshift:
- case X86::BI__builtin_ia32_pslldqi256_byteshift:
- case X86::BI__builtin_ia32_pslldqi512_byteshift:
- case X86::BI__builtin_ia32_psrldqi128_byteshift:
- case X86::BI__builtin_ia32_psrldqi256_byteshift:
- case X86::BI__builtin_ia32_psrldqi512_byteshift:
- case X86::BI__builtin_ia32_kshiftliqi:
- case X86::BI__builtin_ia32_kshiftlihi:
- case X86::BI__builtin_ia32_kshiftlisi:
- case X86::BI__builtin_ia32_kshiftlidi:
- case X86::BI__builtin_ia32_kshiftriqi:
- case X86::BI__builtin_ia32_kshiftrihi:
- case X86::BI__builtin_ia32_kshiftrisi:
- case X86::BI__builtin_ia32_kshiftridi:
- i = 1; l = 0; u = 255;
- break;
- case X86::BI__builtin_ia32_vperm2f128_pd256:
- case X86::BI__builtin_ia32_vperm2f128_ps256:
- case X86::BI__builtin_ia32_vperm2f128_si256:
- case X86::BI__builtin_ia32_permti256:
- case X86::BI__builtin_ia32_pblendw128:
- case X86::BI__builtin_ia32_pblendw256:
- case X86::BI__builtin_ia32_blendps256:
- case X86::BI__builtin_ia32_pblendd256:
- case X86::BI__builtin_ia32_palignr128:
- case X86::BI__builtin_ia32_palignr256:
- case X86::BI__builtin_ia32_palignr512:
- case X86::BI__builtin_ia32_alignq512:
- case X86::BI__builtin_ia32_alignd512:
- case X86::BI__builtin_ia32_alignd128:
- case X86::BI__builtin_ia32_alignd256:
- case X86::BI__builtin_ia32_alignq128:
- case X86::BI__builtin_ia32_alignq256:
- case X86::BI__builtin_ia32_vcomisd:
- case X86::BI__builtin_ia32_vcomiss:
- case X86::BI__builtin_ia32_shuf_f32x4:
- case X86::BI__builtin_ia32_shuf_f64x2:
- case X86::BI__builtin_ia32_shuf_i32x4:
- case X86::BI__builtin_ia32_shuf_i64x2:
- case X86::BI__builtin_ia32_shufpd512:
- case X86::BI__builtin_ia32_shufps:
- case X86::BI__builtin_ia32_shufps256:
- case X86::BI__builtin_ia32_shufps512:
- case X86::BI__builtin_ia32_dbpsadbw128:
- case X86::BI__builtin_ia32_dbpsadbw256:
- case X86::BI__builtin_ia32_dbpsadbw512:
- case X86::BI__builtin_ia32_vpshldd128:
- case X86::BI__builtin_ia32_vpshldd256:
- case X86::BI__builtin_ia32_vpshldd512:
- case X86::BI__builtin_ia32_vpshldq128:
- case X86::BI__builtin_ia32_vpshldq256:
- case X86::BI__builtin_ia32_vpshldq512:
- case X86::BI__builtin_ia32_vpshldw128:
- case X86::BI__builtin_ia32_vpshldw256:
- case X86::BI__builtin_ia32_vpshldw512:
- case X86::BI__builtin_ia32_vpshrdd128:
- case X86::BI__builtin_ia32_vpshrdd256:
- case X86::BI__builtin_ia32_vpshrdd512:
- case X86::BI__builtin_ia32_vpshrdq128:
- case X86::BI__builtin_ia32_vpshrdq256:
- case X86::BI__builtin_ia32_vpshrdq512:
- case X86::BI__builtin_ia32_vpshrdw128:
- case X86::BI__builtin_ia32_vpshrdw256:
- case X86::BI__builtin_ia32_vpshrdw512:
- i = 2; l = 0; u = 255;
- break;
- case X86::BI__builtin_ia32_fixupimmpd512_mask:
- case X86::BI__builtin_ia32_fixupimmpd512_maskz:
- case X86::BI__builtin_ia32_fixupimmps512_mask:
- case X86::BI__builtin_ia32_fixupimmps512_maskz:
- case X86::BI__builtin_ia32_fixupimmsd_mask:
- case X86::BI__builtin_ia32_fixupimmsd_maskz:
- case X86::BI__builtin_ia32_fixupimmss_mask:
- case X86::BI__builtin_ia32_fixupimmss_maskz:
- case X86::BI__builtin_ia32_fixupimmpd128_mask:
- case X86::BI__builtin_ia32_fixupimmpd128_maskz:
- case X86::BI__builtin_ia32_fixupimmpd256_mask:
- case X86::BI__builtin_ia32_fixupimmpd256_maskz:
- case X86::BI__builtin_ia32_fixupimmps128_mask:
- case X86::BI__builtin_ia32_fixupimmps128_maskz:
- case X86::BI__builtin_ia32_fixupimmps256_mask:
- case X86::BI__builtin_ia32_fixupimmps256_maskz:
- case X86::BI__builtin_ia32_pternlogd512_mask:
- case X86::BI__builtin_ia32_pternlogd512_maskz:
- case X86::BI__builtin_ia32_pternlogq512_mask:
- case X86::BI__builtin_ia32_pternlogq512_maskz:
- case X86::BI__builtin_ia32_pternlogd128_mask:
- case X86::BI__builtin_ia32_pternlogd128_maskz:
- case X86::BI__builtin_ia32_pternlogd256_mask:
- case X86::BI__builtin_ia32_pternlogd256_maskz:
- case X86::BI__builtin_ia32_pternlogq128_mask:
- case X86::BI__builtin_ia32_pternlogq128_maskz:
- case X86::BI__builtin_ia32_pternlogq256_mask:
- case X86::BI__builtin_ia32_pternlogq256_maskz:
- case X86::BI__builtin_ia32_vsm3rnds2:
- i = 3; l = 0; u = 255;
- break;
- case X86::BI__builtin_ia32_gatherpfdpd:
- case X86::BI__builtin_ia32_gatherpfdps:
- case X86::BI__builtin_ia32_gatherpfqpd:
- case X86::BI__builtin_ia32_gatherpfqps:
- case X86::BI__builtin_ia32_scatterpfdpd:
- case X86::BI__builtin_ia32_scatterpfdps:
- case X86::BI__builtin_ia32_scatterpfqpd:
- case X86::BI__builtin_ia32_scatterpfqps:
- i = 4; l = 2; u = 3;
- break;
- case X86::BI__builtin_ia32_reducesd_mask:
- case X86::BI__builtin_ia32_reducess_mask:
- case X86::BI__builtin_ia32_rndscalesd_round_mask:
- case X86::BI__builtin_ia32_rndscaless_round_mask:
- case X86::BI__builtin_ia32_rndscalesh_round_mask:
- case X86::BI__builtin_ia32_reducesh_mask:
- i = 4; l = 0; u = 255;
- break;
- case X86::BI__builtin_ia32_cmpccxadd32:
- case X86::BI__builtin_ia32_cmpccxadd64:
- i = 3; l = 0; u = 15;
- break;
- }
-
- // Note that we don't force a hard error on the range check here, allowing
- // template-generated or macro-generated dead code to potentially have
- // out-of-range values. Such code still needs to code-generate, but it does not
- // necessarily need to make sense. We use a warning that defaults to an error.
- return BuiltinConstantArgRange(TheCall, i, l, u, /*RangeIsError*/ false);
-}
-
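The long switch above feeds a single BuiltinConstantArgRange call, so an out-of-range immediate is reported with a warning that defaults to an error rather than a hard error. As one illustrative case from the table, _mm_prefetch's hint operand is checked against [0, 7] (SSE assumed; the wrapper function name is invented):

    #include <immintrin.h>

    void warm(const void *p) {
      _mm_prefetch(static_cast<const char *>(p), _MM_HINT_T0); // hint 3: in range
      // _mm_prefetch(static_cast<const char *>(p), 9);        // out of [0, 7]:
      // diagnosed, but only with the warning that defaults to an error.
    }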
/// Given a FunctionDecl's FormatAttr, attempts to populate the FomatStringInfo
/// parameter with the FormatAttr's correct format_idx and firstDataArg.
/// Returns true when the format fits the function and the FormatStringInfo has
@@ -9302,7 +7584,7 @@ ExprResult Sema::BuiltinNontemporalOverloaded(ExprResult TheCallResult) {
unsigned numArgs = isStore ? 2 : 1;
// Ensure that we have the proper number of arguments.
- if (checkArgCount(*this, TheCall, numArgs))
+ if (checkArgCount(TheCall, numArgs))
return ExprError();
// Inspect the last argument of the nontemporal builtin. This should always
@@ -9467,7 +7749,7 @@ bool Sema::BuiltinVAStart(unsigned BuiltinID, CallExpr *TheCall) {
// In C23 mode, va_start only needs one argument. However, the builtin still
// requires two arguments (which matches the behavior of the GCC builtin),
// <stdarg.h> passes `0` as the second argument in C23 mode.
- if (checkArgCount(*this, TheCall, 2))
+ if (checkArgCount(TheCall, 2))
return true;
// Type-check the first argument normally.
@@ -9598,7 +7880,7 @@ bool Sema::BuiltinVAStartARMMicrosoft(CallExpr *Call) {
/// BuiltinUnorderedCompare - Handle functions like __builtin_isgreater and
/// friends. This is declared to take (...), so we have to check everything.
bool Sema::BuiltinUnorderedCompare(CallExpr *TheCall, unsigned BuiltinID) {
- if (checkArgCount(*this, TheCall, 2))
+ if (checkArgCount(TheCall, 2))
return true;
if (BuiltinID == Builtin::BI__builtin_isunordered &&
@@ -9642,7 +7924,7 @@ bool Sema::BuiltinUnorderedCompare(CallExpr *TheCall, unsigned BuiltinID) {
/// to check everything.
bool Sema::BuiltinFPClassification(CallExpr *TheCall, unsigned NumArgs,
unsigned BuiltinID) {
- if (checkArgCount(*this, TheCall, NumArgs))
+ if (checkArgCount(TheCall, NumArgs))
return true;
FPOptions FPO = TheCall->getFPFeaturesInEffect(getLangOpts());
@@ -9727,7 +8009,7 @@ bool Sema::BuiltinFPClassification(CallExpr *TheCall, unsigned NumArgs,
/// Perform semantic analysis for a call to __builtin_complex.
bool Sema::BuiltinComplex(CallExpr *TheCall) {
- if (checkArgCount(*this, TheCall, 2))
+ if (checkArgCount(TheCall, 2))
return true;
bool Dependent = false;
@@ -9789,7 +8071,7 @@ bool Sema::BuiltinComplex(CallExpr *TheCall) {
// vector short vec_xxsldwi(vector short, vector short, int);
bool Sema::BuiltinVSX(CallExpr *TheCall) {
unsigned ExpectedNumArgs = 3;
- if (checkArgCount(*this, TheCall, ExpectedNumArgs))
+ if (checkArgCount(TheCall, ExpectedNumArgs))
return true;
// Check the third argument is a compile time constant
@@ -9976,7 +8258,7 @@ bool Sema::BuiltinArithmeticFence(CallExpr *TheCall) {
if (!Context.getTargetInfo().checkArithmeticFenceSupported())
return Diag(TheCall->getBeginLoc(), diag::err_builtin_target_unsupported)
<< SourceRange(TheCall->getBeginLoc(), TheCall->getEndLoc());
- if (checkArgCount(*this, TheCall, 1))
+ if (checkArgCount(TheCall, 1))
return true;
Expr *Arg = TheCall->getArg(0);
if (Arg->isInstantiationDependent())
@@ -10046,7 +8328,7 @@ bool Sema::BuiltinAllocaWithAlign(CallExpr *TheCall) {
/// Handle __builtin_assume_aligned. This is declared
/// as (const void*, size_t, ...) and can take one optional constant int arg.
bool Sema::BuiltinAssumeAligned(CallExpr *TheCall) {
- if (checkArgCountRange(*this, TheCall, 2, 3))
+ if (checkArgCountRange(TheCall, 2, 3))
return true;
unsigned NumArgs = TheCall->getNumArgs();
@@ -10349,7 +8631,7 @@ bool Sema::BuiltinConstantArgShiftedByteOrXXFF(CallExpr *TheCall, int ArgNum,
/// BuiltinARMMemoryTaggingCall - Handle calls of memory tagging extensions
bool Sema::BuiltinARMMemoryTaggingCall(unsigned BuiltinID, CallExpr *TheCall) {
if (BuiltinID == AArch64::BI__builtin_arm_irg) {
- if (checkArgCount(*this, TheCall, 2))
+ if (checkArgCount(TheCall, 2))
return true;
Expr *Arg0 = TheCall->getArg(0);
Expr *Arg1 = TheCall->getArg(1);
@@ -10377,7 +8659,7 @@ bool Sema::BuiltinARMMemoryTaggingCall(unsigned BuiltinID, CallExpr *TheCall) {
}
if (BuiltinID == AArch64::BI__builtin_arm_addg) {
- if (checkArgCount(*this, TheCall, 2))
+ if (checkArgCount(TheCall, 2))
return true;
Expr *Arg0 = TheCall->getArg(0);
@@ -10398,7 +8680,7 @@ bool Sema::BuiltinARMMemoryTaggingCall(unsigned BuiltinID, CallExpr *TheCall) {
}
if (BuiltinID == AArch64::BI__builtin_arm_gmi) {
- if (checkArgCount(*this, TheCall, 2))
+ if (checkArgCount(TheCall, 2))
return true;
Expr *Arg0 = TheCall->getArg(0);
Expr *Arg1 = TheCall->getArg(1);
@@ -10421,7 +8703,7 @@ bool Sema::BuiltinARMMemoryTaggingCall(unsigned BuiltinID, CallExpr *TheCall) {
if (BuiltinID == AArch64::BI__builtin_arm_ldg ||
BuiltinID == AArch64::BI__builtin_arm_stg) {
- if (checkArgCount(*this, TheCall, 1))
+ if (checkArgCount(TheCall, 1))
return true;
Expr *Arg0 = TheCall->getArg(0);
ExprResult FirstArg = DefaultFunctionArrayLvalueConversion(Arg0);
@@ -10694,7 +8976,7 @@ bool Sema::BuiltinPPCMMACall(CallExpr *TheCall, unsigned BuiltinID,
(void) DecodePPCMMATypeFromStr(Context, TypeStr, Mask);
ArgNum++;
}
- if (checkArgCount(*this, TheCall, ArgNum))
+ if (checkArgCount(TheCall, ArgNum))
return true;
return false;
@@ -19706,7 +17988,7 @@ void Sema::CheckAddressOfPackedMember(Expr *rhs) {
}
bool Sema::PrepareBuiltinElementwiseMathOneArgCall(CallExpr *TheCall) {
- if (checkArgCount(*this, TheCall, 1))
+ if (checkArgCount(TheCall, 1))
return true;
ExprResult A = UsualUnaryConversions(TheCall->getArg(0));
@@ -19745,7 +18027,7 @@ bool Sema::BuiltinVectorToScalarMath(CallExpr *TheCall) {
}
bool Sema::BuiltinVectorMath(CallExpr *TheCall, QualType &Res) {
- if (checkArgCount(*this, TheCall, 2))
+ if (checkArgCount(TheCall, 2))
return true;
ExprResult A = TheCall->getArg(0);
@@ -19774,7 +18056,7 @@ bool Sema::BuiltinVectorMath(CallExpr *TheCall, QualType &Res) {
bool Sema::BuiltinElementwiseTernaryMath(CallExpr *TheCall,
bool CheckForFloatArgs) {
- if (checkArgCount(*this, TheCall, 3))
+ if (checkArgCount(TheCall, 3))
return true;
Expr *Args[3];
@@ -19817,7 +18099,7 @@ bool Sema::BuiltinElementwiseTernaryMath(CallExpr *TheCall,
}
bool Sema::PrepareBuiltinReduceMathOneArgCall(CallExpr *TheCall) {
- if (checkArgCount(*this, TheCall, 1))
+ if (checkArgCount(TheCall, 1))
return true;
ExprResult A = UsualUnaryConversions(TheCall->getArg(0));
@@ -19829,7 +18111,7 @@ bool Sema::PrepareBuiltinReduceMathOneArgCall(CallExpr *TheCall) {
}
bool Sema::BuiltinNonDeterministicValue(CallExpr *TheCall) {
- if (checkArgCount(*this, TheCall, 1))
+ if (checkArgCount(TheCall, 1))
return true;
ExprResult Arg = TheCall->getArg(0);
@@ -19845,7 +18127,7 @@ bool Sema::BuiltinNonDeterministicValue(CallExpr *TheCall) {
ExprResult Sema::BuiltinMatrixTranspose(CallExpr *TheCall,
ExprResult CallResult) {
- if (checkArgCount(*this, TheCall, 1))
+ if (checkArgCount(TheCall, 1))
return ExprError();
ExprResult MatrixArg = DefaultLvalueConversion(TheCall->getArg(0));
@@ -19900,7 +18182,7 @@ ExprResult Sema::BuiltinMatrixColumnMajorLoad(CallExpr *TheCall,
return ExprError();
}
- if (checkArgCount(*this, TheCall, 4))
+ if (checkArgCount(TheCall, 4))
return ExprError();
unsigned PtrArgIdx = 0;
@@ -20011,7 +18293,7 @@ ExprResult Sema::BuiltinMatrixColumnMajorLoad(CallExpr *TheCall,
ExprResult Sema::BuiltinMatrixColumnMajorStore(CallExpr *TheCall,
ExprResult CallResult) {
- if (checkArgCount(*this, TheCall, 3))
+ if (checkArgCount(TheCall, 3))
return ExprError();
unsigned PtrArgIdx = 1;
@@ -20137,7 +18419,7 @@ static bool CheckWasmBuiltinArgIsInteger(Sema &S, CallExpr *E,
/// Check that the first argument is a WebAssembly table, and the second
/// is an index to use as index into the table.
bool Sema::BuiltinWasmTableGet(CallExpr *TheCall) {
- if (checkArgCount(*this, TheCall, 2))
+ if (checkArgCount(TheCall, 2))
return true;
QualType ElTy;
@@ -20160,7 +18442,7 @@ bool Sema::BuiltinWasmTableGet(CallExpr *TheCall) {
/// an index to use as index into the table and the third is the reference
/// type to set into the table.
bool Sema::BuiltinWasmTableSet(CallExpr *TheCall) {
- if (checkArgCount(*this, TheCall, 3))
+ if (checkArgCount(TheCall, 3))
return true;
QualType ElTy;
@@ -20178,7 +18460,7 @@ bool Sema::BuiltinWasmTableSet(CallExpr *TheCall) {
/// Check that the argument is a WebAssembly table.
bool Sema::BuiltinWasmTableSize(CallExpr *TheCall) {
- if (checkArgCount(*this, TheCall, 1))
+ if (checkArgCount(TheCall, 1))
return true;
QualType ElTy;
@@ -20192,7 +18474,7 @@ bool Sema::BuiltinWasmTableSize(CallExpr *TheCall) {
/// value to use for new elements (of a type matching the table type), the
/// third value is an integer.
bool Sema::BuiltinWasmTableGrow(CallExpr *TheCall) {
- if (checkArgCount(*this, TheCall, 3))
+ if (checkArgCount(TheCall, 3))
return true;
QualType ElTy;
@@ -20216,7 +18498,7 @@ bool Sema::BuiltinWasmTableGrow(CallExpr *TheCall) {
/// integer, the third is the value to use to fill the table (of a type
/// matching the table type), and the fourth is an integer.
bool Sema::BuiltinWasmTableFill(CallExpr *TheCall) {
- if (checkArgCount(*this, TheCall, 4))
+ if (checkArgCount(TheCall, 4))
return true;
QualType ElTy;
@@ -20243,7 +18525,7 @@ bool Sema::BuiltinWasmTableFill(CallExpr *TheCall) {
/// WebAssembly table (of the same element type), and the third to fifth
/// arguments are integers.
bool Sema::BuiltinWasmTableCopy(CallExpr *TheCall) {
- if (checkArgCount(*this, TheCall, 5))
+ if (checkArgCount(TheCall, 5))
return true;
QualType XElTy;
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 6764a97..2a87b26 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -50,6 +50,7 @@
#include "clang/Sema/SemaInternal.h"
#include "clang/Sema/SemaObjC.h"
#include "clang/Sema/SemaOpenMP.h"
+#include "clang/Sema/SemaRISCV.h"
#include "clang/Sema/Template.h"
#include "llvm/ADT/STLForwardCompat.h"
#include "llvm/ADT/SmallString.h"
@@ -4985,7 +4986,7 @@ void Sema::setTagNameForLinkagePurposes(TagDecl *TagFromDeclSpec,
if (TagFromDeclSpec->hasNameForLinkage())
return;
- // A well-formed anonymous tag must always be a TUK_Definition.
+ // A well-formed anonymous tag must always be a TagUseKind::Definition.
assert(TagFromDeclSpec->isThisDeclarationADefinition());
// The type must match the tag exactly; no qualifiers allowed.
@@ -8926,8 +8927,8 @@ void Sema::CheckVariableDeclarationType(VarDecl *NewVD) {
const FunctionDecl *FD = cast<FunctionDecl>(CurContext);
llvm::StringMap<bool> CallerFeatureMap;
Context.getFunctionFeatureMap(CallerFeatureMap, FD);
- checkRVVTypeSupport(T, NewVD->getLocation(), cast<Decl>(CurContext),
- CallerFeatureMap);
+ RISCV().checkRVVTypeSupport(T, NewVD->getLocation(), cast<Decl>(CurContext),
+ CallerFeatureMap);
}
}
@@ -11867,8 +11868,8 @@ static bool CheckMultiVersionFunction(Sema &S, FunctionDecl *NewFD,
return false;
if (!OldDecl || !OldDecl->getAsFunction() ||
- OldDecl->getDeclContext()->getRedeclContext() !=
- NewFD->getDeclContext()->getRedeclContext()) {
+ !OldDecl->getDeclContext()->getRedeclContext()->Equals(
+ NewFD->getDeclContext()->getRedeclContext())) {
// If there's no previous declaration, AND this isn't attempting to cause
// multiversioning, this isn't an error condition.
if (MVKind == MultiVersionKind::None)
@@ -17238,9 +17239,9 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
OffsetOfKind OOK, SkipBodyInfo *SkipBody) {
// If this is not a definition, it must have a name.
IdentifierInfo *OrigName = Name;
- assert((Name != nullptr || TUK == TUK_Definition) &&
+ assert((Name != nullptr || TUK == TagUseKind::Definition) &&
"Nameless record must be a definition!");
- assert(TemplateParameterLists.size() == 0 || TUK != TUK_Reference);
+ assert(TemplateParameterLists.size() == 0 || TUK != TagUseKind::Reference);
OwnedDecl = false;
TagTypeKind Kind = TypeWithKeyword::getTagTypeKindForTypeSpec(TagSpec);
@@ -17254,11 +17255,11 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
// or a scope specifier, which also conveniently avoids this work
// for non-C++ cases.
if (TemplateParameterLists.size() > 0 ||
- (SS.isNotEmpty() && TUK != TUK_Reference)) {
+ (SS.isNotEmpty() && TUK != TagUseKind::Reference)) {
TemplateParameterList *TemplateParams =
MatchTemplateParametersToScopeSpecifier(
KWLoc, NameLoc, SS, nullptr, TemplateParameterLists,
- TUK == TUK_Friend, isMemberSpecialization, Invalid);
+ TUK == TagUseKind::Friend, isMemberSpecialization, Invalid);
// C++23 [dcl.type.elab] p2:
// If an elaborated-type-specifier is the sole constituent of a
@@ -17273,7 +17274,8 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
// FIXME: Class template partial specializations can be forward declared
// per CWG2213, but the resolution failed to allow qualified forward
// declarations. This is almost certainly unintentional, so we allow them.
- if (TUK == TUK_Declaration && SS.isNotEmpty() && !isMemberSpecialization)
+ if (TUK == TagUseKind::Declaration && SS.isNotEmpty() &&
+ !isMemberSpecialization)
Diag(SS.getBeginLoc(), diag::err_standalone_class_nested_name_specifier)
<< TypeWithKeyword::getTagTypeKindName(Kind) << SS.getRange();
@@ -17310,7 +17312,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
return true;
}
- if (TUK == TUK_Friend && Kind == TagTypeKind::Enum) {
+ if (TUK == TagUseKind::Friend && Kind == TagTypeKind::Enum) {
// C++23 [dcl.type.elab]p4:
// If an elaborated-type-specifier appears with the friend specifier as
// an entire member-declaration, the member-declaration shall have one
@@ -17361,7 +17363,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
// of 'int'. However, if this is an unfixed forward declaration, don't set
// the underlying type unless the user enables -fms-compatibility. This
// makes unfixed forward declared enums incomplete and is more conforming.
- if (TUK == TUK_Definition || getLangOpts().MSVCCompat)
+ if (TUK == TagUseKind::Definition || getLangOpts().MSVCCompat)
EnumUnderlying = Context.IntTy.getTypePtr();
}
}
@@ -17372,7 +17374,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
bool isStdAlignValT = false;
RedeclarationKind Redecl = forRedeclarationInCurContext();
- if (TUK == TUK_Friend || TUK == TUK_Reference)
+ if (TUK == TagUseKind::Friend || TUK == TagUseKind::Reference)
Redecl = RedeclarationKind::NotForRedeclaration;
/// Create a new tag decl in C/ObjC. Since the ODR-like semantics for ObjC/C
@@ -17390,7 +17392,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
New = EnumDecl::Create(Context, SearchDC, KWLoc, Loc, Name, nullptr,
ScopedEnum, ScopedEnumUsesClassTag, IsFixed);
// If this is an undefined enum, bail.
- if (TUK != TUK_Definition && !Invalid)
+ if (TUK != TagUseKind::Definition && !Invalid)
return nullptr;
if (EnumUnderlying) {
EnumDecl *ED = cast<EnumDecl>(New);
@@ -17418,7 +17420,8 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
// many points during the parsing of a struct declaration (because
// the #pragma tokens are effectively skipped over during the
// parsing of the struct).
- if (TUK == TUK_Definition && (!SkipBody || !SkipBody->ShouldSkip)) {
+ if (TUK == TagUseKind::Definition &&
+ (!SkipBody || !SkipBody->ShouldSkip)) {
AddAlignmentAttributesForRecord(RD);
AddMsStructLayoutForRecord(RD);
}
@@ -17439,7 +17442,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
// If this is a friend or a reference to a class in a dependent
// context, don't try to make a decl for it.
- if (TUK == TUK_Friend || TUK == TUK_Reference) {
+ if (TUK == TagUseKind::Friend || TUK == TagUseKind::Reference) {
DC = computeDeclContext(SS, false);
if (!DC) {
IsDependent = true;
@@ -17472,7 +17475,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
// this as a dependent elaborated-type-specifier.
// But this only makes any sense for reference-like lookups.
if (Previous.wasNotFoundInCurrentInstantiation() &&
- (TUK == TUK_Reference || TUK == TUK_Friend)) {
+ (TUK == TagUseKind::Reference || TUK == TagUseKind::Friend)) {
IsDependent = true;
return true;
}
@@ -17489,7 +17492,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
// If T is the name of a class, then each of the following shall have a
// name different from T:
// -- every member of class T that is itself a type
- if (TUK != TUK_Reference && TUK != TUK_Friend &&
+ if (TUK != TagUseKind::Reference && TUK != TagUseKind::Friend &&
DiagnoseClassNameShadow(SearchDC, DeclarationNameInfo(Name, NameLoc)))
return true;
@@ -17503,7 +17506,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
// When declaring or defining a tag, ignore ambiguities introduced
// by types using'ed into this scope.
if (Previous.isAmbiguous() &&
- (TUK == TUK_Definition || TUK == TUK_Declaration)) {
+ (TUK == TagUseKind::Definition || TUK == TagUseKind::Declaration)) {
LookupResult::Filter F = Previous.makeFilter();
while (F.hasNext()) {
NamedDecl *ND = F.next();
@@ -17527,7 +17530,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
//
// Does it matter that this should be by scope instead of by
// semantic context?
- if (!Previous.empty() && TUK == TUK_Friend) {
+ if (!Previous.empty() && TUK == TagUseKind::Friend) {
DeclContext *EnclosingNS = SearchDC->getEnclosingNamespaceContext();
LookupResult::Filter F = Previous.makeFilter();
bool FriendSawTagOutsideEnclosingNamespace = false;
@@ -17557,7 +17560,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
if (Previous.isAmbiguous())
return true;
- if (!getLangOpts().CPlusPlus && TUK != TUK_Reference) {
+ if (!getLangOpts().CPlusPlus && TUK != TagUseKind::Reference) {
// FIXME: This makes sure that we ignore the contexts associated
// with C structs, unions, and enums when looking for a matching
// tag declaration or definition. See the similar lookup tweak
@@ -17609,11 +17612,12 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
// also need to do a redeclaration lookup there, just in case
// there's a shadow friend decl.
if (Name && Previous.empty() &&
- (TUK == TUK_Reference || TUK == TUK_Friend || IsTemplateParamOrArg)) {
+ (TUK == TagUseKind::Reference || TUK == TagUseKind::Friend ||
+ IsTemplateParamOrArg)) {
if (Invalid) goto CreateNewDecl;
assert(SS.isEmpty());
- if (TUK == TUK_Reference || IsTemplateParamOrArg) {
+ if (TUK == TagUseKind::Reference || IsTemplateParamOrArg) {
// C++ [basic.scope.pdecl]p5:
// -- for an elaborated-type-specifier of the form
//
@@ -17647,7 +17651,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
// Find the scope where we'll be declaring the tag.
S = getTagInjectionScope(S, getLangOpts());
} else {
- assert(TUK == TUK_Friend);
+ assert(TUK == TagUseKind::Friend);
CXXRecordDecl *RD = dyn_cast<CXXRecordDecl>(SearchDC);
// C++ [namespace.memdef]p3:
@@ -17712,7 +17716,8 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
// redefinition if either context is within the other.
if (auto *Shadow = dyn_cast<UsingShadowDecl>(DirectPrevDecl)) {
auto *OldTag = dyn_cast<TagDecl>(PrevDecl);
- if (SS.isEmpty() && TUK != TUK_Reference && TUK != TUK_Friend &&
+ if (SS.isEmpty() && TUK != TagUseKind::Reference &&
+ TUK != TagUseKind::Friend &&
isDeclInScope(Shadow, SearchDC, S, isMemberSpecialization) &&
!(OldTag && isAcceptableTagRedeclContext(
*this, OldTag->getDeclContext(), SearchDC))) {
@@ -17731,13 +17736,13 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
// If this is a use of a previous tag, or if the tag is already declared
// in the same scope (so that the definition/declaration completes or
// rementions the tag), reuse the decl.
- if (TUK == TUK_Reference || TUK == TUK_Friend ||
+ if (TUK == TagUseKind::Reference || TUK == TagUseKind::Friend ||
isDeclInScope(DirectPrevDecl, SearchDC, S,
SS.isNotEmpty() || isMemberSpecialization)) {
// Make sure that this wasn't declared as an enum and now used as a
// struct or something similar.
if (!isAcceptableTagRedeclaration(PrevTagDecl, Kind,
- TUK == TUK_Definition, KWLoc,
+ TUK == TagUseKind::Definition, KWLoc,
Name)) {
bool SafeToContinue =
(PrevTagDecl->getTagKind() != TagTypeKind::Enum &&
@@ -17764,7 +17769,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
if (Kind == TagTypeKind::Enum &&
PrevTagDecl->getTagKind() == TagTypeKind::Enum) {
const EnumDecl *PrevEnum = cast<EnumDecl>(PrevTagDecl);
- if (TUK == TUK_Reference || TUK == TUK_Friend)
+ if (TUK == TagUseKind::Reference || TUK == TagUseKind::Friend)
return PrevTagDecl;
QualType EnumUnderlyingTy;
@@ -17779,14 +17784,14 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
if (CheckEnumRedeclaration(NameLoc.isValid() ? NameLoc : KWLoc,
ScopedEnum, EnumUnderlyingTy,
IsFixed, PrevEnum))
- return TUK == TUK_Declaration ? PrevTagDecl : nullptr;
+ return TUK == TagUseKind::Declaration ? PrevTagDecl : nullptr;
}
// C++11 [class.mem]p1:
// A member shall not be declared twice in the member-specification,
// except that a nested class or member class template can be declared
// and then later defined.
- if (TUK == TUK_Declaration && PrevDecl->isCXXClassMember() &&
+ if (TUK == TagUseKind::Declaration && PrevDecl->isCXXClassMember() &&
S->isDeclScope(PrevDecl)) {
Diag(NameLoc, diag::ext_member_redeclared);
Diag(PrevTagDecl->getLocation(), diag::note_previous_declaration);
@@ -17795,11 +17800,11 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
if (!Invalid) {
// If this is a use, just return the declaration we found, unless
// we have attributes.
- if (TUK == TUK_Reference || TUK == TUK_Friend) {
+ if (TUK == TagUseKind::Reference || TUK == TagUseKind::Friend) {
if (!Attrs.empty()) {
// FIXME: Diagnose these attributes. For now, we create a new
// declaration to hold them.
- } else if (TUK == TUK_Reference &&
+ } else if (TUK == TagUseKind::Reference &&
(PrevTagDecl->getFriendObjectKind() ==
Decl::FOK_Undeclared ||
PrevDecl->getOwningModule() != getCurrentModule()) &&
@@ -17823,7 +17828,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
}
// Diagnose attempts to redefine a tag.
- if (TUK == TUK_Definition) {
+ if (TUK == TagUseKind::Definition) {
if (NamedDecl *Def = PrevTagDecl->getDefinition()) {
// If we're defining a specialization and the previous definition
// is from an implicit instantiation, don't emit an error
@@ -17903,7 +17908,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
// Okay, we're going to make a redeclaration. If this is some kind
// of reference, make sure we build the redeclaration in the same DC
// as the original, and ignore the current access specifier.
- if (TUK == TUK_Friend || TUK == TUK_Reference) {
+ if (TUK == TagUseKind::Friend || TUK == TagUseKind::Reference) {
SearchDC = PrevTagDecl->getDeclContext();
AS = AS_none;
}
@@ -17929,7 +17934,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
// Use a better diagnostic if an elaborated-type-specifier
// found the wrong kind of type on the first
// (non-redeclaration) lookup.
- if ((TUK == TUK_Reference || TUK == TUK_Friend) &&
+ if ((TUK == TagUseKind::Reference || TUK == TagUseKind::Friend) &&
!Previous.isForRedeclaration()) {
NonTagKind NTK = getNonTagTypeDeclKind(PrevDecl, Kind);
Diag(NameLoc, diag::err_tag_reference_non_tag)
@@ -17943,7 +17948,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
// do nothing
// Diagnose implicit declarations introduced by elaborated types.
- } else if (TUK == TUK_Reference || TUK == TUK_Friend) {
+ } else if (TUK == TagUseKind::Reference || TUK == TagUseKind::Friend) {
NonTagKind NTK = getNonTagTypeDeclKind(PrevDecl, Kind);
Diag(NameLoc, diag::err_tag_reference_conflict) << NTK;
Diag(PrevDecl->getLocation(), diag::note_previous_decl) << PrevDecl;
@@ -18002,7 +18007,7 @@ CreateNewDecl:
StdAlignValT = cast<EnumDecl>(New);
// If this is an undefined enum, warn.
- if (TUK != TUK_Definition && !Invalid) {
+ if (TUK != TagUseKind::Definition && !Invalid) {
TagDecl *Def;
if (IsFixed && cast<EnumDecl>(New)->isFixed()) {
// C++0x: 7.2p2: opaque-enum-declaration.
@@ -18052,21 +18057,22 @@ CreateNewDecl:
}
// Only C23 and later allow defining new types in 'offsetof()'.
- if (OOK != OOK_Outside && TUK == TUK_Definition && !getLangOpts().CPlusPlus &&
- !getLangOpts().C23)
+ if (OOK != OOK_Outside && TUK == TagUseKind::Definition &&
+ !getLangOpts().CPlusPlus && !getLangOpts().C23)
Diag(New->getLocation(), diag::ext_type_defined_in_offsetof)
<< (OOK == OOK_Macro) << New->getSourceRange();
// C++11 [dcl.type]p3:
// A type-specifier-seq shall not define a class or enumeration [...].
if (!Invalid && getLangOpts().CPlusPlus &&
- (IsTypeSpecifier || IsTemplateParamOrArg) && TUK == TUK_Definition) {
+ (IsTypeSpecifier || IsTemplateParamOrArg) &&
+ TUK == TagUseKind::Definition) {
Diag(New->getLocation(), diag::err_type_defined_in_type_specifier)
<< Context.getTagDeclType(New);
Invalid = true;
}
- if (!Invalid && getLangOpts().CPlusPlus && TUK == TUK_Definition &&
+ if (!Invalid && getLangOpts().CPlusPlus && TUK == TagUseKind::Definition &&
DC->getDeclKind() == Decl::Enum) {
Diag(New->getLocation(), diag::err_type_defined_in_enum)
<< Context.getTagDeclType(New);
@@ -18078,7 +18084,7 @@ CreateNewDecl:
if (SS.isSet()) {
// If this is either a declaration or a definition, check the
// nested-name-specifier against the current context.
- if ((TUK == TUK_Definition || TUK == TUK_Declaration) &&
+ if ((TUK == TagUseKind::Definition || TUK == TagUseKind::Declaration) &&
diagnoseQualifiedDeclaration(SS, DC, OrigName, Loc,
/*TemplateId=*/nullptr,
isMemberSpecialization))
@@ -18103,7 +18109,7 @@ CreateNewDecl:
// many points during the parsing of a struct declaration (because
// the #pragma tokens are effectively skipped over during the
// parsing of the struct).
- if (TUK == TUK_Definition && (!SkipBody || !SkipBody->ShouldSkip)) {
+ if (TUK == TagUseKind::Definition && (!SkipBody || !SkipBody->ShouldSkip)) {
AddAlignmentAttributesForRecord(RD);
AddMsStructLayoutForRecord(RD);
}
@@ -18134,7 +18140,7 @@ CreateNewDecl:
if (getLangOpts().CPlusPlus) {
// C++ [dcl.fct]p6:
// Types shall not be defined in return or parameter types.
- if (TUK == TUK_Definition && !IsTypeSpecifier) {
+ if (TUK == TagUseKind::Definition && !IsTypeSpecifier) {
Diag(Loc, diag::err_type_defined_in_param_type)
<< Name;
Invalid = true;
@@ -18155,7 +18161,7 @@ CreateNewDecl:
// In Microsoft mode, a friend declaration also acts as a forward
// declaration so we always pass true to setObjectOfFriendDecl to make
// the tag name visible.
- if (TUK == TUK_Friend)
+ if (TUK == TagUseKind::Friend)
New->setObjectOfFriendDecl(getLangOpts().MSVCCompat);
// Set the access specifier.
@@ -18165,14 +18171,14 @@ CreateNewDecl:
if (PrevDecl)
CheckRedeclarationInModule(New, PrevDecl);
- if (TUK == TUK_Definition && (!SkipBody || !SkipBody->ShouldSkip))
+ if (TUK == TagUseKind::Definition && (!SkipBody || !SkipBody->ShouldSkip))
New->startDefinition();
ProcessDeclAttributeList(S, New, Attrs);
AddPragmaAttributes(S, New);
// If this has an identifier, add it to the scope stack.
- if (TUK == TUK_Friend) {
+ if (TUK == TagUseKind::Friend) {
// We might be replacing an existing declaration in the lookup tables;
// if so, borrow its access specifier.
if (PrevDecl)
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index ca59380..5041fd6 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -8663,31 +8663,95 @@ static const RecordDecl *GetEnclosingNamedOrTopAnonRecord(const FieldDecl *FD) {
return RD;
}
-static bool
-CheckCountExpr(Sema &S, FieldDecl *FD, Expr *E,
- llvm::SmallVectorImpl<TypeCoupledDeclRefInfo> &Decls) {
+enum class CountedByInvalidPointeeTypeKind {
+ INCOMPLETE,
+ SIZELESS,
+ FUNCTION,
+ FLEXIBLE_ARRAY_MEMBER,
+ VALID,
+};
+
+static bool CheckCountedByAttrOnField(
+ Sema &S, FieldDecl *FD, Expr *E,
+ llvm::SmallVectorImpl<TypeCoupledDeclRefInfo> &Decls) {
+ // Check the context the attribute is used in
+
if (FD->getParent()->isUnion()) {
S.Diag(FD->getBeginLoc(), diag::err_counted_by_attr_in_union)
<< FD->getSourceRange();
return true;
}
- if (!E->getType()->isIntegerType() || E->getType()->isBooleanType()) {
- S.Diag(E->getBeginLoc(), diag::err_counted_by_attr_argument_not_integer)
- << E->getSourceRange();
+ const auto FieldTy = FD->getType();
+ if (!FieldTy->isArrayType() && !FieldTy->isPointerType()) {
+ S.Diag(FD->getBeginLoc(),
+ diag::err_counted_by_attr_not_on_ptr_or_flexible_array_member)
+ << FD->getLocation();
return true;
}
LangOptions::StrictFlexArraysLevelKind StrictFlexArraysLevel =
LangOptions::StrictFlexArraysLevelKind::IncompleteOnly;
-
- if (!Decl::isFlexibleArrayMemberLike(S.getASTContext(), FD, FD->getType(),
+ if (FieldTy->isArrayType() &&
+ !Decl::isFlexibleArrayMemberLike(S.getASTContext(), FD, FieldTy,
StrictFlexArraysLevel, true)) {
- // The "counted_by" attribute must be on a flexible array member.
- SourceRange SR = FD->getLocation();
- S.Diag(SR.getBegin(),
- diag::err_counted_by_attr_not_on_flexible_array_member)
- << SR;
+ S.Diag(FD->getBeginLoc(),
+ diag::err_counted_by_attr_on_array_not_flexible_array_member)
+ << FD->getLocation();
+ return true;
+ }
+
+ CountedByInvalidPointeeTypeKind InvalidTypeKind =
+ CountedByInvalidPointeeTypeKind::VALID;
+ QualType PointeeTy;
+ int SelectPtrOrArr = 0;
+ if (FieldTy->isPointerType()) {
+ PointeeTy = FieldTy->getPointeeType();
+ SelectPtrOrArr = 0;
+ } else {
+ assert(FieldTy->isArrayType());
+ const ArrayType *AT = S.getASTContext().getAsArrayType(FieldTy);
+ PointeeTy = AT->getElementType();
+ SelectPtrOrArr = 1;
+ }
+ // Note: The `Decl::isFlexibleArrayMemberLike` check earlier on means
+ // only `PointeeTy->isStructureTypeWithFlexibleArrayMember()` is reachable
+ // when `FieldTy->isArrayType()`.
+ bool ShouldWarn = false;
+ if (PointeeTy->isIncompleteType()) {
+ InvalidTypeKind = CountedByInvalidPointeeTypeKind::INCOMPLETE;
+ } else if (PointeeTy->isSizelessType()) {
+ InvalidTypeKind = CountedByInvalidPointeeTypeKind::SIZELESS;
+ } else if (PointeeTy->isFunctionType()) {
+ InvalidTypeKind = CountedByInvalidPointeeTypeKind::FUNCTION;
+ } else if (PointeeTy->isStructureTypeWithFlexibleArrayMember()) {
+ if (FieldTy->isArrayType()) {
+ // This is a workaround for the Linux kernel that has already adopted
+ // `counted_by` on a FAM where the pointee is a struct with a FAM. This
+ // should be an error because computing the bounds of the array cannot be
+ // done correctly without manually traversing every struct object in the
+ // array at runtime. To allow the code to be built this error is
+ // downgraded to a warning.
+ ShouldWarn = true;
+ }
+ InvalidTypeKind = CountedByInvalidPointeeTypeKind::FLEXIBLE_ARRAY_MEMBER;
+ }
+
+ if (InvalidTypeKind != CountedByInvalidPointeeTypeKind::VALID) {
+ unsigned DiagID = ShouldWarn
+ ? diag::warn_counted_by_attr_elt_type_unknown_size
+ : diag::err_counted_by_attr_pointee_unknown_size;
+ S.Diag(FD->getBeginLoc(), DiagID)
+ << SelectPtrOrArr << PointeeTy << (int)InvalidTypeKind
+ << (ShouldWarn ? 1 : 0) << FD->getSourceRange();
+ return true;
+ }
+
+ // Check the expression
+
+ if (!E->getType()->isIntegerType() || E->getType()->isBooleanType()) {
+ S.Diag(E->getBeginLoc(), diag::err_counted_by_attr_argument_not_integer)
+ << E->getSourceRange();
return true;
}
@@ -8750,10 +8814,11 @@ static void handleCountedByAttrField(Sema &S, Decl *D, const ParsedAttr &AL) {
return;
llvm::SmallVector<TypeCoupledDeclRefInfo, 1> Decls;
- if (CheckCountExpr(S, FD, CountExpr, Decls))
+ if (CheckCountedByAttrOnField(S, FD, CountExpr, Decls))
return;
- QualType CAT = S.BuildCountAttributedArrayType(FD->getType(), CountExpr);
+ QualType CAT =
+ S.BuildCountAttributedArrayOrPointerType(FD->getType(), CountExpr);
FD->setType(CAT);
}
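
The check above widens `counted_by` from flexible array members to struct pointer members whose pointee has a known size, diagnosing incomplete, sizeless, function, and FAM-containing pointee/element types (the last case only warns on arrays, for Linux-kernel compatibility). A hedged C sketch of what is now accepted and rejected; struct and field names are illustrative, not taken from any test in this patch:

struct incomplete;

struct buffer {
  int count;
  int *data __attribute__((counted_by(count)));            /* now accepted: pointer with complete pointee */
  /* struct incomplete *p __attribute__((counted_by(count)));  error: pointee has unknown size  */
  /* void (*fp)(void) __attribute__((counted_by(count)));      error: function pointee          */
};

struct msg {
  int count;
  int payload[] __attribute__((counted_by(count)));        /* still accepted: flexible array member */
};
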
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index 104e271..8ab429e 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -17580,11 +17580,12 @@ DeclResult Sema::ActOnTemplatedFriendTag(
if (Invalid)
return true;
- return CheckClassTemplate(S, TagSpec, TUK_Friend, TagLoc, SS, Name,
- NameLoc, Attr, TemplateParams, AS_public,
+ return CheckClassTemplate(S, TagSpec, TagUseKind::Friend, TagLoc, SS,
+ Name, NameLoc, Attr, TemplateParams, AS_public,
/*ModulePrivateLoc=*/SourceLocation(),
FriendLoc, TempParamLists.size() - 1,
- TempParamLists.data()).get();
+ TempParamLists.data())
+ .get();
} else {
// The "template<>" header is extraneous.
Diag(TemplateParams->getTemplateLoc(), diag::err_template_tag_noparams)
@@ -17612,8 +17613,8 @@ DeclResult Sema::ActOnTemplatedFriendTag(
if (SS.isEmpty()) {
bool Owned = false;
bool IsDependent = false;
- return ActOnTag(S, TagSpec, TUK_Friend, TagLoc, SS, Name, NameLoc, Attr,
- AS_public,
+ return ActOnTag(S, TagSpec, TagUseKind::Friend, TagLoc, SS, Name, NameLoc,
+ Attr, AS_public,
/*ModulePrivateLoc=*/SourceLocation(),
MultiTemplateParamsArg(), Owned, IsDependent,
/*ScopedEnumKWLoc=*/SourceLocation(),
@@ -17728,7 +17729,7 @@ Decl *Sema::ActOnFriendTypeDecl(Scope *S, const DeclSpec &DS,
// Try to convert the decl specifier to a type. This works for
// friend templates because ActOnTag never produces a ClassTemplateDecl
- // for a TUK_Friend.
+ // for a TagUseKind::Friend.
Declarator TheDeclarator(DS, ParsedAttributesView::none(),
DeclaratorContext::Member);
TypeSourceInfo *TSI = GetTypeForDeclarator(TheDeclarator);
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 7bb34fd..ff9c5ea 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -5185,7 +5185,7 @@ Sema::CreateBuiltinArraySubscriptExpr(Expr *Base, SourceLocation LLoc,
}
// Perform default conversions.
- if (!LHSExp->getType()->getAs<VectorType>()) {
+ if (!LHSExp->getType()->isSubscriptableVectorType()) {
ExprResult Result = DefaultFunctionArrayLvalueConversion(LHSExp);
if (Result.isInvalid())
return ExprError();
@@ -5241,36 +5241,22 @@ Sema::CreateBuiltinArraySubscriptExpr(Expr *Base, SourceLocation LLoc,
<< ResultType << BaseExpr->getSourceRange();
return ExprError();
}
- } else if (const VectorType *VTy = LHSTy->getAs<VectorType>()) {
- BaseExpr = LHSExp; // vectors: V[123]
- IndexExpr = RHSExp;
- // We apply C++ DR1213 to vector subscripting too.
- if (getLangOpts().CPlusPlus11 && LHSExp->isPRValue()) {
- ExprResult Materialized = TemporaryMaterializationConversion(LHSExp);
- if (Materialized.isInvalid())
- return ExprError();
- LHSExp = Materialized.get();
+ } else if (LHSTy->isSubscriptableVectorType()) {
+ if (LHSTy->isBuiltinType() &&
+ LHSTy->getAs<BuiltinType>()->isSveVLSBuiltinType()) {
+ const BuiltinType *BTy = LHSTy->getAs<BuiltinType>();
+ if (BTy->isSVEBool())
+ return ExprError(Diag(LLoc, diag::err_subscript_svbool_t)
+ << LHSExp->getSourceRange()
+ << RHSExp->getSourceRange());
+ ResultType = BTy->getSveEltType(Context);
+ } else {
+ const VectorType *VTy = LHSTy->getAs<VectorType>();
+ ResultType = VTy->getElementType();
}
- VK = LHSExp->getValueKind();
- if (VK != VK_PRValue)
- OK = OK_VectorComponent;
-
- ResultType = VTy->getElementType();
- QualType BaseType = BaseExpr->getType();
- Qualifiers BaseQuals = BaseType.getQualifiers();
- Qualifiers MemberQuals = ResultType.getQualifiers();
- Qualifiers Combined = BaseQuals + MemberQuals;
- if (Combined != MemberQuals)
- ResultType = Context.getQualifiedType(ResultType, Combined);
- } else if (LHSTy->isBuiltinType() &&
- LHSTy->getAs<BuiltinType>()->isSveVLSBuiltinType()) {
- const BuiltinType *BTy = LHSTy->getAs<BuiltinType>();
- if (BTy->isSVEBool())
- return ExprError(Diag(LLoc, diag::err_subscript_svbool_t)
- << LHSExp->getSourceRange() << RHSExp->getSourceRange());
-
- BaseExpr = LHSExp;
+ BaseExpr = LHSExp; // vectors: V[123]
IndexExpr = RHSExp;
+ // We apply C++ DR1213 to vector subscripting too.
if (getLangOpts().CPlusPlus11 && LHSExp->isPRValue()) {
ExprResult Materialized = TemporaryMaterializationConversion(LHSExp);
if (Materialized.isInvalid())
@@ -5281,8 +5267,6 @@ Sema::CreateBuiltinArraySubscriptExpr(Expr *Base, SourceLocation LLoc,
if (VK != VK_PRValue)
OK = OK_VectorComponent;
- ResultType = BTy->getSveEltType(Context);
-
QualType BaseType = BaseExpr->getType();
Qualifiers BaseQuals = BaseType.getQualifiers();
Qualifiers MemberQuals = ResultType.getQualifiers();
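
This hunk folds the SVE and generic vector subscript paths into a single isSubscriptableVectorType() branch; the intent is a pure refactor, with the svbool_t restriction preserved. A hedged sketch of the user-visible behavior (assumes an AArch64 target with SVE available; function names are illustrative):

#include <arm_sve.h>
#include <cstdint>

// Element reads on scalable data vectors are accepted...
int32_t first_lane(svint32_t v) { return v[0]; }

// ...while predicate vectors remain non-subscriptable:
//   bool bad(svbool_t p) { return p[0]; }   // error: subscript of svbool_t is rejected
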
@@ -5579,10 +5563,9 @@ ExprResult Sema::BuildCXXDefaultArgExpr(SourceLocation CallLoc,
Res = Immediate.TransformInitializer(Param->getInit(),
/*NotCopy=*/false);
});
- if (Res.isInvalid())
- return ExprError();
- Res = ConvertParamDefaultArgument(Param, Res.get(),
- Res.get()->getBeginLoc());
+ if (Res.isUsable())
+ Res = ConvertParamDefaultArgument(Param, Res.get(),
+ Res.get()->getBeginLoc());
if (Res.isInvalid())
return ExprError();
Init = Res.get();
@@ -5616,9 +5599,10 @@ ExprResult Sema::BuildCXXDefaultInitExpr(SourceLocation Loc, FieldDecl *Field) {
InitializationContext.emplace(Loc, Field, CurContext);
Expr *Init = nullptr;
+ bool HasRewrittenInit = false;
bool NestedDefaultChecking = isCheckingDefaultArgumentOrInitializer();
-
+ bool InLifetimeExtendingContext = isInLifetimeExtendingContext();
EnterExpressionEvaluationContext EvalContext(
*this, ExpressionEvaluationContext::PotentiallyEvaluated, Field);
@@ -5653,19 +5637,36 @@ ExprResult Sema::BuildCXXDefaultInitExpr(SourceLocation Loc, FieldDecl *Field) {
ImmediateCallVisitor V(getASTContext());
if (!NestedDefaultChecking)
V.TraverseDecl(Field);
- if (V.HasImmediateCalls) {
+
+ // CWG1815
+  // Support lifetime extension of temporaries created by aggregate

+ // initialization using a default member initializer. We should always rebuild
+ // the initializer if it contains any temporaries (if the initializer
+ // expression is an ExprWithCleanups). Then make sure the normal lifetime
+ // extension code recurses into the default initializer and does lifetime
+ // extension when warranted.
+ bool ContainsAnyTemporaries =
+ isa_and_present<ExprWithCleanups>(Field->getInClassInitializer());
+ if (V.HasImmediateCalls || InLifetimeExtendingContext ||
+ ContainsAnyTemporaries) {
+ HasRewrittenInit = true;
ExprEvalContexts.back().DelayedDefaultInitializationContext = {Loc, Field,
CurContext};
ExprEvalContexts.back().IsCurrentlyCheckingDefaultArgumentOrInitializer =
NestedDefaultChecking;
-
+ // Pass down lifetime extending flag, and collect temporaries in
+ // CreateMaterializeTemporaryExpr when we rewrite the call argument.
+ keepInLifetimeExtendingContext();
EnsureImmediateInvocationInDefaultArgs Immediate(*this);
ExprResult Res;
+
+  // Rebuilding the CXXDefaultInitExpr might cause diagnostics.
+ SFINAETrap Trap(*this);
runWithSufficientStackSpace(Loc, [&] {
Res = Immediate.TransformInitializer(Field->getInClassInitializer(),
/*CXXDirectInit=*/false);
});
- if (!Res.isInvalid())
+ if (Res.isUsable())
Res = ConvertMemberDefaultInitExpression(Field, Res.get(), Loc);
if (Res.isInvalid()) {
Field->setInvalidDecl();
@@ -5692,7 +5693,7 @@ ExprResult Sema::BuildCXXDefaultInitExpr(SourceLocation Loc, FieldDecl *Field) {
return CXXDefaultInitExpr::Create(Context, InitializationContext->Loc,
Field, InitializationContext->Context,
- Init);
+ HasRewrittenInit ? Init : nullptr);
}
// DR1351:
@@ -7543,27 +7544,6 @@ bool Sema::isValidSveBitcast(QualType srcTy, QualType destTy) {
ValidScalableConversion(destTy, srcTy);
}
-/// Are the two types RVV-bitcast-compatible types? I.e. is bitcasting from the
-/// first RVV type (e.g. an RVV scalable type) to the second type (e.g. an RVV
-/// VLS type) allowed?
-///
-/// This will also return false if the two given types do not make sense from
-/// the perspective of RVV bitcasts.
-bool Sema::isValidRVVBitcast(QualType srcTy, QualType destTy) {
- assert(srcTy->isVectorType() || destTy->isVectorType());
-
- auto ValidScalableConversion = [](QualType FirstType, QualType SecondType) {
- if (!FirstType->isRVVSizelessBuiltinType())
- return false;
-
- const auto *VecTy = SecondType->getAs<VectorType>();
- return VecTy && VecTy->getVectorKind() == VectorKind::RVVFixedLengthData;
- };
-
- return ValidScalableConversion(srcTy, destTy) ||
- ValidScalableConversion(destTy, srcTy);
-}
-
/// Are the two types matrix types and do they have the same dimensions i.e.
/// do they have the same number of rows and the same number of columns?
bool Sema::areMatrixTypesOfTheSameDimension(QualType srcTy, QualType destTy) {
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index f543e00..d3e9dcb 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -1554,9 +1554,6 @@ Sema::BuildCXXTypeConstructExpr(TypeSourceInfo *TInfo,
bool ListInitialization) {
QualType Ty = TInfo->getType();
SourceLocation TyBeginLoc = TInfo->getTypeLoc().getBeginLoc();
-
- assert((!ListInitialization || Exprs.size() == 1) &&
- "List initialization must have exactly one expression.");
SourceRange FullRange = SourceRange(TyBeginLoc, RParenOrBraceLoc);
InitializedEntity Entity =
diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
index 353e911..79bdc8e 100644
--- a/clang/lib/Sema/SemaInit.cpp
+++ b/clang/lib/Sema/SemaInit.cpp
@@ -8066,11 +8066,6 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path,
enum PathLifetimeKind {
/// Lifetime-extend along this path.
Extend,
- /// We should lifetime-extend, but we don't because (due to technical
- /// limitations) we can't. This happens for default member initializers,
- /// which we don't clone for every use, so we don't have a unique
- /// MaterializeTemporaryExpr to update.
- ShouldExtend,
/// Do not lifetime extend along this path.
NoExtend
};
@@ -8082,7 +8077,7 @@ shouldLifetimeExtendThroughPath(const IndirectLocalPath &Path) {
PathLifetimeKind Kind = PathLifetimeKind::Extend;
for (auto Elem : Path) {
if (Elem.Kind == IndirectLocalPathEntry::DefaultInit)
- Kind = PathLifetimeKind::ShouldExtend;
+ Kind = PathLifetimeKind::Extend;
else if (Elem.Kind != IndirectLocalPathEntry::LambdaCaptureInit)
return PathLifetimeKind::NoExtend;
}
@@ -8202,18 +8197,6 @@ void Sema::checkInitializerLifetime(const InitializedEntity &Entity,
ExtendingEntity->allocateManglingNumber());
// Also visit the temporaries lifetime-extended by this initializer.
return true;
-
- case PathLifetimeKind::ShouldExtend:
- // We're supposed to lifetime-extend the temporary along this path (per
- // the resolution of DR1815), but we don't support that yet.
- //
- // FIXME: Properly handle this situation. Perhaps the easiest approach
- // would be to clone the initializer expression on each use that would
- // lifetime extend its temporaries.
- Diag(DiagLoc, diag::warn_unsupported_lifetime_extension)
- << RK << DiagRange;
- break;
-
case PathLifetimeKind::NoExtend:
// If the path goes through the initialization of a variable or field,
// it can't possibly reach a temporary created in this full-expression.
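
Dropping PathLifetimeKind::ShouldExtend (and its unsupported-lifetime-extension warning) pairs with the BuildCXXDefaultInitExpr rewrite earlier in this patch: default member initializers are now rebuilt per use, so their temporaries can be genuinely lifetime-extended per CWG1815. A minimal sketch of the user-visible effect, assuming a compiler with this change; the type is illustrative:

#include <cstdio>
#include <vector>

struct Widget {
  // Default member initializer binding a reference to a temporary vector.
  const std::vector<int> &values = {1, 2, 3};
};

int main() {
  Widget w{};                        // aggregate initialization uses the default member initializer
  // Previously: a warning about unsupported lifetime extension and a dangling
  // reference; now the temporary lives as long as 'w'.
  std::printf("%d\n", w.values[1]);  // prints 2
}
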
diff --git a/clang/lib/Sema/SemaLambda.cpp b/clang/lib/Sema/SemaLambda.cpp
index 1743afa..276a43a 100644
--- a/clang/lib/Sema/SemaLambda.cpp
+++ b/clang/lib/Sema/SemaLambda.cpp
@@ -12,6 +12,7 @@
#include "clang/Sema/SemaLambda.h"
#include "TypeLocBuilder.h"
#include "clang/AST/ASTLambda.h"
+#include "clang/AST/CXXInheritance.h"
#include "clang/AST/ExprCXX.h"
#include "clang/Basic/TargetInfo.h"
#include "clang/Sema/DeclSpec.h"
@@ -386,30 +387,69 @@ buildTypeForLambdaCallOperator(Sema &S, clang::CXXRecordDecl *Class,
// parameter, if any, of the lambda's function call operator (possibly
// instantiated from a function call operator template) shall be either:
// - the closure type,
-// - class type derived from the closure type, or
+// - class type publicly and unambiguously derived from the closure type, or
// - a reference to a possibly cv-qualified such type.
-void Sema::DiagnoseInvalidExplicitObjectParameterInLambda(
- CXXMethodDecl *Method) {
+bool Sema::DiagnoseInvalidExplicitObjectParameterInLambda(
+ CXXMethodDecl *Method, SourceLocation CallLoc) {
if (!isLambdaCallWithExplicitObjectParameter(Method))
- return;
+ return false;
CXXRecordDecl *RD = Method->getParent();
if (Method->getType()->isDependentType())
- return;
+ return false;
if (RD->isCapturelessLambda())
- return;
- QualType ExplicitObjectParameterType = Method->getParamDecl(0)
- ->getType()
+ return false;
+
+ ParmVarDecl *Param = Method->getParamDecl(0);
+ QualType ExplicitObjectParameterType = Param->getType()
.getNonReferenceType()
.getUnqualifiedType()
.getDesugaredType(getASTContext());
QualType LambdaType = getASTContext().getRecordType(RD);
if (LambdaType == ExplicitObjectParameterType)
- return;
- if (IsDerivedFrom(RD->getLocation(), ExplicitObjectParameterType, LambdaType))
- return;
- Diag(Method->getParamDecl(0)->getLocation(),
- diag::err_invalid_explicit_object_type_in_lambda)
- << ExplicitObjectParameterType;
+ return false;
+
+ // Don't check the same instantiation twice.
+ //
+ // If this call operator is ill-formed, there is no point in issuing
+ // a diagnostic every time it is called because the problem is in the
+ // definition of the derived type, not at the call site.
+ //
+ // FIXME: Move this check to where we instantiate the method? This should
+ // be possible, but the naive approach of just marking the method as invalid
+ // leads to us emitting more diagnostics than we should have to for this case
+ // (1 error here *and* 1 error about there being no matching overload at the
+ // call site). It might be possible to avoid that by also checking if there
+ // is an empty cast path for the method stored in the context (signalling that
+ // we've already diagnosed it) and then just not building the call, but that
+ // doesn't really seem any simpler than diagnosing it at the call site...
+ if (auto It = Context.LambdaCastPaths.find(Method);
+ It != Context.LambdaCastPaths.end())
+ return It->second.empty();
+
+ CXXCastPath &Path = Context.LambdaCastPaths[Method];
+ CXXBasePaths Paths(/*FindAmbiguities=*/true, /*RecordPaths=*/true,
+ /*DetectVirtual=*/false);
+ if (!IsDerivedFrom(RD->getLocation(), ExplicitObjectParameterType, LambdaType,
+ Paths)) {
+ Diag(Param->getLocation(), diag::err_invalid_explicit_object_type_in_lambda)
+ << ExplicitObjectParameterType;
+ return true;
+ }
+
+ if (Paths.isAmbiguous(LambdaType->getCanonicalTypeUnqualified())) {
+ std::string PathsDisplay = getAmbiguousPathsDisplayString(Paths);
+ Diag(CallLoc, diag::err_explicit_object_lambda_ambiguous_base)
+ << LambdaType << PathsDisplay;
+ return true;
+ }
+
+ if (CheckBaseClassAccess(CallLoc, LambdaType, ExplicitObjectParameterType,
+ Paths.front(),
+ diag::err_explicit_object_lambda_inaccessible_base))
+ return true;
+
+ BuildBasePathArray(Paths, Path);
+ return false;
}
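
The rewritten check now enforces that an explicit object parameter of a lambda reaches the closure type through a public, unambiguous base path, caching the cast path per call operator. A hedged C++23 sketch of the rule it enforces (all names are illustrative):

#include <utility>

int main() {
  int captured = 1;
  auto lambda = [captured](this auto &&self) { return captured; };
  using Closure = decltype(lambda);

  // Publicly and unambiguously derived from the closure type: accepted.
  struct Ok : Closure {
    Ok(Closure c) : Closure(std::move(c)) {}
  };
  Ok ok(lambda);
  (void)ok();

  // Privately derived: the closure type is an inaccessible base, so a call
  // through 'Bad' is now diagnosed instead of being silently accepted.
  struct Bad : private Closure {
    Bad(Closure c) : Closure(std::move(c)) {}
    using Closure::operator();
  };
  Bad bad(lambda);
  // (void)bad();   // error: the explicit object parameter cannot reach the closure type
  (void)bad;
}
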
void Sema::handleLambdaNumbering(
diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp
index e4d4cd7..ef0a655 100644
--- a/clang/lib/Sema/SemaLookup.cpp
+++ b/clang/lib/Sema/SemaLookup.cpp
@@ -34,6 +34,7 @@
#include "clang/Sema/ScopeInfo.h"
#include "clang/Sema/Sema.h"
#include "clang/Sema/SemaInternal.h"
+#include "clang/Sema/SemaRISCV.h"
#include "clang/Sema/TemplateDeduction.h"
#include "clang/Sema/TypoCorrection.h"
#include "llvm/ADT/STLExtras.h"
@@ -945,13 +946,13 @@ bool Sema::LookupBuiltin(LookupResult &R) {
}
}
- if (DeclareRISCVVBuiltins || DeclareRISCVSiFiveVectorBuiltins) {
- if (!RVIntrinsicManager)
- RVIntrinsicManager = CreateRISCVIntrinsicManager(*this);
+ if (RISCV().DeclareRVVBuiltins || RISCV().DeclareSiFiveVectorBuiltins) {
+ if (!RISCV().IntrinsicManager)
+ RISCV().IntrinsicManager = CreateRISCVIntrinsicManager(*this);
- RVIntrinsicManager->InitIntrinsicList();
+ RISCV().IntrinsicManager->InitIntrinsicList();
- if (RVIntrinsicManager->CreateIntrinsicIfFound(R, II, PP))
+ if (RISCV().IntrinsicManager->CreateIntrinsicIfFound(R, II, PP))
return true;
}
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 6110e52..bab61e8 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -9815,6 +9815,25 @@ static Stmt *buildPreInits(ASTContext &Context,
return nullptr;
}
+/// Append the \p Item or the content of a CompoundStmt to the list \p
+/// TargetList.
+///
+/// A CompoundStmt is used as a container in case multiple statements need to be
+/// stored in lieu of using an explicit list. Flattening is necessary because
+/// contained DeclStmts need to be visible after the execution of the list. Used
+/// for OpenMP pre-init declarations/statements.
+static void appendFlattendedStmtList(SmallVectorImpl<Stmt *> &TargetList,
+ Stmt *Item) {
+ // nullptr represents an empty list.
+ if (!Item)
+ return;
+
+ if (auto *CS = dyn_cast<CompoundStmt>(Item))
+ llvm::append_range(TargetList, CS->body());
+ else
+ TargetList.push_back(Item);
+}
+
/// Build preinits statement for the given declarations.
static Stmt *
buildPreInits(ASTContext &Context,
@@ -9828,6 +9847,17 @@ buildPreInits(ASTContext &Context,
return nullptr;
}
+/// Build pre-init statement for the given statements.
+static Stmt *buildPreInits(ASTContext &Context, ArrayRef<Stmt *> PreInits) {
+ if (PreInits.empty())
+ return nullptr;
+
+ SmallVector<Stmt *> Stmts;
+ for (Stmt *S : PreInits)
+ appendFlattendedStmtList(Stmts, S);
+  return CompoundStmt::Create(Context, Stmts, FPOptionsOverride(), {}, {});
+}
+
/// Build postupdate expression for the given list of postupdates expressions.
static Expr *buildPostUpdate(Sema &S, ArrayRef<Expr *> PostUpdates) {
Expr *PostUpdate = nullptr;
@@ -9924,11 +9954,21 @@ checkOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr,
Stmt *DependentPreInits = Transform->getPreInits();
if (!DependentPreInits)
return;
- for (Decl *C : cast<DeclStmt>(DependentPreInits)->getDeclGroup()) {
- auto *D = cast<VarDecl>(C);
- DeclRefExpr *Ref = buildDeclRefExpr(SemaRef, D, D->getType(),
- Transform->getBeginLoc());
- Captures[Ref] = Ref;
+
+ // Search for pre-init declared variables that need to be captured
+ // to be referenceable inside the directive.
+ SmallVector<Stmt *> Constituents;
+ appendFlattendedStmtList(Constituents, DependentPreInits);
+ for (Stmt *S : Constituents) {
+ if (auto *DC = dyn_cast<DeclStmt>(S)) {
+ for (Decl *C : DC->decls()) {
+ auto *D = cast<VarDecl>(C);
+ DeclRefExpr *Ref = buildDeclRefExpr(
+ SemaRef, D, D->getType().getNonReferenceType(),
+ Transform->getBeginLoc());
+ Captures[Ref] = Ref;
+ }
+ }
}
}))
return 0;
@@ -15059,9 +15099,7 @@ StmtResult SemaOpenMP::ActOnOpenMPTargetTeamsDistributeSimdDirective(
bool SemaOpenMP::checkTransformableLoopNest(
OpenMPDirectiveKind Kind, Stmt *AStmt, int NumLoops,
SmallVectorImpl<OMPLoopBasedDirective::HelperExprs> &LoopHelpers,
- Stmt *&Body,
- SmallVectorImpl<SmallVector<llvm::PointerUnion<Stmt *, Decl *>, 0>>
- &OriginalInits) {
+ Stmt *&Body, SmallVectorImpl<SmallVector<Stmt *, 0>> &OriginalInits) {
OriginalInits.emplace_back();
bool Result = OMPLoopBasedDirective::doForAllLoops(
AStmt->IgnoreContainers(), /*TryImperfectlyNestedLoops=*/false, NumLoops,
@@ -15095,16 +15133,70 @@ bool SemaOpenMP::checkTransformableLoopNest(
DependentPreInits = Dir->getPreInits();
else
llvm_unreachable("Unhandled loop transformation");
- if (!DependentPreInits)
- return;
- llvm::append_range(OriginalInits.back(),
- cast<DeclStmt>(DependentPreInits)->getDeclGroup());
+
+ appendFlattendedStmtList(OriginalInits.back(), DependentPreInits);
});
assert(OriginalInits.back().empty() && "No preinit after innermost loop");
OriginalInits.pop_back();
return Result;
}
+/// Add preinit statements that need to be propagated from the selected loop.
+static void addLoopPreInits(ASTContext &Context,
+ OMPLoopBasedDirective::HelperExprs &LoopHelper,
+ Stmt *LoopStmt, ArrayRef<Stmt *> OriginalInit,
+ SmallVectorImpl<Stmt *> &PreInits) {
+
+  // For range-based for-statements, ensure that the statements making up their
+  // syntactic sugar (the init statement and the __range/__end declarations) are
+  // executed by adding them as pre-init statements.
+ if (auto *CXXRangeFor = dyn_cast<CXXForRangeStmt>(LoopStmt)) {
+ Stmt *RangeInit = CXXRangeFor->getInit();
+ if (RangeInit)
+ PreInits.push_back(RangeInit);
+
+ DeclStmt *RangeStmt = CXXRangeFor->getRangeStmt();
+ PreInits.push_back(new (Context) DeclStmt(RangeStmt->getDeclGroup(),
+ RangeStmt->getBeginLoc(),
+ RangeStmt->getEndLoc()));
+
+ DeclStmt *RangeEnd = CXXRangeFor->getEndStmt();
+ PreInits.push_back(new (Context) DeclStmt(RangeEnd->getDeclGroup(),
+ RangeEnd->getBeginLoc(),
+ RangeEnd->getEndLoc()));
+ }
+
+ llvm::append_range(PreInits, OriginalInit);
+
+ // List of OMPCapturedExprDecl, for __begin, __end, and NumIterations
+ if (auto *PI = cast_or_null<DeclStmt>(LoopHelper.PreInits)) {
+ PreInits.push_back(new (Context) DeclStmt(
+ PI->getDeclGroup(), PI->getBeginLoc(), PI->getEndLoc()));
+ }
+
+ // Gather declarations for the data members used as counters.
+ for (Expr *CounterRef : LoopHelper.Counters) {
+ auto *CounterDecl = cast<DeclRefExpr>(CounterRef)->getDecl();
+ if (isa<OMPCapturedExprDecl>(CounterDecl))
+ PreInits.push_back(new (Context) DeclStmt(
+ DeclGroupRef(CounterDecl), SourceLocation(), SourceLocation()));
+ }
+}
+
+/// Collect the loop statements (ForStmt or CXXForRangeStmt) of the affected
+/// loops of a construct.
+static void collectLoopStmts(Stmt *AStmt, MutableArrayRef<Stmt *> LoopStmts) {
+ size_t NumLoops = LoopStmts.size();
+ OMPLoopBasedDirective::doForAllLoops(
+ AStmt, /*TryImperfectlyNestedLoops=*/false, NumLoops,
+ [LoopStmts](unsigned Cnt, Stmt *CurStmt) {
+ assert(!LoopStmts[Cnt] && "Loop statement must not yet be assigned");
+ LoopStmts[Cnt] = CurStmt;
+ return false;
+ });
+ assert(!is_contained(LoopStmts, nullptr) &&
+ "Expecting a loop statement for each affected loop");
+}
+
StmtResult SemaOpenMP::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
Stmt *AStmt,
SourceLocation StartLoc,
@@ -15126,8 +15218,7 @@ StmtResult SemaOpenMP::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
// Verify and diagnose loop nest.
SmallVector<OMPLoopBasedDirective::HelperExprs, 4> LoopHelpers(NumLoops);
Stmt *Body = nullptr;
- SmallVector<SmallVector<llvm::PointerUnion<Stmt *, Decl *>, 0>, 4>
- OriginalInits;
+ SmallVector<SmallVector<Stmt *, 0>, 4> OriginalInits;
if (!checkTransformableLoopNest(OMPD_tile, AStmt, NumLoops, LoopHelpers, Body,
OriginalInits))
return StmtError();
@@ -15144,7 +15235,11 @@ StmtResult SemaOpenMP::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
"Expecting loop iteration space dimensionality to match number of "
"affected loops");
- SmallVector<Decl *, 4> PreInits;
+ // Collect all affected loop statements.
+ SmallVector<Stmt *> LoopStmts(NumLoops, nullptr);
+ collectLoopStmts(AStmt, LoopStmts);
+
+ SmallVector<Stmt *, 4> PreInits;
CaptureVars CopyTransformer(SemaRef);
// Create iteration variables for the generated loops.
@@ -15184,20 +15279,9 @@ StmtResult SemaOpenMP::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
&SemaRef.PP.getIdentifierTable().get(TileCntName));
TileIndVars[I] = TileCntDecl;
}
- for (auto &P : OriginalInits[I]) {
- if (auto *D = P.dyn_cast<Decl *>())
- PreInits.push_back(D);
- else if (auto *PI = dyn_cast_or_null<DeclStmt>(P.dyn_cast<Stmt *>()))
- PreInits.append(PI->decl_begin(), PI->decl_end());
- }
- if (auto *PI = cast_or_null<DeclStmt>(LoopHelper.PreInits))
- PreInits.append(PI->decl_begin(), PI->decl_end());
- // Gather declarations for the data members used as counters.
- for (Expr *CounterRef : LoopHelper.Counters) {
- auto *CounterDecl = cast<DeclRefExpr>(CounterRef)->getDecl();
- if (isa<OMPCapturedExprDecl>(CounterDecl))
- PreInits.push_back(CounterDecl);
- }
+
+ addLoopPreInits(Context, LoopHelper, LoopStmts[I], OriginalInits[I],
+ PreInits);
}
// Once the original iteration values are set, append the innermost body.
@@ -15246,19 +15330,20 @@ StmtResult SemaOpenMP::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
OMPLoopBasedDirective::HelperExprs &LoopHelper = LoopHelpers[I];
Expr *NumIterations = LoopHelper.NumIterations;
auto *OrigCntVar = cast<DeclRefExpr>(LoopHelper.Counters[0]);
- QualType CntTy = OrigCntVar->getType();
+ QualType IVTy = NumIterations->getType();
+ Stmt *LoopStmt = LoopStmts[I];
// Commonly used variables. One of the constraints of an AST is that every
     // node object must appear at most once, hence we define lambdas that create
// a new AST node at every use.
- auto MakeTileIVRef = [&SemaRef = this->SemaRef, &TileIndVars, I, CntTy,
+ auto MakeTileIVRef = [&SemaRef = this->SemaRef, &TileIndVars, I, IVTy,
OrigCntVar]() {
- return buildDeclRefExpr(SemaRef, TileIndVars[I], CntTy,
+ return buildDeclRefExpr(SemaRef, TileIndVars[I], IVTy,
OrigCntVar->getExprLoc());
};
- auto MakeFloorIVRef = [&SemaRef = this->SemaRef, &FloorIndVars, I, CntTy,
+ auto MakeFloorIVRef = [&SemaRef = this->SemaRef, &FloorIndVars, I, IVTy,
OrigCntVar]() {
- return buildDeclRefExpr(SemaRef, FloorIndVars[I], CntTy,
+ return buildDeclRefExpr(SemaRef, FloorIndVars[I], IVTy,
OrigCntVar->getExprLoc());
};
@@ -15320,6 +15405,8 @@ StmtResult SemaOpenMP::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
// further into the inner loop.
SmallVector<Stmt *, 4> BodyParts;
BodyParts.append(LoopHelper.Updates.begin(), LoopHelper.Updates.end());
+ if (auto *SourceCXXFor = dyn_cast<CXXForRangeStmt>(LoopStmt))
+ BodyParts.push_back(SourceCXXFor->getLoopVarStmt());
BodyParts.push_back(Inner);
Inner = CompoundStmt::Create(Context, BodyParts, FPOptionsOverride(),
Inner->getBeginLoc(), Inner->getEndLoc());
@@ -15334,12 +15421,14 @@ StmtResult SemaOpenMP::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
auto &LoopHelper = LoopHelpers[I];
Expr *NumIterations = LoopHelper.NumIterations;
DeclRefExpr *OrigCntVar = cast<DeclRefExpr>(LoopHelper.Counters[0]);
- QualType CntTy = OrigCntVar->getType();
+ QualType IVTy = NumIterations->getType();
- // Commonly used variables.
- auto MakeFloorIVRef = [&SemaRef = this->SemaRef, &FloorIndVars, I, CntTy,
+ // Commonly used variables. One of the constraints of an AST is that every
+    // node object must appear at most once, hence we define lambdas that create
+ // a new AST node at every use.
+ auto MakeFloorIVRef = [&SemaRef = this->SemaRef, &FloorIndVars, I, IVTy,
OrigCntVar]() {
- return buildDeclRefExpr(SemaRef, FloorIndVars[I], CntTy,
+ return buildDeclRefExpr(SemaRef, FloorIndVars[I], IVTy,
OrigCntVar->getExprLoc());
};
@@ -15405,8 +15494,7 @@ StmtResult SemaOpenMP::ActOnOpenMPUnrollDirective(ArrayRef<OMPClause *> Clauses,
Stmt *Body = nullptr;
SmallVector<OMPLoopBasedDirective::HelperExprs, NumLoops> LoopHelpers(
NumLoops);
- SmallVector<SmallVector<llvm::PointerUnion<Stmt *, Decl *>, 0>, NumLoops + 1>
- OriginalInits;
+ SmallVector<SmallVector<Stmt *, 0>, NumLoops + 1> OriginalInits;
if (!checkTransformableLoopNest(OMPD_unroll, AStmt, NumLoops, LoopHelpers,
Body, OriginalInits))
return StmtError();
@@ -15418,6 +15506,10 @@ StmtResult SemaOpenMP::ActOnOpenMPUnrollDirective(ArrayRef<OMPClause *> Clauses,
return OMPUnrollDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt,
NumGeneratedLoops, nullptr, nullptr);
+ assert(LoopHelpers.size() == NumLoops &&
+ "Expecting a single-dimensional loop iteration space");
+ assert(OriginalInits.size() == NumLoops &&
+ "Expecting a single-dimensional loop iteration space");
OMPLoopBasedDirective::HelperExprs &LoopHelper = LoopHelpers.front();
if (FullClause) {
@@ -15481,24 +15573,13 @@ StmtResult SemaOpenMP::ActOnOpenMPUnrollDirective(ArrayRef<OMPClause *> Clauses,
// of a canonical loop nest where these PreInits are emitted before the
// outermost directive.
+ // Find the loop statement.
+ Stmt *LoopStmt = nullptr;
+ collectLoopStmts(AStmt, {LoopStmt});
+
// Determine the PreInit declarations.
- SmallVector<Decl *, 4> PreInits;
- assert(OriginalInits.size() == 1 &&
- "Expecting a single-dimensional loop iteration space");
- for (auto &P : OriginalInits[0]) {
- if (auto *D = P.dyn_cast<Decl *>())
- PreInits.push_back(D);
- else if (auto *PI = dyn_cast_or_null<DeclStmt>(P.dyn_cast<Stmt *>()))
- PreInits.append(PI->decl_begin(), PI->decl_end());
- }
- if (auto *PI = cast_or_null<DeclStmt>(LoopHelper.PreInits))
- PreInits.append(PI->decl_begin(), PI->decl_end());
- // Gather declarations for the data members used as counters.
- for (Expr *CounterRef : LoopHelper.Counters) {
- auto *CounterDecl = cast<DeclRefExpr>(CounterRef)->getDecl();
- if (isa<OMPCapturedExprDecl>(CounterDecl))
- PreInits.push_back(CounterDecl);
- }
+ SmallVector<Stmt *, 4> PreInits;
+ addLoopPreInits(Context, LoopHelper, LoopStmt, OriginalInits[0], PreInits);
auto *IterationVarRef = cast<DeclRefExpr>(LoopHelper.IterationVarRef);
QualType IVTy = IterationVarRef->getType();
@@ -15604,6 +15685,8 @@ StmtResult SemaOpenMP::ActOnOpenMPUnrollDirective(ArrayRef<OMPClause *> Clauses,
// Inner For statement.
SmallVector<Stmt *> InnerBodyStmts;
InnerBodyStmts.append(LoopHelper.Updates.begin(), LoopHelper.Updates.end());
+ if (auto *CXXRangeFor = dyn_cast<CXXForRangeStmt>(LoopStmt))
+ InnerBodyStmts.push_back(CXXRangeFor->getLoopVarStmt());
InnerBodyStmts.push_back(Body);
CompoundStmt *InnerBody =
CompoundStmt::Create(getASTContext(), InnerBodyStmts, FPOptionsOverride(),
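
Together, collectLoopStmts, addLoopPreInits, and the flattened pre-init lists let the loop-transformation directives handle range-based for loops: the range/begin/end machinery becomes pre-init statements and the loop-variable declaration is re-emitted in the generated body. A hedged sketch of the construct this enables (compile with -fopenmp, OpenMP 5.1 semantics; the function is illustrative):

#include <vector>

void scale(std::vector<float> &v, float a) {
#pragma omp tile sizes(4)
  for (float &x : v)   // range-based for is now accepted by tile/unroll
    x *= a;
}
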
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index 2eb2523..0c89fca 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -6472,17 +6472,20 @@ ExprResult Sema::InitializeExplicitObjectArgument(Sema &S, Expr *Obj,
Obj->getExprLoc(), Obj);
}
-static void PrepareExplicitObjectArgument(Sema &S, CXXMethodDecl *Method,
+static bool PrepareExplicitObjectArgument(Sema &S, CXXMethodDecl *Method,
Expr *Object, MultiExprArg &Args,
SmallVectorImpl<Expr *> &NewArgs) {
assert(Method->isExplicitObjectMemberFunction() &&
"Method is not an explicit member function");
assert(NewArgs.empty() && "NewArgs should be empty");
+
NewArgs.reserve(Args.size() + 1);
Expr *This = GetExplicitObjectExpr(S, Object, Method);
NewArgs.push_back(This);
NewArgs.append(Args.begin(), Args.end());
Args = NewArgs;
+ return S.DiagnoseInvalidExplicitObjectParameterInLambda(
+ Method, Object->getBeginLoc());
}
/// Determine whether the provided type is an integral type, or an enumeration
@@ -15612,8 +15615,10 @@ ExprResult Sema::BuildCallToMemberFunction(Scope *S, Expr *MemExprE,
CallExpr *TheCall = nullptr;
llvm::SmallVector<Expr *, 8> NewArgs;
if (Method->isExplicitObjectMemberFunction()) {
- PrepareExplicitObjectArgument(*this, Method, MemExpr->getBase(), Args,
- NewArgs);
+ if (PrepareExplicitObjectArgument(*this, Method, MemExpr->getBase(), Args,
+ NewArgs))
+ return ExprError();
+
// Build the actual expression node.
ExprResult FnExpr =
CreateFunctionRefExpr(*this, Method, FoundDecl, MemExpr,
@@ -15927,9 +15932,7 @@ Sema::BuildCallToObjectOfClassType(Scope *S, Expr *Obj,
// Initialize the object parameter.
llvm::SmallVector<Expr *, 8> NewArgs;
if (Method->isExplicitObjectMemberFunction()) {
- // FIXME: we should do that during the definition of the lambda when we can.
- DiagnoseInvalidExplicitObjectParameterInLambda(Method);
- PrepareExplicitObjectArgument(*this, Method, Obj, Args, NewArgs);
+ IsError |= PrepareExplicitObjectArgument(*this, Method, Obj, Args, NewArgs);
} else {
ExprResult ObjRes = PerformImplicitObjectArgumentInitialization(
Object.get(), /*Qualifier=*/nullptr, Best->FoundDecl, Method);
diff --git a/clang/lib/Sema/SemaRISCV.cpp b/clang/lib/Sema/SemaRISCV.cpp
index 26e13e8..ea6e3f7 100644
--- a/clang/lib/Sema/SemaRISCV.cpp
+++ b/clang/lib/Sema/SemaRISCV.cpp
@@ -1,4 +1,4 @@
-//==- SemaRISCVVectorLookup.cpp - Name Lookup for RISC-V Vector Intrinsic -==//
+//===------ SemaRISCV.cpp ------- RISC-V target-specific routines ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,20 +6,24 @@
//
//===----------------------------------------------------------------------===//
//
-// This file implements name lookup for RISC-V vector intrinsic.
+// This file implements semantic analysis functions specific to RISC-V.
//
//===----------------------------------------------------------------------===//
+#include "clang/Sema/SemaRISCV.h"
#include "clang/AST/ASTContext.h"
#include "clang/AST/Decl.h"
#include "clang/Basic/Builtins.h"
+#include "clang/Basic/TargetBuiltins.h"
#include "clang/Basic/TargetInfo.h"
#include "clang/Lex/Preprocessor.h"
+#include "clang/Sema/Initialization.h"
#include "clang/Sema/Lookup.h"
#include "clang/Sema/RISCVIntrinsicManager.h"
#include "clang/Sema/Sema.h"
#include "clang/Support/RISCVVIntrinsicUtils.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/TargetParser/RISCVTargetParser.h"
#include <optional>
#include <string>
#include <vector>
@@ -166,7 +170,6 @@ private:
// Mapping function name to RVVOverloadIntrinsicDef.
StringMap<RVVOverloadIntrinsicDef> OverloadIntrinsics;
-
// Create RVVIntrinsicDef.
void InitRVVIntrinsic(const RVVIntrinsicRecord &Record, StringRef SuffixStr,
StringRef OverloadedSuffixStr, bool IsMask,
@@ -342,18 +345,17 @@ void RISCVIntrinsicManagerImpl::ConstructRVVIntrinsics(
/*IsMask=*/true, *PolicyTypes, MaskedHasPolicy, P);
}
} // End for different LMUL
- } // End for different TypeRange
+ } // End for different TypeRange
}
}
void RISCVIntrinsicManagerImpl::InitIntrinsicList() {
- if (S.DeclareRISCVVBuiltins && !ConstructedRISCVVBuiltins) {
+ if (S.RISCV().DeclareRVVBuiltins && !ConstructedRISCVVBuiltins) {
ConstructedRISCVVBuiltins = true;
- ConstructRVVIntrinsics(RVVIntrinsicRecords,
- IntrinsicKind::RVV);
+ ConstructRVVIntrinsics(RVVIntrinsicRecords, IntrinsicKind::RVV);
}
- if (S.DeclareRISCVSiFiveVectorBuiltins &&
+ if (S.RISCV().DeclareSiFiveVectorBuiltins &&
!ConstructedRISCVSiFiveVectorBuiltins) {
ConstructedRISCVSiFiveVectorBuiltins = true;
ConstructRVVIntrinsics(RVSiFiveVectorIntrinsicRecords,
@@ -501,4 +503,925 @@ std::unique_ptr<clang::sema::RISCVIntrinsicManager>
CreateRISCVIntrinsicManager(Sema &S) {
return std::make_unique<RISCVIntrinsicManagerImpl>(S);
}
+
+bool SemaRISCV::CheckLMUL(CallExpr *TheCall, unsigned ArgNum) {
+ llvm::APSInt Result;
+
+ // We can't check the value of a dependent argument.
+ Expr *Arg = TheCall->getArg(ArgNum);
+ if (Arg->isTypeDependent() || Arg->isValueDependent())
+ return false;
+
+ // Check constant-ness first.
+ if (SemaRef.BuiltinConstantArg(TheCall, ArgNum, Result))
+ return true;
+
+ int64_t Val = Result.getSExtValue();
+ if ((Val >= 0 && Val <= 3) || (Val >= 5 && Val <= 7))
+ return false;
+
+ return Diag(TheCall->getBeginLoc(), diag::err_riscv_builtin_invalid_lmul)
+ << Arg->getSourceRange();
+}
+
+static bool CheckInvalidVLENandLMUL(const TargetInfo &TI, CallExpr *TheCall,
+ Sema &S, QualType Type, int EGW) {
+ assert((EGW == 128 || EGW == 256) && "EGW can only be 128 or 256 bits");
+
+ // LMUL * VLEN >= EGW
+ ASTContext::BuiltinVectorTypeInfo Info =
+ S.Context.getBuiltinVectorTypeInfo(Type->castAs<BuiltinType>());
+ unsigned ElemSize = S.Context.getTypeSize(Info.ElementType);
+ unsigned MinElemCount = Info.EC.getKnownMinValue();
+
+ unsigned EGS = EGW / ElemSize;
+ // If EGS is less than or equal to the minimum number of elements, then the
+ // type is valid.
+ if (EGS <= MinElemCount)
+ return false;
+
+  // Otherwise, we need vscale to be at least EGS / MinElemCount.
+ assert(EGS % MinElemCount == 0);
+ unsigned VScaleFactor = EGS / MinElemCount;
+ // Vscale is VLEN/RVVBitsPerBlock.
+ unsigned MinRequiredVLEN = VScaleFactor * llvm::RISCV::RVVBitsPerBlock;
+ std::string RequiredExt = "zvl" + std::to_string(MinRequiredVLEN) + "b";
+ if (!TI.hasFeature(RequiredExt))
+ return S.Diag(TheCall->getBeginLoc(),
+ diag::err_riscv_type_requires_extension)
+ << Type << RequiredExt;
+
+ return false;
+}
+
+bool SemaRISCV::CheckBuiltinFunctionCall(const TargetInfo &TI,
+ unsigned BuiltinID,
+ CallExpr *TheCall) {
+ ASTContext &Context = getASTContext();
+ // vmulh.vv, vmulh.vx, vmulhu.vv, vmulhu.vx, vmulhsu.vv, vmulhsu.vx,
+ // vsmul.vv, vsmul.vx are not included for EEW=64 in Zve64*.
+ switch (BuiltinID) {
+ default:
+ break;
+ case RISCVVector::BI__builtin_rvv_vmulhsu_vv:
+ case RISCVVector::BI__builtin_rvv_vmulhsu_vx:
+ case RISCVVector::BI__builtin_rvv_vmulhsu_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vmulhsu_vx_tu:
+ case RISCVVector::BI__builtin_rvv_vmulhsu_vv_m:
+ case RISCVVector::BI__builtin_rvv_vmulhsu_vx_m:
+ case RISCVVector::BI__builtin_rvv_vmulhsu_vv_mu:
+ case RISCVVector::BI__builtin_rvv_vmulhsu_vx_mu:
+ case RISCVVector::BI__builtin_rvv_vmulhsu_vv_tum:
+ case RISCVVector::BI__builtin_rvv_vmulhsu_vx_tum:
+ case RISCVVector::BI__builtin_rvv_vmulhsu_vv_tumu:
+ case RISCVVector::BI__builtin_rvv_vmulhsu_vx_tumu:
+ case RISCVVector::BI__builtin_rvv_vmulhu_vv:
+ case RISCVVector::BI__builtin_rvv_vmulhu_vx:
+ case RISCVVector::BI__builtin_rvv_vmulhu_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vmulhu_vx_tu:
+ case RISCVVector::BI__builtin_rvv_vmulhu_vv_m:
+ case RISCVVector::BI__builtin_rvv_vmulhu_vx_m:
+ case RISCVVector::BI__builtin_rvv_vmulhu_vv_mu:
+ case RISCVVector::BI__builtin_rvv_vmulhu_vx_mu:
+ case RISCVVector::BI__builtin_rvv_vmulhu_vv_tum:
+ case RISCVVector::BI__builtin_rvv_vmulhu_vx_tum:
+ case RISCVVector::BI__builtin_rvv_vmulhu_vv_tumu:
+ case RISCVVector::BI__builtin_rvv_vmulhu_vx_tumu:
+ case RISCVVector::BI__builtin_rvv_vmulh_vv:
+ case RISCVVector::BI__builtin_rvv_vmulh_vx:
+ case RISCVVector::BI__builtin_rvv_vmulh_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vmulh_vx_tu:
+ case RISCVVector::BI__builtin_rvv_vmulh_vv_m:
+ case RISCVVector::BI__builtin_rvv_vmulh_vx_m:
+ case RISCVVector::BI__builtin_rvv_vmulh_vv_mu:
+ case RISCVVector::BI__builtin_rvv_vmulh_vx_mu:
+ case RISCVVector::BI__builtin_rvv_vmulh_vv_tum:
+ case RISCVVector::BI__builtin_rvv_vmulh_vx_tum:
+ case RISCVVector::BI__builtin_rvv_vmulh_vv_tumu:
+ case RISCVVector::BI__builtin_rvv_vmulh_vx_tumu:
+ case RISCVVector::BI__builtin_rvv_vsmul_vv:
+ case RISCVVector::BI__builtin_rvv_vsmul_vx:
+ case RISCVVector::BI__builtin_rvv_vsmul_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vsmul_vx_tu:
+ case RISCVVector::BI__builtin_rvv_vsmul_vv_m:
+ case RISCVVector::BI__builtin_rvv_vsmul_vx_m:
+ case RISCVVector::BI__builtin_rvv_vsmul_vv_mu:
+ case RISCVVector::BI__builtin_rvv_vsmul_vx_mu:
+ case RISCVVector::BI__builtin_rvv_vsmul_vv_tum:
+ case RISCVVector::BI__builtin_rvv_vsmul_vx_tum:
+ case RISCVVector::BI__builtin_rvv_vsmul_vv_tumu:
+ case RISCVVector::BI__builtin_rvv_vsmul_vx_tumu: {
+ ASTContext::BuiltinVectorTypeInfo Info = Context.getBuiltinVectorTypeInfo(
+ TheCall->getType()->castAs<BuiltinType>());
+
+ if (Context.getTypeSize(Info.ElementType) == 64 && !TI.hasFeature("v"))
+ return Diag(TheCall->getBeginLoc(),
+ diag::err_riscv_builtin_requires_extension)
+ << /* IsExtension */ true << TheCall->getSourceRange() << "v";
+
+ break;
+ }
+ }
+
+ switch (BuiltinID) {
+ case RISCVVector::BI__builtin_rvv_vsetvli:
+ return SemaRef.BuiltinConstantArgRange(TheCall, 1, 0, 3) ||
+ CheckLMUL(TheCall, 2);
+ case RISCVVector::BI__builtin_rvv_vsetvlimax:
+ return SemaRef.BuiltinConstantArgRange(TheCall, 0, 0, 3) ||
+ CheckLMUL(TheCall, 1);
+ case RISCVVector::BI__builtin_rvv_vget_v: {
+ ASTContext::BuiltinVectorTypeInfo ResVecInfo =
+ Context.getBuiltinVectorTypeInfo(cast<BuiltinType>(
+ TheCall->getType().getCanonicalType().getTypePtr()));
+ ASTContext::BuiltinVectorTypeInfo VecInfo =
+ Context.getBuiltinVectorTypeInfo(cast<BuiltinType>(
+ TheCall->getArg(0)->getType().getCanonicalType().getTypePtr()));
+ unsigned MaxIndex;
+ if (VecInfo.NumVectors != 1) // vget for tuple type
+ MaxIndex = VecInfo.NumVectors;
+ else // vget for non-tuple type
+ MaxIndex = (VecInfo.EC.getKnownMinValue() * VecInfo.NumVectors) /
+ (ResVecInfo.EC.getKnownMinValue() * ResVecInfo.NumVectors);
+ return SemaRef.BuiltinConstantArgRange(TheCall, 1, 0, MaxIndex - 1);
+ }
+ case RISCVVector::BI__builtin_rvv_vset_v: {
+ ASTContext::BuiltinVectorTypeInfo ResVecInfo =
+ Context.getBuiltinVectorTypeInfo(cast<BuiltinType>(
+ TheCall->getType().getCanonicalType().getTypePtr()));
+ ASTContext::BuiltinVectorTypeInfo VecInfo =
+ Context.getBuiltinVectorTypeInfo(cast<BuiltinType>(
+ TheCall->getArg(2)->getType().getCanonicalType().getTypePtr()));
+ unsigned MaxIndex;
+ if (ResVecInfo.NumVectors != 1) // vset for tuple type
+ MaxIndex = ResVecInfo.NumVectors;
+    else // vset for non-tuple type
+ MaxIndex = (ResVecInfo.EC.getKnownMinValue() * ResVecInfo.NumVectors) /
+ (VecInfo.EC.getKnownMinValue() * VecInfo.NumVectors);
+ return SemaRef.BuiltinConstantArgRange(TheCall, 1, 0, MaxIndex - 1);
+ }
+ // Vector Crypto
+ case RISCVVector::BI__builtin_rvv_vaeskf1_vi_tu:
+ case RISCVVector::BI__builtin_rvv_vaeskf2_vi_tu:
+ case RISCVVector::BI__builtin_rvv_vaeskf2_vi:
+ case RISCVVector::BI__builtin_rvv_vsm4k_vi_tu: {
+ QualType Op1Type = TheCall->getArg(0)->getType();
+ QualType Op2Type = TheCall->getArg(1)->getType();
+ return CheckInvalidVLENandLMUL(TI, TheCall, SemaRef, Op1Type, 128) ||
+ CheckInvalidVLENandLMUL(TI, TheCall, SemaRef, Op2Type, 128) ||
+ SemaRef.BuiltinConstantArgRange(TheCall, 2, 0, 31);
+ }
+ case RISCVVector::BI__builtin_rvv_vsm3c_vi_tu:
+ case RISCVVector::BI__builtin_rvv_vsm3c_vi: {
+ QualType Op1Type = TheCall->getArg(0)->getType();
+ return CheckInvalidVLENandLMUL(TI, TheCall, SemaRef, Op1Type, 256) ||
+ SemaRef.BuiltinConstantArgRange(TheCall, 2, 0, 31);
+ }
+ case RISCVVector::BI__builtin_rvv_vaeskf1_vi:
+ case RISCVVector::BI__builtin_rvv_vsm4k_vi: {
+ QualType Op1Type = TheCall->getArg(0)->getType();
+ return CheckInvalidVLENandLMUL(TI, TheCall, SemaRef, Op1Type, 128) ||
+ SemaRef.BuiltinConstantArgRange(TheCall, 1, 0, 31);
+ }
+ case RISCVVector::BI__builtin_rvv_vaesdf_vv:
+ case RISCVVector::BI__builtin_rvv_vaesdf_vs:
+ case RISCVVector::BI__builtin_rvv_vaesdm_vv:
+ case RISCVVector::BI__builtin_rvv_vaesdm_vs:
+ case RISCVVector::BI__builtin_rvv_vaesef_vv:
+ case RISCVVector::BI__builtin_rvv_vaesef_vs:
+ case RISCVVector::BI__builtin_rvv_vaesem_vv:
+ case RISCVVector::BI__builtin_rvv_vaesem_vs:
+ case RISCVVector::BI__builtin_rvv_vaesz_vs:
+ case RISCVVector::BI__builtin_rvv_vsm4r_vv:
+ case RISCVVector::BI__builtin_rvv_vsm4r_vs:
+ case RISCVVector::BI__builtin_rvv_vaesdf_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vaesdf_vs_tu:
+ case RISCVVector::BI__builtin_rvv_vaesdm_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vaesdm_vs_tu:
+ case RISCVVector::BI__builtin_rvv_vaesef_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vaesef_vs_tu:
+ case RISCVVector::BI__builtin_rvv_vaesem_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vaesem_vs_tu:
+ case RISCVVector::BI__builtin_rvv_vaesz_vs_tu:
+ case RISCVVector::BI__builtin_rvv_vsm4r_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vsm4r_vs_tu: {
+ QualType Op1Type = TheCall->getArg(0)->getType();
+ QualType Op2Type = TheCall->getArg(1)->getType();
+ return CheckInvalidVLENandLMUL(TI, TheCall, SemaRef, Op1Type, 128) ||
+ CheckInvalidVLENandLMUL(TI, TheCall, SemaRef, Op2Type, 128);
+ }
+ case RISCVVector::BI__builtin_rvv_vsha2ch_vv:
+ case RISCVVector::BI__builtin_rvv_vsha2cl_vv:
+ case RISCVVector::BI__builtin_rvv_vsha2ms_vv:
+ case RISCVVector::BI__builtin_rvv_vsha2ch_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vsha2cl_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vsha2ms_vv_tu: {
+ QualType Op1Type = TheCall->getArg(0)->getType();
+ QualType Op2Type = TheCall->getArg(1)->getType();
+ QualType Op3Type = TheCall->getArg(2)->getType();
+ ASTContext::BuiltinVectorTypeInfo Info =
+ Context.getBuiltinVectorTypeInfo(Op1Type->castAs<BuiltinType>());
+ uint64_t ElemSize = Context.getTypeSize(Info.ElementType);
+ if (ElemSize == 64 && !TI.hasFeature("zvknhb"))
+ return Diag(TheCall->getBeginLoc(),
+ diag::err_riscv_builtin_requires_extension)
+             << /* IsExtension */ true << TheCall->getSourceRange() << "zvknhb";
+
+ return CheckInvalidVLENandLMUL(TI, TheCall, SemaRef, Op1Type,
+ ElemSize * 4) ||
+ CheckInvalidVLENandLMUL(TI, TheCall, SemaRef, Op2Type,
+ ElemSize * 4) ||
+ CheckInvalidVLENandLMUL(TI, TheCall, SemaRef, Op3Type, ElemSize * 4);
+ }
+
+ case RISCVVector::BI__builtin_rvv_sf_vc_i_se:
+ // bit_27_26, bit_24_20, bit_11_7, simm5, sew, log2lmul
+ return SemaRef.BuiltinConstantArgRange(TheCall, 0, 0, 3) ||
+ SemaRef.BuiltinConstantArgRange(TheCall, 1, 0, 31) ||
+ SemaRef.BuiltinConstantArgRange(TheCall, 2, 0, 31) ||
+ SemaRef.BuiltinConstantArgRange(TheCall, 3, -16, 15) ||
+ CheckLMUL(TheCall, 5);
+ case RISCVVector::BI__builtin_rvv_sf_vc_iv_se:
+ // bit_27_26, bit_11_7, vs2, simm5
+ return SemaRef.BuiltinConstantArgRange(TheCall, 0, 0, 3) ||
+ SemaRef.BuiltinConstantArgRange(TheCall, 1, 0, 31) ||
+ SemaRef.BuiltinConstantArgRange(TheCall, 3, -16, 15);
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_i:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_i_se:
+ // bit_27_26, bit_24_20, simm5
+ return SemaRef.BuiltinConstantArgRange(TheCall, 0, 0, 3) ||
+ SemaRef.BuiltinConstantArgRange(TheCall, 1, 0, 31) ||
+ SemaRef.BuiltinConstantArgRange(TheCall, 2, -16, 15);
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_iv:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_iv_se:
+ // bit_27_26, vs2, simm5
+ return SemaRef.BuiltinConstantArgRange(TheCall, 0, 0, 3) ||
+ SemaRef.BuiltinConstantArgRange(TheCall, 2, -16, 15);
+ case RISCVVector::BI__builtin_rvv_sf_vc_ivv_se:
+ case RISCVVector::BI__builtin_rvv_sf_vc_ivw_se:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_ivv:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_ivw:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_ivv_se:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_ivw_se:
+ // bit_27_26, vd, vs2, simm5
+ return SemaRef.BuiltinConstantArgRange(TheCall, 0, 0, 3) ||
+ SemaRef.BuiltinConstantArgRange(TheCall, 3, -16, 15);
+ case RISCVVector::BI__builtin_rvv_sf_vc_x_se:
+ // bit_27_26, bit_24_20, bit_11_7, xs1, sew, log2lmul
+ return SemaRef.BuiltinConstantArgRange(TheCall, 0, 0, 3) ||
+ SemaRef.BuiltinConstantArgRange(TheCall, 1, 0, 31) ||
+ SemaRef.BuiltinConstantArgRange(TheCall, 2, 0, 31) ||
+ CheckLMUL(TheCall, 5);
+ case RISCVVector::BI__builtin_rvv_sf_vc_xv_se:
+ case RISCVVector::BI__builtin_rvv_sf_vc_vv_se:
+ // bit_27_26, bit_11_7, vs2, xs1/vs1
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_x:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_x_se:
+    // bit_27_26, bit_24_20, xs1
+ return SemaRef.BuiltinConstantArgRange(TheCall, 0, 0, 3) ||
+ SemaRef.BuiltinConstantArgRange(TheCall, 1, 0, 31);
+ case RISCVVector::BI__builtin_rvv_sf_vc_vvv_se:
+ case RISCVVector::BI__builtin_rvv_sf_vc_xvv_se:
+ case RISCVVector::BI__builtin_rvv_sf_vc_vvw_se:
+ case RISCVVector::BI__builtin_rvv_sf_vc_xvw_se:
+ // bit_27_26, vd, vs2, xs1
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_xv:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_vv:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_xv_se:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_vv_se:
+ // bit_27_26, vs2, xs1/vs1
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_xvv:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_vvv:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_xvw:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_vvw:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_xvv_se:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_vvv_se:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_xvw_se:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_vvw_se:
+ // bit_27_26, vd, vs2, xs1/vs1
+ return SemaRef.BuiltinConstantArgRange(TheCall, 0, 0, 3);
+ case RISCVVector::BI__builtin_rvv_sf_vc_fv_se:
+ // bit_26, bit_11_7, vs2, fs1
+ return SemaRef.BuiltinConstantArgRange(TheCall, 0, 0, 1) ||
+ SemaRef.BuiltinConstantArgRange(TheCall, 1, 0, 31);
+ case RISCVVector::BI__builtin_rvv_sf_vc_fvv_se:
+ case RISCVVector::BI__builtin_rvv_sf_vc_fvw_se:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_fvv:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_fvw:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_fvv_se:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_fvw_se:
+ // bit_26, vd, vs2, fs1
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_fv:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_fv_se:
+ // bit_26, vs2, fs1
+ return SemaRef.BuiltinConstantArgRange(TheCall, 0, 0, 1);
+ // Check if byteselect is in [0, 3]
+ case RISCV::BI__builtin_riscv_aes32dsi:
+ case RISCV::BI__builtin_riscv_aes32dsmi:
+ case RISCV::BI__builtin_riscv_aes32esi:
+ case RISCV::BI__builtin_riscv_aes32esmi:
+ case RISCV::BI__builtin_riscv_sm4ks:
+ case RISCV::BI__builtin_riscv_sm4ed:
+ return SemaRef.BuiltinConstantArgRange(TheCall, 2, 0, 3);
+ // Check if rnum is in [0, 10]
+ case RISCV::BI__builtin_riscv_aes64ks1i:
+ return SemaRef.BuiltinConstantArgRange(TheCall, 1, 0, 10);
+ // Check if value range for vxrm is in [0, 3]
+ case RISCVVector::BI__builtin_rvv_vaaddu_vv:
+ case RISCVVector::BI__builtin_rvv_vaaddu_vx:
+ case RISCVVector::BI__builtin_rvv_vaadd_vv:
+ case RISCVVector::BI__builtin_rvv_vaadd_vx:
+ case RISCVVector::BI__builtin_rvv_vasubu_vv:
+ case RISCVVector::BI__builtin_rvv_vasubu_vx:
+ case RISCVVector::BI__builtin_rvv_vasub_vv:
+ case RISCVVector::BI__builtin_rvv_vasub_vx:
+ case RISCVVector::BI__builtin_rvv_vsmul_vv:
+ case RISCVVector::BI__builtin_rvv_vsmul_vx:
+ case RISCVVector::BI__builtin_rvv_vssra_vv:
+ case RISCVVector::BI__builtin_rvv_vssra_vx:
+ case RISCVVector::BI__builtin_rvv_vssrl_vv:
+ case RISCVVector::BI__builtin_rvv_vssrl_vx:
+ case RISCVVector::BI__builtin_rvv_vnclip_wv:
+ case RISCVVector::BI__builtin_rvv_vnclip_wx:
+ case RISCVVector::BI__builtin_rvv_vnclipu_wv:
+ case RISCVVector::BI__builtin_rvv_vnclipu_wx:
+ return SemaRef.BuiltinConstantArgRange(TheCall, 2, 0, 3);
+ case RISCVVector::BI__builtin_rvv_vaaddu_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vaaddu_vx_tu:
+ case RISCVVector::BI__builtin_rvv_vaadd_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vaadd_vx_tu:
+ case RISCVVector::BI__builtin_rvv_vasubu_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vasubu_vx_tu:
+ case RISCVVector::BI__builtin_rvv_vasub_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vasub_vx_tu:
+ case RISCVVector::BI__builtin_rvv_vsmul_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vsmul_vx_tu:
+ case RISCVVector::BI__builtin_rvv_vssra_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vssra_vx_tu:
+ case RISCVVector::BI__builtin_rvv_vssrl_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vssrl_vx_tu:
+ case RISCVVector::BI__builtin_rvv_vnclip_wv_tu:
+ case RISCVVector::BI__builtin_rvv_vnclip_wx_tu:
+ case RISCVVector::BI__builtin_rvv_vnclipu_wv_tu:
+ case RISCVVector::BI__builtin_rvv_vnclipu_wx_tu:
+ case RISCVVector::BI__builtin_rvv_vaaddu_vv_m:
+ case RISCVVector::BI__builtin_rvv_vaaddu_vx_m:
+ case RISCVVector::BI__builtin_rvv_vaadd_vv_m:
+ case RISCVVector::BI__builtin_rvv_vaadd_vx_m:
+ case RISCVVector::BI__builtin_rvv_vasubu_vv_m:
+ case RISCVVector::BI__builtin_rvv_vasubu_vx_m:
+ case RISCVVector::BI__builtin_rvv_vasub_vv_m:
+ case RISCVVector::BI__builtin_rvv_vasub_vx_m:
+ case RISCVVector::BI__builtin_rvv_vsmul_vv_m:
+ case RISCVVector::BI__builtin_rvv_vsmul_vx_m:
+ case RISCVVector::BI__builtin_rvv_vssra_vv_m:
+ case RISCVVector::BI__builtin_rvv_vssra_vx_m:
+ case RISCVVector::BI__builtin_rvv_vssrl_vv_m:
+ case RISCVVector::BI__builtin_rvv_vssrl_vx_m:
+ case RISCVVector::BI__builtin_rvv_vnclip_wv_m:
+ case RISCVVector::BI__builtin_rvv_vnclip_wx_m:
+ case RISCVVector::BI__builtin_rvv_vnclipu_wv_m:
+ case RISCVVector::BI__builtin_rvv_vnclipu_wx_m:
+ return SemaRef.BuiltinConstantArgRange(TheCall, 3, 0, 3);
+ case RISCVVector::BI__builtin_rvv_vaaddu_vv_tum:
+ case RISCVVector::BI__builtin_rvv_vaaddu_vv_tumu:
+ case RISCVVector::BI__builtin_rvv_vaaddu_vv_mu:
+ case RISCVVector::BI__builtin_rvv_vaaddu_vx_tum:
+ case RISCVVector::BI__builtin_rvv_vaaddu_vx_tumu:
+ case RISCVVector::BI__builtin_rvv_vaaddu_vx_mu:
+ case RISCVVector::BI__builtin_rvv_vaadd_vv_tum:
+ case RISCVVector::BI__builtin_rvv_vaadd_vv_tumu:
+ case RISCVVector::BI__builtin_rvv_vaadd_vv_mu:
+ case RISCVVector::BI__builtin_rvv_vaadd_vx_tum:
+ case RISCVVector::BI__builtin_rvv_vaadd_vx_tumu:
+ case RISCVVector::BI__builtin_rvv_vaadd_vx_mu:
+ case RISCVVector::BI__builtin_rvv_vasubu_vv_tum:
+ case RISCVVector::BI__builtin_rvv_vasubu_vv_tumu:
+ case RISCVVector::BI__builtin_rvv_vasubu_vv_mu:
+ case RISCVVector::BI__builtin_rvv_vasubu_vx_tum:
+ case RISCVVector::BI__builtin_rvv_vasubu_vx_tumu:
+ case RISCVVector::BI__builtin_rvv_vasubu_vx_mu:
+ case RISCVVector::BI__builtin_rvv_vasub_vv_tum:
+ case RISCVVector::BI__builtin_rvv_vasub_vv_tumu:
+ case RISCVVector::BI__builtin_rvv_vasub_vv_mu:
+ case RISCVVector::BI__builtin_rvv_vasub_vx_tum:
+ case RISCVVector::BI__builtin_rvv_vasub_vx_tumu:
+ case RISCVVector::BI__builtin_rvv_vasub_vx_mu:
+ case RISCVVector::BI__builtin_rvv_vsmul_vv_mu:
+ case RISCVVector::BI__builtin_rvv_vsmul_vx_mu:
+ case RISCVVector::BI__builtin_rvv_vssra_vv_mu:
+ case RISCVVector::BI__builtin_rvv_vssra_vx_mu:
+ case RISCVVector::BI__builtin_rvv_vssrl_vv_mu:
+ case RISCVVector::BI__builtin_rvv_vssrl_vx_mu:
+ case RISCVVector::BI__builtin_rvv_vnclip_wv_mu:
+ case RISCVVector::BI__builtin_rvv_vnclip_wx_mu:
+ case RISCVVector::BI__builtin_rvv_vnclipu_wv_mu:
+ case RISCVVector::BI__builtin_rvv_vnclipu_wx_mu:
+ case RISCVVector::BI__builtin_rvv_vsmul_vv_tum:
+ case RISCVVector::BI__builtin_rvv_vsmul_vx_tum:
+ case RISCVVector::BI__builtin_rvv_vssra_vv_tum:
+ case RISCVVector::BI__builtin_rvv_vssra_vx_tum:
+ case RISCVVector::BI__builtin_rvv_vssrl_vv_tum:
+ case RISCVVector::BI__builtin_rvv_vssrl_vx_tum:
+ case RISCVVector::BI__builtin_rvv_vnclip_wv_tum:
+ case RISCVVector::BI__builtin_rvv_vnclip_wx_tum:
+ case RISCVVector::BI__builtin_rvv_vnclipu_wv_tum:
+ case RISCVVector::BI__builtin_rvv_vnclipu_wx_tum:
+ case RISCVVector::BI__builtin_rvv_vsmul_vv_tumu:
+ case RISCVVector::BI__builtin_rvv_vsmul_vx_tumu:
+ case RISCVVector::BI__builtin_rvv_vssra_vv_tumu:
+ case RISCVVector::BI__builtin_rvv_vssra_vx_tumu:
+ case RISCVVector::BI__builtin_rvv_vssrl_vv_tumu:
+ case RISCVVector::BI__builtin_rvv_vssrl_vx_tumu:
+ case RISCVVector::BI__builtin_rvv_vnclip_wv_tumu:
+ case RISCVVector::BI__builtin_rvv_vnclip_wx_tumu:
+ case RISCVVector::BI__builtin_rvv_vnclipu_wv_tumu:
+ case RISCVVector::BI__builtin_rvv_vnclipu_wx_tumu:
+ return SemaRef.BuiltinConstantArgRange(TheCall, 4, 0, 3);
+ case RISCVVector::BI__builtin_rvv_vfsqrt_v_rm:
+ case RISCVVector::BI__builtin_rvv_vfrec7_v_rm:
+ case RISCVVector::BI__builtin_rvv_vfcvt_x_f_v_rm:
+ case RISCVVector::BI__builtin_rvv_vfcvt_xu_f_v_rm:
+ case RISCVVector::BI__builtin_rvv_vfcvt_f_x_v_rm:
+ case RISCVVector::BI__builtin_rvv_vfcvt_f_xu_v_rm:
+ case RISCVVector::BI__builtin_rvv_vfwcvt_x_f_v_rm:
+ case RISCVVector::BI__builtin_rvv_vfwcvt_xu_f_v_rm:
+ case RISCVVector::BI__builtin_rvv_vfncvt_x_f_w_rm:
+ case RISCVVector::BI__builtin_rvv_vfncvt_xu_f_w_rm:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_x_w_rm:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_xu_w_rm:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_f_w_rm:
+ return SemaRef.BuiltinConstantArgRange(TheCall, 1, 0, 4);
+ case RISCVVector::BI__builtin_rvv_vfadd_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfadd_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfsub_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfsub_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfrsub_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfwadd_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfwadd_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfwsub_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfwsub_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfwadd_wv_rm:
+ case RISCVVector::BI__builtin_rvv_vfwadd_wf_rm:
+ case RISCVVector::BI__builtin_rvv_vfwsub_wv_rm:
+ case RISCVVector::BI__builtin_rvv_vfwsub_wf_rm:
+ case RISCVVector::BI__builtin_rvv_vfmul_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfmul_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfdiv_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfdiv_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfrdiv_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfwmul_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfwmul_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfredosum_vs_rm:
+ case RISCVVector::BI__builtin_rvv_vfredusum_vs_rm:
+ case RISCVVector::BI__builtin_rvv_vfwredosum_vs_rm:
+ case RISCVVector::BI__builtin_rvv_vfwredusum_vs_rm:
+ case RISCVVector::BI__builtin_rvv_vfsqrt_v_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfrec7_v_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfcvt_x_f_v_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfcvt_xu_f_v_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfcvt_f_x_v_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfcvt_f_xu_v_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwcvt_x_f_v_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwcvt_xu_f_v_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfncvt_x_f_w_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfncvt_xu_f_w_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_x_w_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_xu_w_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_f_w_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfsqrt_v_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfrec7_v_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfcvt_x_f_v_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfcvt_xu_f_v_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfcvt_f_x_v_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfcvt_f_xu_v_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwcvt_x_f_v_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwcvt_xu_f_v_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfncvt_x_f_w_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfncvt_xu_f_w_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_x_w_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_xu_w_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_f_w_rm_m:
+ return SemaRef.BuiltinConstantArgRange(TheCall, 2, 0, 4);
+ case RISCVVector::BI__builtin_rvv_vfadd_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfadd_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfsub_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfsub_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfrsub_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwadd_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwadd_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwsub_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwsub_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwadd_wv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwadd_wf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwsub_wv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwsub_wf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfmul_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfmul_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfdiv_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfdiv_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfrdiv_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwmul_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwmul_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfredosum_vs_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfredusum_vs_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwredosum_vs_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwredusum_vs_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfmacc_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfmacc_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfnmacc_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfnmacc_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfmsac_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfmsac_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfnmsac_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfnmsac_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfmadd_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfmadd_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfnmadd_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfnmadd_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfmsub_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfmsub_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfnmsub_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfnmsub_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfwmacc_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfwmacc_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfwnmacc_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfwnmacc_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfwmsac_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfwmsac_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfwnmsac_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfwnmsac_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfmacc_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfmacc_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfnmacc_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfnmacc_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfmsac_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfmsac_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfnmsac_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfnmsac_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfmadd_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfmadd_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfnmadd_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfnmadd_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfmsub_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfmsub_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfnmsub_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfnmsub_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwmacc_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwmacc_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwnmacc_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwnmacc_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwmsac_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwmsac_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwnmsac_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwnmsac_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfadd_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfadd_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfsub_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfsub_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfrsub_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwadd_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwadd_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwsub_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwsub_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwadd_wv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwadd_wf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwsub_wv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwsub_wf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfmul_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfmul_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfdiv_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfdiv_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfrdiv_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwmul_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwmul_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfredosum_vs_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfredusum_vs_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwredosum_vs_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwredusum_vs_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfsqrt_v_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfrec7_v_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfcvt_x_f_v_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfcvt_xu_f_v_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfcvt_f_x_v_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfcvt_f_xu_v_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwcvt_x_f_v_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwcvt_xu_f_v_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfncvt_x_f_w_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfncvt_xu_f_w_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_x_w_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_xu_w_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_f_w_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfsqrt_v_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfrec7_v_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfcvt_x_f_v_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfcvt_xu_f_v_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfcvt_f_x_v_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfcvt_f_xu_v_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwcvt_x_f_v_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwcvt_xu_f_v_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfncvt_x_f_w_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfncvt_xu_f_w_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_x_w_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_xu_w_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_f_w_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfsqrt_v_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfrec7_v_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfcvt_x_f_v_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfcvt_xu_f_v_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfcvt_f_x_v_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfcvt_f_xu_v_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwcvt_x_f_v_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwcvt_xu_f_v_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfncvt_x_f_w_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfncvt_xu_f_w_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_x_w_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_xu_w_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_f_w_rm_mu:
+ return SemaRef.BuiltinConstantArgRange(TheCall, 3, 0, 4);
+ case RISCVVector::BI__builtin_rvv_vfmacc_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfmacc_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfnmacc_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfnmacc_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfmsac_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfmsac_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfnmsac_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfnmsac_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfmadd_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfmadd_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfnmadd_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfnmadd_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfmsub_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfmsub_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfnmsub_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfnmsub_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwmacc_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwmacc_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwnmacc_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwnmacc_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwmsac_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwmsac_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwnmsac_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwnmsac_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfadd_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfadd_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfsub_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfsub_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfrsub_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwadd_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwadd_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwsub_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwsub_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwadd_wv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwadd_wf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwsub_wv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwsub_wf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfmul_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfmul_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfdiv_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfdiv_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfrdiv_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwmul_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwmul_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfmacc_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfmacc_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfnmacc_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfnmacc_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfmsac_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfmsac_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfnmsac_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfnmsac_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfmadd_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfmadd_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfnmadd_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfnmadd_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfmsub_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfmsub_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfnmsub_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfnmsub_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwmacc_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwmacc_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwnmacc_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwnmacc_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwmsac_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwmsac_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwnmsac_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwnmsac_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfredosum_vs_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfredusum_vs_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwredosum_vs_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwredusum_vs_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfadd_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfadd_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfsub_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfsub_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfrsub_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwadd_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwadd_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwsub_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwsub_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwadd_wv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwadd_wf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwsub_wv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwsub_wf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfmul_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfmul_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfdiv_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfdiv_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfrdiv_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwmul_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwmul_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfmacc_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfmacc_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfnmacc_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfnmacc_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfmsac_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfmsac_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfnmsac_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfnmsac_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfmadd_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfmadd_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfnmadd_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfnmadd_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfmsub_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfmsub_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfnmsub_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfnmsub_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwmacc_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwmacc_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwnmacc_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwnmacc_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwmsac_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwmsac_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwnmsac_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwnmsac_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfadd_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfadd_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfsub_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfsub_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfrsub_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwadd_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwadd_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwsub_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwsub_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwadd_wv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwadd_wf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwsub_wv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwsub_wf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfmul_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfmul_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfdiv_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfdiv_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfrdiv_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwmul_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwmul_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfmacc_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfmacc_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfnmacc_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfnmacc_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfmsac_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfmsac_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfnmsac_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfnmsac_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfmadd_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfmadd_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfnmadd_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfnmadd_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfmsub_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfmsub_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfnmsub_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfnmsub_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwmacc_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwmacc_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwnmacc_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwnmacc_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwmsac_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwmsac_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwnmsac_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwnmsac_vf_rm_mu:
+ return SemaRef.BuiltinConstantArgRange(TheCall, 4, 0, 4);
+ case RISCV::BI__builtin_riscv_ntl_load:
+ case RISCV::BI__builtin_riscv_ntl_store:
+ DeclRefExpr *DRE =
+ cast<DeclRefExpr>(TheCall->getCallee()->IgnoreParenCasts());
+ assert((BuiltinID == RISCV::BI__builtin_riscv_ntl_store ||
+ BuiltinID == RISCV::BI__builtin_riscv_ntl_load) &&
+ "Unexpected RISC-V nontemporal load/store builtin!");
+ bool IsStore = BuiltinID == RISCV::BI__builtin_riscv_ntl_store;
+ unsigned NumArgs = IsStore ? 3 : 2;
+
+ if (SemaRef.checkArgCountAtLeast(TheCall, NumArgs - 1))
+ return true;
+
+ if (SemaRef.checkArgCountAtMost(TheCall, NumArgs))
+ return true;
+
+ // Domain value should be compile-time constant.
+ // 2 <= domain <= 5
+ if (TheCall->getNumArgs() == NumArgs &&
+ SemaRef.BuiltinConstantArgRange(TheCall, NumArgs - 1, 2, 5))
+ return true;
+
+ Expr *PointerArg = TheCall->getArg(0);
+ ExprResult PointerArgResult =
+ SemaRef.DefaultFunctionArrayLvalueConversion(PointerArg);
+
+ if (PointerArgResult.isInvalid())
+ return true;
+ PointerArg = PointerArgResult.get();
+
+ const PointerType *PtrType = PointerArg->getType()->getAs<PointerType>();
+ if (!PtrType) {
+ Diag(DRE->getBeginLoc(), diag::err_nontemporal_builtin_must_be_pointer)
+ << PointerArg->getType() << PointerArg->getSourceRange();
+ return true;
+ }
+
+ QualType ValType = PtrType->getPointeeType();
+ ValType = ValType.getUnqualifiedType();
+ if (!ValType->isIntegerType() && !ValType->isAnyPointerType() &&
+ !ValType->isBlockPointerType() && !ValType->isFloatingType() &&
+ !ValType->isVectorType() && !ValType->isRVVSizelessBuiltinType()) {
+ Diag(DRE->getBeginLoc(),
+ diag::err_nontemporal_builtin_must_be_pointer_intfltptr_or_vector)
+ << PointerArg->getType() << PointerArg->getSourceRange();
+ return true;
+ }
+
+ if (!IsStore) {
+ TheCall->setType(ValType);
+ return false;
+ }
+
+ ExprResult ValArg = TheCall->getArg(1);
+ InitializedEntity Entity = InitializedEntity::InitializeParameter(
+ Context, ValType, /*consume*/ false);
+ ValArg =
+ SemaRef.PerformCopyInitialization(Entity, SourceLocation(), ValArg);
+ if (ValArg.isInvalid())
+ return true;
+
+ TheCall->setArg(1, ValArg.get());
+ TheCall->setType(Context.VoidTy);
+ return false;
+ }
+
+ return false;
+}
+
+void SemaRISCV::checkRVVTypeSupport(QualType Ty, SourceLocation Loc, Decl *D,
+ const llvm::StringMap<bool> &FeatureMap) {
+ ASTContext::BuiltinVectorTypeInfo Info =
+ SemaRef.Context.getBuiltinVectorTypeInfo(Ty->castAs<BuiltinType>());
+ unsigned EltSize = SemaRef.Context.getTypeSize(Info.ElementType);
+ unsigned MinElts = Info.EC.getKnownMinValue();
+
+ if (Info.ElementType->isSpecificBuiltinType(BuiltinType::Double) &&
+ !FeatureMap.lookup("zve64d"))
+ Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zve64d";
+ // (ELEN, LMUL) pairs of (8, mf8), (16, mf4), (32, mf2), (64, m1) requires at
+ // least zve64x
+ else if (((EltSize == 64 && Info.ElementType->isIntegerType()) ||
+ MinElts == 1) &&
+ !FeatureMap.lookup("zve64x"))
+ Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zve64x";
+ else if (Info.ElementType->isFloat16Type() && !FeatureMap.lookup("zvfh") &&
+ !FeatureMap.lookup("zvfhmin"))
+ Diag(Loc, diag::err_riscv_type_requires_extension, D)
+ << Ty << "zvfh or zvfhmin";
+ else if (Info.ElementType->isBFloat16Type() &&
+ !FeatureMap.lookup("experimental-zvfbfmin"))
+ Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zvfbfmin";
+ else if (Info.ElementType->isSpecificBuiltinType(BuiltinType::Float) &&
+ !FeatureMap.lookup("zve32f"))
+ Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zve32f";
+ // Given that caller already checked isRVVType() before calling this function,
+  // if we don't have at least zve32x supported, then we need to emit an error.
+ else if (!FeatureMap.lookup("zve32x"))
+ Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zve32x";
+}
+
+/// Are the two types RVV-bitcast-compatible types? I.e. is bitcasting from the
+/// first RVV type (e.g. an RVV scalable type) to the second type (e.g. an RVV
+/// VLS type) allowed?
+///
+/// This will also return false if the two given types do not make sense from
+/// the perspective of RVV bitcasts.
+bool SemaRISCV::isValidRVVBitcast(QualType srcTy, QualType destTy) {
+ assert(srcTy->isVectorType() || destTy->isVectorType());
+
+ auto ValidScalableConversion = [](QualType FirstType, QualType SecondType) {
+ if (!FirstType->isRVVSizelessBuiltinType())
+ return false;
+
+ const auto *VecTy = SecondType->getAs<VectorType>();
+ return VecTy && VecTy->getVectorKind() == VectorKind::RVVFixedLengthData;
+ };
+
+ return ValidScalableConversion(srcTy, destTy) ||
+ ValidScalableConversion(destTy, srcTy);
+}
+
+SemaRISCV::SemaRISCV(Sema &S) : SemaBase(S) {}
+
} // namespace clang
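
The block added to SemaRISCV.cpp above moves the RVV builtin validation into
SemaRISCV::CheckBuiltinFunctionCall. As one concrete illustration, the
fixed-point builtins require the vxrm rounding-mode operand to be an integer
constant in [0, 3] (BuiltinConstantArgRange(TheCall, 2, 0, 3)). A hedged
sketch using the RVV intrinsic names assumed to be provided by
<riscv_vector.h> when targeting a V-enabled RISC-V:

// rvv_vxrm_check.cpp -- e.g. clang++ --target=riscv64 -march=rv64gcv -fsyntax-only
#include <stddef.h>
#include <riscv_vector.h>

vint32m1_t scale(vint32m1_t A, vint32m1_t B, size_t VL) {
  // __RISCV_VXRM_RNU is 0, inside the accepted [0, 3] range.
  return __riscv_vsmul_vv_i32m1(A, B, __RISCV_VXRM_RNU, VL);
  // __riscv_vsmul_vv_i32m1(A, B, 7, VL);  // rejected: vxrm out of [0, 3]
}
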
diff --git a/clang/lib/Sema/SemaStmtAttr.cpp b/clang/lib/Sema/SemaStmtAttr.cpp
index 36f8eca..8735d96 100644
--- a/clang/lib/Sema/SemaStmtAttr.cpp
+++ b/clang/lib/Sema/SemaStmtAttr.cpp
@@ -665,7 +665,8 @@ bool Sema::CheckRebuiltStmtAttributes(ArrayRef<const Attr *> Attrs) {
ExprResult Sema::ActOnCXXAssumeAttr(Stmt *St, const ParsedAttr &A,
SourceRange Range) {
if (A.getNumArgs() != 1 || !A.getArgAsExpr(0)) {
- Diag(A.getLoc(), diag::err_assume_attr_args) << A.getAttrName() << Range;
+ Diag(A.getLoc(), diag::err_attribute_wrong_number_arguments)
+ << A.getAttrName() << 1 << Range;
return ExprError();
}
@@ -682,8 +683,11 @@ ExprResult Sema::ActOnCXXAssumeAttr(Stmt *St, const ParsedAttr &A,
Assumption = Res.get();
}
- if (!getLangOpts().CPlusPlus23)
+ if (!getLangOpts().CPlusPlus23 &&
+ A.getSyntax() == AttributeCommonInfo::AS_CXX11) {
+ llvm::dbgs() << "Syntax: " << int(A.getSyntax()) << "\n";
Diag(A.getLoc(), diag::ext_cxx23_attr) << A << Range;
+ }
return Assumption;
}
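
For context on the SemaStmtAttr hunks: a wrong argument count on [[assume]]
is now reported through the generic wrong-number-of-arguments diagnostic, and
the "C++23 extension" warning is emitted only for the [[...]] spelling. A
minimal sketch of the warning case, assuming a pre-C++23 language mode
(diagnostic wording approximate):

// assume_ext.cpp -- e.g. clang++ -std=c++20 -fsyntax-only
int div32(int X) {
  [[assume(X > 0)]];  // warning: use of the 'assume' attribute is a C++23 extension
  return 32 / X;
}
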
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index 02d9b64..39e9dbe 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -1071,7 +1071,8 @@ NamedDecl *Sema::ActOnTypeParameter(Scope *S, bool Typename,
return Param;
}
- Param->setDefaultArgument(DefaultTInfo);
+ Param->setDefaultArgument(
+ Context, TemplateArgumentLoc(DefaultTInfo->getType(), DefaultTInfo));
}
return Param;
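
The hunk above (together with the related hunks that follow) switches default
template arguments from TypeSourceInfo/Expr storage to TemplateArgumentLoc.
The user-facing construct exercising this path is unchanged; a trivial sketch
for orientation (names illustrative):

// default_targ.cpp -- e.g. clang++ -std=c++17 -fsyntax-only
template <typename T = int>  // default now stored via setDefaultArgument(Context, TemplateArgumentLoc(...))
struct Box { T Value; };

Box<> B;  // 'int' is substituted from the stored TemplateArgumentLoc
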
@@ -1598,7 +1599,9 @@ NamedDecl *Sema::ActOnNonTypeTemplateParameter(Scope *S, Declarator &D,
if (DiagnoseUnexpandedParameterPack(Default, UPPC_DefaultArgument))
return Param;
- Param->setDefaultArgument(Default);
+ Param->setDefaultArgument(
+ Context, getTrivialTemplateArgumentLoc(TemplateArgument(Default),
+ QualType(), SourceLocation()));
}
return Param;
@@ -1839,7 +1842,8 @@ DeclResult Sema::CheckClassTemplate(
TemplateParameterList **OuterTemplateParamLists, SkipBodyInfo *SkipBody) {
assert(TemplateParams && TemplateParams->size() > 0 &&
"No template parameters");
- assert(TUK != TUK_Reference && "Can only declare or define class templates");
+ assert(TUK != TagUseKind::Reference &&
+ "Can only declare or define class templates");
bool Invalid = false;
// Check that we can declare a template here.
@@ -1861,8 +1865,9 @@ DeclResult Sema::CheckClassTemplate(
// C++11 [basic.lookup.elab]p2).
DeclContext *SemanticContext;
LookupResult Previous(*this, Name, NameLoc,
- (SS.isEmpty() && TUK == TUK_Friend)
- ? LookupTagName : LookupOrdinaryName,
+ (SS.isEmpty() && TUK == TagUseKind::Friend)
+ ? LookupTagName
+ : LookupOrdinaryName,
forRedeclarationInCurContext());
if (SS.isNotEmpty() && !SS.isInvalid()) {
SemanticContext = computeDeclContext(SS, true);
@@ -1870,11 +1875,11 @@ DeclResult Sema::CheckClassTemplate(
// FIXME: Horrible, horrible hack! We can't currently represent this
// in the AST, and historically we have just ignored such friend
// class templates, so don't complain here.
- Diag(NameLoc, TUK == TUK_Friend
+ Diag(NameLoc, TUK == TagUseKind::Friend
? diag::warn_template_qualified_friend_ignored
: diag::err_template_qualified_declarator_no_match)
<< SS.getScopeRep() << SS.getRange();
- return TUK != TUK_Friend;
+ return TUK != TagUseKind::Friend;
}
if (RequireCompleteDeclContext(SS, SemanticContext))
@@ -1889,7 +1894,7 @@ DeclResult Sema::CheckClassTemplate(
Invalid = true;
}
- if (TUK != TUK_Friend && TUK != TUK_Reference)
+ if (TUK != TagUseKind::Friend && TUK != TagUseKind::Reference)
diagnoseQualifiedDeclaration(SS, SemanticContext, Name, NameLoc,
/*TemplateId-*/ nullptr,
/*IsMemberSpecialization*/ false);
@@ -1902,7 +1907,7 @@ DeclResult Sema::CheckClassTemplate(
// If T is the name of a class, then each of the following shall have a
// name different from T:
// -- every member template of class T
- if (TUK != TUK_Friend &&
+ if (TUK != TagUseKind::Friend &&
DiagnoseClassNameShadow(SemanticContext,
DeclarationNameInfo(Name, NameLoc)))
return true;
@@ -1944,7 +1949,7 @@ DeclResult Sema::CheckClassTemplate(
}
}
- if (TUK == TUK_Friend) {
+ if (TUK == TagUseKind::Friend) {
// C++ [namespace.memdef]p3:
// [...] When looking for a prior declaration of a class or a function
// declared as a friend, and when the name of the friend class or
@@ -1981,9 +1986,8 @@ DeclResult Sema::CheckClassTemplate(
PrevDecl = (*Previous.begin())->getUnderlyingDecl();
}
}
- } else if (PrevDecl &&
- !isDeclInScope(Previous.getRepresentativeDecl(), SemanticContext,
- S, SS.isValid()))
+ } else if (PrevDecl && !isDeclInScope(Previous.getRepresentativeDecl(),
+ SemanticContext, S, SS.isValid()))
PrevDecl = PrevClassTemplate = nullptr;
if (auto *Shadow = dyn_cast_or_null<UsingShadowDecl>(
@@ -2005,7 +2009,7 @@ DeclResult Sema::CheckClassTemplate(
// Ensure that the template parameter lists are compatible. Skip this check
// for a friend in a dependent context: the template parameter list itself
// could be dependent.
- if (!(TUK == TUK_Friend && CurContext->isDependentContext()) &&
+ if (!(TUK == TagUseKind::Friend && CurContext->isDependentContext()) &&
!TemplateParameterListsAreEqual(
TemplateCompareNewDeclInfo(SemanticContext ? SemanticContext
: CurContext,
@@ -2021,8 +2025,8 @@ DeclResult Sema::CheckClassTemplate(
// the class-key shall agree in kind with the original class
// template declaration (7.1.5.3).
RecordDecl *PrevRecordDecl = PrevClassTemplate->getTemplatedDecl();
- if (!isAcceptableTagRedeclaration(PrevRecordDecl, Kind,
- TUK == TUK_Definition, KWLoc, Name)) {
+ if (!isAcceptableTagRedeclaration(
+ PrevRecordDecl, Kind, TUK == TagUseKind::Definition, KWLoc, Name)) {
Diag(KWLoc, diag::err_use_with_wrong_tag)
<< Name
<< FixItHint::CreateReplacement(KWLoc, PrevRecordDecl->getKindName());
@@ -2031,7 +2035,7 @@ DeclResult Sema::CheckClassTemplate(
}
// Check for redefinition of this class template.
- if (TUK == TUK_Definition) {
+ if (TUK == TagUseKind::Definition) {
if (TagDecl *Def = PrevRecordDecl->getDefinition()) {
// If we have a prior definition that is not visible, treat this as
// simply making that previous definition visible.
@@ -2068,7 +2072,7 @@ DeclResult Sema::CheckClassTemplate(
// merging in the template parameter list from the previous class
// template declaration. Skip this check for a friend in a dependent
// context, because the template parameter list might be dependent.
- if (!(TUK == TUK_Friend && CurContext->isDependentContext()) &&
+ if (!(TUK == TagUseKind::Friend && CurContext->isDependentContext()) &&
CheckTemplateParameterList(
TemplateParams,
PrevClassTemplate ? GetTemplateParameterList(PrevClassTemplate)
@@ -2076,8 +2080,8 @@ DeclResult Sema::CheckClassTemplate(
(SS.isSet() && SemanticContext && SemanticContext->isRecord() &&
SemanticContext->isDependentContext())
? TPC_ClassTemplateMember
- : TUK == TUK_Friend ? TPC_FriendClassTemplate
- : TPC_ClassTemplate,
+ : TUK == TagUseKind::Friend ? TPC_FriendClassTemplate
+ : TPC_ClassTemplate,
SkipBody))
Invalid = true;
@@ -2085,9 +2089,10 @@ DeclResult Sema::CheckClassTemplate(
// If the name of the template was qualified, we must be defining the
// template out-of-line.
if (!SS.isInvalid() && !Invalid && !PrevClassTemplate) {
- Diag(NameLoc, TUK == TUK_Friend ? diag::err_friend_decl_does_not_match
- : diag::err_member_decl_does_not_match)
- << Name << SemanticContext << /*IsDefinition*/true << SS.getRange();
+ Diag(NameLoc, TUK == TagUseKind::Friend
+ ? diag::err_friend_decl_does_not_match
+ : diag::err_member_decl_does_not_match)
+ << Name << SemanticContext << /*IsDefinition*/ true << SS.getRange();
Invalid = true;
}
}
@@ -2097,8 +2102,8 @@ DeclResult Sema::CheckClassTemplate(
// recent declaration tricking the template instantiator to make substitutions
// there.
// FIXME: Figure out how to combine with shouldLinkDependentDeclWithPrevious
- bool ShouldAddRedecl
- = !(TUK == TUK_Friend && CurContext->isDependentContext());
+ bool ShouldAddRedecl =
+ !(TUK == TagUseKind::Friend && CurContext->isDependentContext());
CXXRecordDecl *NewClass =
CXXRecordDecl::Create(Context, Kind, SemanticContext, KWLoc, NameLoc, Name,
@@ -2113,7 +2118,7 @@ DeclResult Sema::CheckClassTemplate(
// Add alignment attributes if necessary; these attributes are checked when
// the ASTContext lays out the structure.
- if (TUK == TUK_Definition && (!SkipBody || !SkipBody->ShouldSkip)) {
+ if (TUK == TagUseKind::Definition && (!SkipBody || !SkipBody->ShouldSkip)) {
AddAlignmentAttributesForRecord(NewClass);
AddMsStructLayoutForRecord(NewClass);
}
@@ -2144,14 +2149,15 @@ DeclResult Sema::CheckClassTemplate(
PrevClassTemplate->setMemberSpecialization();
// Set the access specifier.
- if (!Invalid && TUK != TUK_Friend && NewTemplate->getDeclContext()->isRecord())
+ if (!Invalid && TUK != TagUseKind::Friend &&
+ NewTemplate->getDeclContext()->isRecord())
SetMemberAccessSpecifier(NewTemplate, PrevClassTemplate, AS);
// Set the lexical context of these templates
NewClass->setLexicalDeclContext(CurContext);
NewTemplate->setLexicalDeclContext(CurContext);
- if (TUK == TUK_Definition && (!SkipBody || !SkipBody->ShouldSkip))
+ if (TUK == TagUseKind::Definition && (!SkipBody || !SkipBody->ShouldSkip))
NewClass->startDefinition();
ProcessDeclAttributeList(S, NewClass, Attr);
@@ -2164,7 +2170,7 @@ DeclResult Sema::CheckClassTemplate(
inferGslOwnerPointerAttribute(NewClass);
inferNullableClassAttribute(NewClass);
- if (TUK != TUK_Friend) {
+ if (TUK != TagUseKind::Friend) {
// Per C++ [basic.scope.temp]p2, skip the template parameter scopes.
Scope *Outer = S;
while ((Outer->getFlags() & Scope::TemplateParamScope) != 0)
@@ -2318,11 +2324,11 @@ transformTemplateTypeParam(Sema &SemaRef, DeclContext *DC,
SemaRef.SubstTypeConstraint(NewTTP, TC, Args,
/*EvaluateConstraint=*/true);
if (TTP->hasDefaultArgument()) {
- TypeSourceInfo *InstantiatedDefaultArg =
- SemaRef.SubstType(TTP->getDefaultArgumentInfo(), Args,
- TTP->getDefaultArgumentLoc(), TTP->getDeclName());
- if (InstantiatedDefaultArg)
- NewTTP->setDefaultArgument(InstantiatedDefaultArg);
+ TemplateArgumentLoc InstantiatedDefaultArg;
+ if (!SemaRef.SubstTemplateArgument(
+ TTP->getDefaultArgument(), Args, InstantiatedDefaultArg,
+ TTP->getDefaultArgumentLoc(), TTP->getDeclName()))
+ NewTTP->setDefaultArgument(SemaRef.Context, InstantiatedDefaultArg);
}
SemaRef.CurrentInstantiationScope->InstantiatedLocal(TTP, NewTTP);
return NewTTP;
@@ -3575,10 +3581,9 @@ bool Sema::CheckTemplateParameterList(TemplateParameterList *NewParams,
= dyn_cast<TemplateTypeParmDecl>(*NewParam)) {
// Check the presence of a default argument here.
if (NewTypeParm->hasDefaultArgument() &&
- DiagnoseDefaultTemplateArgument(*this, TPC,
- NewTypeParm->getLocation(),
- NewTypeParm->getDefaultArgumentInfo()->getTypeLoc()
- .getSourceRange()))
+ DiagnoseDefaultTemplateArgument(
+ *this, TPC, NewTypeParm->getLocation(),
+ NewTypeParm->getDefaultArgument().getSourceRange()))
NewTypeParm->removeDefaultArgument();
// Merge default arguments for template type parameters.
@@ -3627,9 +3632,9 @@ bool Sema::CheckTemplateParameterList(TemplateParameterList *NewParams,
// Check the presence of a default argument here.
if (NewNonTypeParm->hasDefaultArgument() &&
- DiagnoseDefaultTemplateArgument(*this, TPC,
- NewNonTypeParm->getLocation(),
- NewNonTypeParm->getDefaultArgument()->getSourceRange())) {
+ DiagnoseDefaultTemplateArgument(
+ *this, TPC, NewNonTypeParm->getLocation(),
+ NewNonTypeParm->getDefaultArgument().getSourceRange())) {
NewNonTypeParm->removeDefaultArgument();
}
@@ -5015,7 +5020,7 @@ TypeResult Sema::ActOnTagTemplateIdType(TagUseKind TUK,
IdentifierInfo *Id = D->getIdentifier();
assert(Id && "templated class must have an identifier");
- if (!isAcceptableTagRedeclaration(D, TagKind, TUK == TUK_Definition,
+ if (!isAcceptableTagRedeclaration(D, TagKind, TUK == TagUseKind::Definition,
TagLoc, Id)) {
Diag(TagLoc, diag::err_use_with_wrong_tag)
<< Result
@@ -6040,22 +6045,26 @@ bool Sema::CheckTemplateTypeArgument(
///
/// \param Converted the list of template arguments provided for template
/// parameters that precede \p Param in the template parameter list.
-/// \returns the substituted template argument, or NULL if an error occurred.
-static TypeSourceInfo *SubstDefaultTemplateArgument(
+///
+/// \param Output the resulting substituted template argument.
+///
+/// \returns true if an error occurred.
+static bool SubstDefaultTemplateArgument(
Sema &SemaRef, TemplateDecl *Template, SourceLocation TemplateLoc,
SourceLocation RAngleLoc, TemplateTypeParmDecl *Param,
ArrayRef<TemplateArgument> SugaredConverted,
- ArrayRef<TemplateArgument> CanonicalConverted) {
- TypeSourceInfo *ArgType = Param->getDefaultArgumentInfo();
+ ArrayRef<TemplateArgument> CanonicalConverted,
+ TemplateArgumentLoc &Output) {
+ Output = Param->getDefaultArgument();
// If the argument type is dependent, instantiate it now based
// on the previously-computed template arguments.
- if (ArgType->getType()->isInstantiationDependentType()) {
+ if (Output.getArgument().isInstantiationDependent()) {
Sema::InstantiatingTemplate Inst(SemaRef, TemplateLoc, Param, Template,
SugaredConverted,
SourceRange(TemplateLoc, RAngleLoc));
if (Inst.isInvalid())
- return nullptr;
+ return true;
// Only substitute for the innermost template argument list.
MultiLevelTemplateArgumentList TemplateArgLists(Template, SugaredConverted,
@@ -6068,12 +6077,14 @@ static TypeSourceInfo *SubstDefaultTemplateArgument(
ForLambdaCallOperator = Rec->isLambda();
Sema::ContextRAII SavedContext(SemaRef, Template->getDeclContext(),
!ForLambdaCallOperator);
- ArgType =
- SemaRef.SubstType(ArgType, TemplateArgLists,
- Param->getDefaultArgumentLoc(), Param->getDeclName());
+
+ if (SemaRef.SubstTemplateArgument(Output, TemplateArgLists, Output,
+ Param->getDefaultArgumentLoc(),
+ Param->getDeclName()))
+ return true;
}
- return ArgType;
+ return false;
}
/// Substitute template arguments into the default template argument for
@@ -6098,16 +6109,17 @@ static TypeSourceInfo *SubstDefaultTemplateArgument(
/// parameters that precede \p Param in the template parameter list.
///
/// \returns the substituted template argument, or NULL if an error occurred.
-static ExprResult SubstDefaultTemplateArgument(
+static bool SubstDefaultTemplateArgument(
Sema &SemaRef, TemplateDecl *Template, SourceLocation TemplateLoc,
SourceLocation RAngleLoc, NonTypeTemplateParmDecl *Param,
ArrayRef<TemplateArgument> SugaredConverted,
- ArrayRef<TemplateArgument> CanonicalConverted) {
+ ArrayRef<TemplateArgument> CanonicalConverted,
+ TemplateArgumentLoc &Output) {
Sema::InstantiatingTemplate Inst(SemaRef, TemplateLoc, Param, Template,
SugaredConverted,
SourceRange(TemplateLoc, RAngleLoc));
if (Inst.isInvalid())
- return ExprError();
+ return true;
// Only substitute for the innermost template argument list.
MultiLevelTemplateArgumentList TemplateArgLists(Template, SugaredConverted,
@@ -6118,7 +6130,8 @@ static ExprResult SubstDefaultTemplateArgument(
Sema::ContextRAII SavedContext(SemaRef, Template->getDeclContext());
EnterExpressionEvaluationContext ConstantEvaluated(
SemaRef, Sema::ExpressionEvaluationContext::ConstantEvaluated);
- return SemaRef.SubstExpr(Param->getDefaultArgument(), TemplateArgLists);
+ return SemaRef.SubstTemplateArgument(Param->getDefaultArgument(),
+ TemplateArgLists, Output);
}
/// Substitute template arguments into the default template argument for
@@ -6196,13 +6209,12 @@ TemplateArgumentLoc Sema::SubstDefaultTemplateArgumentIfAvailable(
return TemplateArgumentLoc();
HasDefaultArg = true;
- TypeSourceInfo *DI = SubstDefaultTemplateArgument(
- *this, Template, TemplateLoc, RAngleLoc, TypeParm, SugaredConverted,
- CanonicalConverted);
- if (DI)
- return TemplateArgumentLoc(TemplateArgument(DI->getType()), DI);
-
- return TemplateArgumentLoc();
+ TemplateArgumentLoc Output;
+ if (SubstDefaultTemplateArgument(*this, Template, TemplateLoc, RAngleLoc,
+ TypeParm, SugaredConverted,
+ CanonicalConverted, Output))
+ return TemplateArgumentLoc();
+ return Output;
}
if (NonTypeTemplateParmDecl *NonTypeParm
@@ -6211,14 +6223,12 @@ TemplateArgumentLoc Sema::SubstDefaultTemplateArgumentIfAvailable(
return TemplateArgumentLoc();
HasDefaultArg = true;
- ExprResult Arg = SubstDefaultTemplateArgument(
- *this, Template, TemplateLoc, RAngleLoc, NonTypeParm, SugaredConverted,
- CanonicalConverted);
- if (Arg.isInvalid())
+ TemplateArgumentLoc Output;
+ if (SubstDefaultTemplateArgument(*this, Template, TemplateLoc, RAngleLoc,
+ NonTypeParm, SugaredConverted,
+ CanonicalConverted, Output))
return TemplateArgumentLoc();
-
- Expr *ArgE = Arg.getAs<Expr>();
- return TemplateArgumentLoc(TemplateArgument(ArgE), ArgE);
+ return Output;
}
TemplateTemplateParmDecl *TempTempParm
@@ -6785,28 +6795,20 @@ bool Sema::CheckTemplateArgumentList(
return diagnoseMissingArgument(*this, TemplateLoc, Template, TTP,
NewArgs);
- TypeSourceInfo *ArgType = SubstDefaultTemplateArgument(
- *this, Template, TemplateLoc, RAngleLoc, TTP, SugaredConverted,
- CanonicalConverted);
- if (!ArgType)
+ if (SubstDefaultTemplateArgument(*this, Template, TemplateLoc, RAngleLoc,
+ TTP, SugaredConverted,
+ CanonicalConverted, Arg))
return true;
-
- Arg = TemplateArgumentLoc(TemplateArgument(ArgType->getType()),
- ArgType);
} else if (NonTypeTemplateParmDecl *NTTP
= dyn_cast<NonTypeTemplateParmDecl>(*Param)) {
if (!hasReachableDefaultArgument(NTTP))
return diagnoseMissingArgument(*this, TemplateLoc, Template, NTTP,
NewArgs);
- ExprResult E = SubstDefaultTemplateArgument(
- *this, Template, TemplateLoc, RAngleLoc, NTTP, SugaredConverted,
- CanonicalConverted);
- if (E.isInvalid())
+ if (SubstDefaultTemplateArgument(*this, Template, TemplateLoc, RAngleLoc,
+ NTTP, SugaredConverted,
+ CanonicalConverted, Arg))
return true;
-
- Expr *Ex = E.getAs<Expr>();
- Arg = TemplateArgumentLoc(TemplateArgument(Ex), Ex);
} else {
TemplateTemplateParmDecl *TempParm
= cast<TemplateTemplateParmDecl>(*Param);
@@ -9451,7 +9453,7 @@ DeclResult Sema::ActOnClassTemplateSpecialization(
SourceLocation ModulePrivateLoc, CXXScopeSpec &SS,
TemplateIdAnnotation &TemplateId, const ParsedAttributesView &Attr,
MultiTemplateParamsArg TemplateParameterLists, SkipBodyInfo *SkipBody) {
- assert(TUK != TUK_Reference && "References are not specializations");
+ assert(TUK != TagUseKind::Reference && "References are not specializations");
SourceLocation TemplateNameLoc = TemplateId.TemplateNameLoc;
SourceLocation LAngleLoc = TemplateId.LAngleLoc;
@@ -9473,7 +9475,7 @@ DeclResult Sema::ActOnClassTemplateSpecialization(
bool isPartialSpecialization = false;
if (SS.isSet()) {
- if (TUK != TUK_Reference && TUK != TUK_Friend &&
+ if (TUK != TagUseKind::Reference && TUK != TagUseKind::Friend &&
diagnoseQualifiedDeclaration(SS, ClassTemplate->getDeclContext(),
ClassTemplate->getDeclName(),
TemplateNameLoc, &TemplateId,
@@ -9488,9 +9490,8 @@ DeclResult Sema::ActOnClassTemplateSpecialization(
bool Invalid = false;
TemplateParameterList *TemplateParams =
MatchTemplateParametersToScopeSpecifier(
- KWLoc, TemplateNameLoc, SS, &TemplateId,
- TemplateParameterLists, TUK == TUK_Friend, isMemberSpecialization,
- Invalid);
+ KWLoc, TemplateNameLoc, SS, &TemplateId, TemplateParameterLists,
+ TUK == TagUseKind::Friend, isMemberSpecialization, Invalid);
if (Invalid)
return true;
@@ -9501,7 +9502,7 @@ DeclResult Sema::ActOnClassTemplateSpecialization(
if (TemplateParams && TemplateParams->size() > 0) {
isPartialSpecialization = true;
- if (TUK == TUK_Friend) {
+ if (TUK == TagUseKind::Friend) {
Diag(KWLoc, diag::err_partial_specialization_friend)
<< SourceRange(LAngleLoc, RAngleLoc);
return true;
@@ -9520,10 +9521,10 @@ DeclResult Sema::ActOnClassTemplateSpecialization(
}
} else if (NonTypeTemplateParmDecl *NTTP
= dyn_cast<NonTypeTemplateParmDecl>(Param)) {
- if (Expr *DefArg = NTTP->getDefaultArgument()) {
+ if (NTTP->hasDefaultArgument()) {
Diag(NTTP->getDefaultArgumentLoc(),
diag::err_default_arg_in_partial_spec)
- << DefArg->getSourceRange();
+ << NTTP->getDefaultArgument().getSourceRange();
NTTP->removeDefaultArgument();
}
} else {
@@ -9537,14 +9538,15 @@ DeclResult Sema::ActOnClassTemplateSpecialization(
}
}
} else if (TemplateParams) {
- if (TUK == TUK_Friend)
+ if (TUK == TagUseKind::Friend)
Diag(KWLoc, diag::err_template_spec_friend)
<< FixItHint::CreateRemoval(
SourceRange(TemplateParams->getTemplateLoc(),
TemplateParams->getRAngleLoc()))
<< SourceRange(LAngleLoc, RAngleLoc);
} else {
- assert(TUK == TUK_Friend && "should have a 'template<>' for this decl");
+ assert(TUK == TagUseKind::Friend &&
+ "should have a 'template<>' for this decl");
}
// Check that the specialization uses the same tag kind as the
@@ -9552,8 +9554,8 @@ DeclResult Sema::ActOnClassTemplateSpecialization(
TagTypeKind Kind = TypeWithKeyword::getTagTypeKindForTypeSpec(TagSpec);
assert(Kind != TagTypeKind::Enum &&
"Invalid enum tag in class template spec!");
- if (!isAcceptableTagRedeclaration(ClassTemplate->getTemplatedDecl(),
- Kind, TUK == TUK_Definition, KWLoc,
+ if (!isAcceptableTagRedeclaration(ClassTemplate->getTemplatedDecl(), Kind,
+ TUK == TagUseKind::Definition, KWLoc,
ClassTemplate->getIdentifier())) {
Diag(KWLoc, diag::err_use_with_wrong_tag)
<< ClassTemplate
@@ -9617,7 +9619,7 @@ DeclResult Sema::ActOnClassTemplateSpecialization(
// Check whether we can declare a class template specialization in
// the current scope.
- if (TUK != TUK_Friend &&
+ if (TUK != TagUseKind::Friend &&
CheckTemplateSpecializationScope(*this, ClassTemplate, PrevDecl,
TemplateNameLoc,
isPartialSpecialization))
@@ -9644,8 +9646,8 @@ DeclResult Sema::ActOnClassTemplateSpecialization(
// This rule has since been removed, because it's redundant given DR1495,
// but we keep it because it produces better diagnostics and recovery.
Diag(TemplateNameLoc, diag::err_partial_spec_args_match_primary_template)
- << /*class template*/0 << (TUK == TUK_Definition)
- << FixItHint::CreateRemoval(SourceRange(LAngleLoc, RAngleLoc));
+ << /*class template*/ 0 << (TUK == TagUseKind::Definition)
+ << FixItHint::CreateRemoval(SourceRange(LAngleLoc, RAngleLoc));
return CheckClassTemplate(S, TagSpec, TUK, KWLoc, SS,
ClassTemplate->getIdentifier(),
TemplateNameLoc,
@@ -9737,11 +9739,11 @@ DeclResult Sema::ActOnClassTemplateSpecialization(
}
// If this is not a friend, note that this is an explicit specialization.
- if (TUK != TUK_Friend)
+ if (TUK != TagUseKind::Friend)
Specialization->setSpecializationKind(TSK_ExplicitSpecialization);
// Check that this isn't a redefinition of this specialization.
- if (TUK == TUK_Definition) {
+ if (TUK == TagUseKind::Definition) {
RecordDecl *Def = Specialization->getDefinition();
NamedDecl *Hidden = nullptr;
if (Def && SkipBody && !hasVisibleDefinition(Def, &Hidden)) {
@@ -9762,7 +9764,7 @@ DeclResult Sema::ActOnClassTemplateSpecialization(
// Add alignment attributes if necessary; these attributes are checked when
// the ASTContext lays out the structure.
- if (TUK == TUK_Definition && (!SkipBody || !SkipBody->ShouldSkip)) {
+ if (TUK == TagUseKind::Definition && (!SkipBody || !SkipBody->ShouldSkip)) {
AddAlignmentAttributesForRecord(Specialization);
AddMsStructLayoutForRecord(Specialization);
}
@@ -9783,10 +9785,10 @@ DeclResult Sema::ActOnClassTemplateSpecialization(
Specialization->setLexicalDeclContext(CurContext);
// We may be starting the definition of this specialization.
- if (TUK == TUK_Definition && (!SkipBody || !SkipBody->ShouldSkip))
+ if (TUK == TagUseKind::Definition && (!SkipBody || !SkipBody->ShouldSkip))
Specialization->startDefinition();
- if (TUK == TUK_Friend) {
+ if (TUK == TagUseKind::Friend) {
// Build the fully-sugared type for this class template
// specialization as the user wrote in the specialization
// itself. This means that we'll pretty-print the type retrieved
@@ -11160,11 +11162,13 @@ Sema::ActOnExplicitInstantiation(Scope *S, SourceLocation ExternLoc,
bool Owned = false;
bool IsDependent = false;
- Decl *TagD = ActOnTag(S, TagSpec, Sema::TUK_Reference, KWLoc, SS, Name,
- NameLoc, Attr, AS_none, /*ModulePrivateLoc=*/SourceLocation(),
+ Decl *TagD =
+ ActOnTag(S, TagSpec, TagUseKind::Reference, KWLoc, SS, Name, NameLoc,
+ Attr, AS_none, /*ModulePrivateLoc=*/SourceLocation(),
MultiTemplateParamsArg(), Owned, IsDependent, SourceLocation(),
false, TypeResult(), /*IsTypeSpecifier*/ false,
- /*IsTemplateParamOrArg*/ false, /*OOK=*/OOK_Outside).get();
+ /*IsTemplateParamOrArg*/ false, /*OOK=*/OOK_Outside)
+ .get();
assert(!IsDependent && "explicit instantiation of dependent name not yet handled");
if (!TagD)
@@ -11695,9 +11699,9 @@ TypeResult Sema::ActOnDependentTag(Scope *S, unsigned TagSpec, TagUseKind TUK,
TagTypeKind Kind = TypeWithKeyword::getTagTypeKindForTypeSpec(TagSpec);
- if (TUK == TUK_Declaration || TUK == TUK_Definition) {
+ if (TUK == TagUseKind::Declaration || TUK == TagUseKind::Definition) {
Diag(NameLoc, diag::err_dependent_tag_decl)
- << (TUK == TUK_Definition) << llvm::to_underlying(Kind)
+ << (TUK == TagUseKind::Definition) << llvm::to_underlying(Kind)
<< SS.getRange();
return true;
}
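For orientation, a minimal example of the construct these Sema hunks manipulate: a class template whose trailing parameters have default arguments that must be substituted when the user omits them. Sketch only; the names below are illustrative and not taken from this patch.

    template <typename T, typename U = T *, int N = sizeof(T)>
    struct Box {};

    Box<int> b; // U and N are omitted, so Sema substitutes the defaults,
                // yielding Box<int, int *, 4> on a typical target; with this
                // change the substituted defaults are carried as
                // TemplateArgumentLoc values rather than TypeSourceInfo*/Expr*.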
diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp
index 41fd210..f9ec341 100644
--- a/clang/lib/Sema/SemaTemplateDeduction.cpp
+++ b/clang/lib/Sema/SemaTemplateDeduction.cpp
@@ -519,18 +519,14 @@ static NamedDecl *getTemplateParameterWithDefault(Sema &S, NamedDecl *A,
switch (A->getKind()) {
case Decl::TemplateTypeParm: {
auto *T = cast<TemplateTypeParmDecl>(A);
- // FIXME: A TemplateTypeParmDecl's DefaultArgument can't hold a full
- // TemplateArgument, so there is currently no way to specify a pack as a
- // default argument for these.
- if (T->isParameterPack())
- return A;
auto *R = TemplateTypeParmDecl::Create(
S.Context, A->getDeclContext(), SourceLocation(), SourceLocation(),
T->getDepth(), T->getIndex(), T->getIdentifier(),
- T->wasDeclaredWithTypename(), /*ParameterPack=*/false,
+ T->wasDeclaredWithTypename(), T->isParameterPack(),
T->hasTypeConstraint());
R->setDefaultArgument(
- S.Context.getTrivialTypeSourceInfo(Default.getAsType()));
+ S.Context,
+ S.getTrivialTemplateArgumentLoc(Default, QualType(), SourceLocation()));
if (R->hasTypeConstraint()) {
auto *C = R->getTypeConstraint();
R->setTypeConstraint(C->getConceptReference(),
@@ -540,14 +536,14 @@ static NamedDecl *getTemplateParameterWithDefault(Sema &S, NamedDecl *A,
}
case Decl::NonTypeTemplateParm: {
auto *T = cast<NonTypeTemplateParmDecl>(A);
- // FIXME: Ditto, as above for TemplateTypeParm case.
- if (T->isParameterPack())
- return A;
auto *R = NonTypeTemplateParmDecl::Create(
S.Context, A->getDeclContext(), SourceLocation(), SourceLocation(),
T->getDepth(), T->getIndex(), T->getIdentifier(), T->getType(),
- /*ParameterPack=*/false, T->getTypeSourceInfo());
- R->setDefaultArgument(Default.getAsExpr());
+ T->isParameterPack(), T->getTypeSourceInfo());
+ R->setDefaultArgument(S.Context,
+ S.getTrivialTemplateArgumentLoc(
+ Default, Default.getNonTypeTemplateArgumentType(),
+ SourceLocation()));
if (auto *PTC = T->getPlaceholderTypeConstraint())
R->setPlaceholderTypeConstraint(PTC);
return R;
@@ -4776,8 +4772,13 @@ TemplateDeductionResult Sema::DeduceTemplateArguments(
DeduceReturnType(Specialization, Info.getLocation(), false))
return TemplateDeductionResult::MiscellaneousDeductionFailure;
+ // [C++26][expr.const]/p17
+ // An expression or conversion is immediate-escalating if it is not initially
+ // in an immediate function context and it is [...]
+ // a potentially-evaluated id-expression that denotes an immediate function.
if (IsAddressOfFunction && getLangOpts().CPlusPlus20 &&
Specialization->isImmediateEscalating() &&
+ parentEvaluationContext().isPotentiallyEvaluated() &&
CheckIfFunctionSpecializationIsImmediate(Specialization,
Info.getLocation()))
return TemplateDeductionResult::MiscellaneousDeductionFailure;
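The new condition mirrors the quoted [expr.const] wording: naming a consteval (immediate) function only escalates when the id-expression is potentially evaluated. A sketch of both sides of that line, assuming the quoted C++26 wording (illustrative, not from this patch):

    consteval int twice(auto x) { return 2 * x; }

    // Potentially evaluated: deducing twice<int> for the address-of names an
    // immediate function outside an immediate function context, so the
    // expression is immediate-escalating and is diagnosed.
    int (*fp)(int) = &twice;

    // Unevaluated operand: not potentially evaluated, so the added
    // isPotentiallyEvaluated() condition lets the deduction through.
    static_assert(sizeof(static_cast<int (*)(int)>(&twice)) != 0);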
diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp
index 0762605..abb8a26 100644
--- a/clang/lib/Sema/SemaTemplateInstantiate.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp
@@ -1619,11 +1619,6 @@ namespace {
case TemplateArgument::Pack:
// Literally rewrite the template argument pack, instead of unpacking
// it.
- assert(
- SemaRef.CodeSynthesisContexts.back().Kind ==
- Sema::CodeSynthesisContext::BuildingDeductionGuides &&
- "Transforming a template argument pack is only allowed in building "
- "deduction guide");
for (auto &pack : Arg.getPackAsArray()) {
TemplateArgumentLoc Input = SemaRef.getTrivialTemplateArgumentLoc(
pack, QualType(), SourceLocation{});
@@ -4375,9 +4370,9 @@ Sema::SubstStmt(Stmt *S, const MultiLevelTemplateArgumentList &TemplateArgs) {
bool Sema::SubstTemplateArgument(
const TemplateArgumentLoc &Input,
const MultiLevelTemplateArgumentList &TemplateArgs,
- TemplateArgumentLoc &Output) {
- TemplateInstantiator Instantiator(*this, TemplateArgs, SourceLocation(),
- DeclarationName());
+ TemplateArgumentLoc &Output, SourceLocation Loc,
+ const DeclarationName &Entity) {
+ TemplateInstantiator Instantiator(*this, TemplateArgs, Loc, Entity);
return Instantiator.TransformTemplateArgument(Input, Output);
}
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index 381d79b2..bb49aae 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -2956,11 +2956,10 @@ Decl *TemplateDeclInstantiator::VisitTemplateTypeParmDecl(
}
}
if (D->hasDefaultArgument() && !D->defaultArgumentWasInherited()) {
- TypeSourceInfo *InstantiatedDefaultArg =
- SemaRef.SubstType(D->getDefaultArgumentInfo(), TemplateArgs,
- D->getDefaultArgumentLoc(), D->getDeclName());
- if (InstantiatedDefaultArg)
- Inst->setDefaultArgument(InstantiatedDefaultArg);
+ TemplateArgumentLoc Output;
+ if (!SemaRef.SubstTemplateArgument(D->getDefaultArgument(), TemplateArgs,
+ Output))
+ Inst->setDefaultArgument(SemaRef.getASTContext(), Output);
}
// Introduce this template parameter's instantiation into the instantiation
@@ -3124,9 +3123,10 @@ Decl *TemplateDeclInstantiator::VisitNonTypeTemplateParmDecl(
if (D->hasDefaultArgument() && !D->defaultArgumentWasInherited()) {
EnterExpressionEvaluationContext ConstantEvaluated(
SemaRef, Sema::ExpressionEvaluationContext::ConstantEvaluated);
- ExprResult Value = SemaRef.SubstExpr(D->getDefaultArgument(), TemplateArgs);
- if (!Value.isInvalid())
- Param->setDefaultArgument(Value.get());
+ TemplateArgumentLoc Result;
+ if (!SemaRef.SubstTemplateArgument(D->getDefaultArgument(), TemplateArgs,
+ Result))
+ Param->setDefaultArgument(SemaRef.Context, Result);
}
// Introduce this template parameter's instantiation into the instantiation
@@ -5055,6 +5055,7 @@ void Sema::InstantiateFunctionDefinition(SourceLocation PointOfInstantiation,
Function->setLocation(PatternDecl->getLocation());
Function->setInnerLocStart(PatternDecl->getInnerLocStart());
Function->setRangeEnd(PatternDecl->getEndLoc());
+ Function->setDeclarationNameLoc(PatternDecl->getNameInfo().getInfo());
EnterExpressionEvaluationContext EvalContext(
*this, Sema::ExpressionEvaluationContext::PotentiallyEvaluated);
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index c19c8cc..ef0b6b7 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -9345,9 +9345,9 @@ BuildTypeCoupledDecls(Expr *E,
Decls.push_back(TypeCoupledDeclRefInfo(CountDecl, /*IsDref*/ false));
}
-QualType Sema::BuildCountAttributedArrayType(QualType WrappedTy,
- Expr *CountExpr) {
- assert(WrappedTy->isIncompleteArrayType());
+QualType Sema::BuildCountAttributedArrayOrPointerType(QualType WrappedTy,
+ Expr *CountExpr) {
+ assert(WrappedTy->isIncompleteArrayType() || WrappedTy->isPointerType());
llvm::SmallVector<TypeCoupledDeclRefInfo, 1> Decls;
BuildTypeCoupledDecls(CountExpr, Decls);
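BuildCountAttributedArrayOrPointerType now also accepts a pointer wrappee, not just an incomplete array. A hypothetical illustration of the two shapes of counted_by-style annotation this suggests; the pointer spelling is an assumption and is not confirmed by this hunk:

    struct packet {
      int len;
      char payload[] __attribute__((counted_by(len))); /* incomplete array member */
    };

    struct view {
      int len;
      char *data __attribute__((counted_by(len))); /* pointer form, assuming the
                                                      attribute is accepted here */
    };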
diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp
new file mode 100644
index 0000000..ffac1af
--- /dev/null
+++ b/clang/lib/Sema/SemaX86.cpp
@@ -0,0 +1,878 @@
+//===------ SemaX86.cpp ---------- X86 target-specific routines -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements semantic analysis functions specific to X86.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Sema/SemaX86.h"
+#include "clang/Basic/DiagnosticSema.h"
+#include "clang/Basic/TargetBuiltins.h"
+#include "clang/Sema/Sema.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/TargetParser/Triple.h"
+#include <bitset>
+
+namespace clang {
+
+SemaX86::SemaX86(Sema &S) : SemaBase(S) {}
+
+// Check if the rounding mode is legal.
+bool SemaX86::CheckBuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) {
+ // Indicates if this instruction has rounding control or just SAE.
+ bool HasRC = false;
+
+ unsigned ArgNum = 0;
+ switch (BuiltinID) {
+ default:
+ return false;
+ case X86::BI__builtin_ia32_vcvttsd2si32:
+ case X86::BI__builtin_ia32_vcvttsd2si64:
+ case X86::BI__builtin_ia32_vcvttsd2usi32:
+ case X86::BI__builtin_ia32_vcvttsd2usi64:
+ case X86::BI__builtin_ia32_vcvttss2si32:
+ case X86::BI__builtin_ia32_vcvttss2si64:
+ case X86::BI__builtin_ia32_vcvttss2usi32:
+ case X86::BI__builtin_ia32_vcvttss2usi64:
+ case X86::BI__builtin_ia32_vcvttsh2si32:
+ case X86::BI__builtin_ia32_vcvttsh2si64:
+ case X86::BI__builtin_ia32_vcvttsh2usi32:
+ case X86::BI__builtin_ia32_vcvttsh2usi64:
+ ArgNum = 1;
+ break;
+ case X86::BI__builtin_ia32_maxpd512:
+ case X86::BI__builtin_ia32_maxps512:
+ case X86::BI__builtin_ia32_minpd512:
+ case X86::BI__builtin_ia32_minps512:
+ case X86::BI__builtin_ia32_maxph512:
+ case X86::BI__builtin_ia32_minph512:
+ ArgNum = 2;
+ break;
+ case X86::BI__builtin_ia32_vcvtph2pd512_mask:
+ case X86::BI__builtin_ia32_vcvtph2psx512_mask:
+ case X86::BI__builtin_ia32_cvtps2pd512_mask:
+ case X86::BI__builtin_ia32_cvttpd2dq512_mask:
+ case X86::BI__builtin_ia32_cvttpd2qq512_mask:
+ case X86::BI__builtin_ia32_cvttpd2udq512_mask:
+ case X86::BI__builtin_ia32_cvttpd2uqq512_mask:
+ case X86::BI__builtin_ia32_cvttps2dq512_mask:
+ case X86::BI__builtin_ia32_cvttps2qq512_mask:
+ case X86::BI__builtin_ia32_cvttps2udq512_mask:
+ case X86::BI__builtin_ia32_cvttps2uqq512_mask:
+ case X86::BI__builtin_ia32_vcvttph2w512_mask:
+ case X86::BI__builtin_ia32_vcvttph2uw512_mask:
+ case X86::BI__builtin_ia32_vcvttph2dq512_mask:
+ case X86::BI__builtin_ia32_vcvttph2udq512_mask:
+ case X86::BI__builtin_ia32_vcvttph2qq512_mask:
+ case X86::BI__builtin_ia32_vcvttph2uqq512_mask:
+ case X86::BI__builtin_ia32_getexppd512_mask:
+ case X86::BI__builtin_ia32_getexpps512_mask:
+ case X86::BI__builtin_ia32_getexpph512_mask:
+ case X86::BI__builtin_ia32_vcomisd:
+ case X86::BI__builtin_ia32_vcomiss:
+ case X86::BI__builtin_ia32_vcomish:
+ case X86::BI__builtin_ia32_vcvtph2ps512_mask:
+ ArgNum = 3;
+ break;
+ case X86::BI__builtin_ia32_cmppd512_mask:
+ case X86::BI__builtin_ia32_cmpps512_mask:
+ case X86::BI__builtin_ia32_cmpsd_mask:
+ case X86::BI__builtin_ia32_cmpss_mask:
+ case X86::BI__builtin_ia32_cmpsh_mask:
+ case X86::BI__builtin_ia32_vcvtsh2sd_round_mask:
+ case X86::BI__builtin_ia32_vcvtsh2ss_round_mask:
+ case X86::BI__builtin_ia32_cvtss2sd_round_mask:
+ case X86::BI__builtin_ia32_getexpsd128_round_mask:
+ case X86::BI__builtin_ia32_getexpss128_round_mask:
+ case X86::BI__builtin_ia32_getexpsh128_round_mask:
+ case X86::BI__builtin_ia32_getmantpd512_mask:
+ case X86::BI__builtin_ia32_getmantps512_mask:
+ case X86::BI__builtin_ia32_getmantph512_mask:
+ case X86::BI__builtin_ia32_maxsd_round_mask:
+ case X86::BI__builtin_ia32_maxss_round_mask:
+ case X86::BI__builtin_ia32_maxsh_round_mask:
+ case X86::BI__builtin_ia32_minsd_round_mask:
+ case X86::BI__builtin_ia32_minss_round_mask:
+ case X86::BI__builtin_ia32_minsh_round_mask:
+ case X86::BI__builtin_ia32_reducepd512_mask:
+ case X86::BI__builtin_ia32_reduceps512_mask:
+ case X86::BI__builtin_ia32_reduceph512_mask:
+ case X86::BI__builtin_ia32_rndscalepd_mask:
+ case X86::BI__builtin_ia32_rndscaleps_mask:
+ case X86::BI__builtin_ia32_rndscaleph_mask:
+ ArgNum = 4;
+ break;
+ case X86::BI__builtin_ia32_fixupimmpd512_mask:
+ case X86::BI__builtin_ia32_fixupimmpd512_maskz:
+ case X86::BI__builtin_ia32_fixupimmps512_mask:
+ case X86::BI__builtin_ia32_fixupimmps512_maskz:
+ case X86::BI__builtin_ia32_fixupimmsd_mask:
+ case X86::BI__builtin_ia32_fixupimmsd_maskz:
+ case X86::BI__builtin_ia32_fixupimmss_mask:
+ case X86::BI__builtin_ia32_fixupimmss_maskz:
+ case X86::BI__builtin_ia32_getmantsd_round_mask:
+ case X86::BI__builtin_ia32_getmantss_round_mask:
+ case X86::BI__builtin_ia32_getmantsh_round_mask:
+ case X86::BI__builtin_ia32_rangepd512_mask:
+ case X86::BI__builtin_ia32_rangeps512_mask:
+ case X86::BI__builtin_ia32_rangesd128_round_mask:
+ case X86::BI__builtin_ia32_rangess128_round_mask:
+ case X86::BI__builtin_ia32_reducesd_mask:
+ case X86::BI__builtin_ia32_reducess_mask:
+ case X86::BI__builtin_ia32_reducesh_mask:
+ case X86::BI__builtin_ia32_rndscalesd_round_mask:
+ case X86::BI__builtin_ia32_rndscaless_round_mask:
+ case X86::BI__builtin_ia32_rndscalesh_round_mask:
+ ArgNum = 5;
+ break;
+ case X86::BI__builtin_ia32_vcvtsd2si64:
+ case X86::BI__builtin_ia32_vcvtsd2si32:
+ case X86::BI__builtin_ia32_vcvtsd2usi32:
+ case X86::BI__builtin_ia32_vcvtsd2usi64:
+ case X86::BI__builtin_ia32_vcvtss2si32:
+ case X86::BI__builtin_ia32_vcvtss2si64:
+ case X86::BI__builtin_ia32_vcvtss2usi32:
+ case X86::BI__builtin_ia32_vcvtss2usi64:
+ case X86::BI__builtin_ia32_vcvtsh2si32:
+ case X86::BI__builtin_ia32_vcvtsh2si64:
+ case X86::BI__builtin_ia32_vcvtsh2usi32:
+ case X86::BI__builtin_ia32_vcvtsh2usi64:
+ case X86::BI__builtin_ia32_sqrtpd512:
+ case X86::BI__builtin_ia32_sqrtps512:
+ case X86::BI__builtin_ia32_sqrtph512:
+ ArgNum = 1;
+ HasRC = true;
+ break;
+ case X86::BI__builtin_ia32_addph512:
+ case X86::BI__builtin_ia32_divph512:
+ case X86::BI__builtin_ia32_mulph512:
+ case X86::BI__builtin_ia32_subph512:
+ case X86::BI__builtin_ia32_addpd512:
+ case X86::BI__builtin_ia32_addps512:
+ case X86::BI__builtin_ia32_divpd512:
+ case X86::BI__builtin_ia32_divps512:
+ case X86::BI__builtin_ia32_mulpd512:
+ case X86::BI__builtin_ia32_mulps512:
+ case X86::BI__builtin_ia32_subpd512:
+ case X86::BI__builtin_ia32_subps512:
+ case X86::BI__builtin_ia32_cvtsi2sd64:
+ case X86::BI__builtin_ia32_cvtsi2ss32:
+ case X86::BI__builtin_ia32_cvtsi2ss64:
+ case X86::BI__builtin_ia32_cvtusi2sd64:
+ case X86::BI__builtin_ia32_cvtusi2ss32:
+ case X86::BI__builtin_ia32_cvtusi2ss64:
+ case X86::BI__builtin_ia32_vcvtusi2sh:
+ case X86::BI__builtin_ia32_vcvtusi642sh:
+ case X86::BI__builtin_ia32_vcvtsi2sh:
+ case X86::BI__builtin_ia32_vcvtsi642sh:
+ ArgNum = 2;
+ HasRC = true;
+ break;
+ case X86::BI__builtin_ia32_cvtdq2ps512_mask:
+ case X86::BI__builtin_ia32_cvtudq2ps512_mask:
+ case X86::BI__builtin_ia32_vcvtpd2ph512_mask:
+ case X86::BI__builtin_ia32_vcvtps2phx512_mask:
+ case X86::BI__builtin_ia32_cvtpd2ps512_mask:
+ case X86::BI__builtin_ia32_cvtpd2dq512_mask:
+ case X86::BI__builtin_ia32_cvtpd2qq512_mask:
+ case X86::BI__builtin_ia32_cvtpd2udq512_mask:
+ case X86::BI__builtin_ia32_cvtpd2uqq512_mask:
+ case X86::BI__builtin_ia32_cvtps2dq512_mask:
+ case X86::BI__builtin_ia32_cvtps2qq512_mask:
+ case X86::BI__builtin_ia32_cvtps2udq512_mask:
+ case X86::BI__builtin_ia32_cvtps2uqq512_mask:
+ case X86::BI__builtin_ia32_cvtqq2pd512_mask:
+ case X86::BI__builtin_ia32_cvtqq2ps512_mask:
+ case X86::BI__builtin_ia32_cvtuqq2pd512_mask:
+ case X86::BI__builtin_ia32_cvtuqq2ps512_mask:
+ case X86::BI__builtin_ia32_vcvtdq2ph512_mask:
+ case X86::BI__builtin_ia32_vcvtudq2ph512_mask:
+ case X86::BI__builtin_ia32_vcvtw2ph512_mask:
+ case X86::BI__builtin_ia32_vcvtuw2ph512_mask:
+ case X86::BI__builtin_ia32_vcvtph2w512_mask:
+ case X86::BI__builtin_ia32_vcvtph2uw512_mask:
+ case X86::BI__builtin_ia32_vcvtph2dq512_mask:
+ case X86::BI__builtin_ia32_vcvtph2udq512_mask:
+ case X86::BI__builtin_ia32_vcvtph2qq512_mask:
+ case X86::BI__builtin_ia32_vcvtph2uqq512_mask:
+ case X86::BI__builtin_ia32_vcvtqq2ph512_mask:
+ case X86::BI__builtin_ia32_vcvtuqq2ph512_mask:
+ ArgNum = 3;
+ HasRC = true;
+ break;
+ case X86::BI__builtin_ia32_addsh_round_mask:
+ case X86::BI__builtin_ia32_addss_round_mask:
+ case X86::BI__builtin_ia32_addsd_round_mask:
+ case X86::BI__builtin_ia32_divsh_round_mask:
+ case X86::BI__builtin_ia32_divss_round_mask:
+ case X86::BI__builtin_ia32_divsd_round_mask:
+ case X86::BI__builtin_ia32_mulsh_round_mask:
+ case X86::BI__builtin_ia32_mulss_round_mask:
+ case X86::BI__builtin_ia32_mulsd_round_mask:
+ case X86::BI__builtin_ia32_subsh_round_mask:
+ case X86::BI__builtin_ia32_subss_round_mask:
+ case X86::BI__builtin_ia32_subsd_round_mask:
+ case X86::BI__builtin_ia32_scalefph512_mask:
+ case X86::BI__builtin_ia32_scalefpd512_mask:
+ case X86::BI__builtin_ia32_scalefps512_mask:
+ case X86::BI__builtin_ia32_scalefsd_round_mask:
+ case X86::BI__builtin_ia32_scalefss_round_mask:
+ case X86::BI__builtin_ia32_scalefsh_round_mask:
+ case X86::BI__builtin_ia32_cvtsd2ss_round_mask:
+ case X86::BI__builtin_ia32_vcvtss2sh_round_mask:
+ case X86::BI__builtin_ia32_vcvtsd2sh_round_mask:
+ case X86::BI__builtin_ia32_sqrtsd_round_mask:
+ case X86::BI__builtin_ia32_sqrtss_round_mask:
+ case X86::BI__builtin_ia32_sqrtsh_round_mask:
+ case X86::BI__builtin_ia32_vfmaddsd3_mask:
+ case X86::BI__builtin_ia32_vfmaddsd3_maskz:
+ case X86::BI__builtin_ia32_vfmaddsd3_mask3:
+ case X86::BI__builtin_ia32_vfmaddss3_mask:
+ case X86::BI__builtin_ia32_vfmaddss3_maskz:
+ case X86::BI__builtin_ia32_vfmaddss3_mask3:
+ case X86::BI__builtin_ia32_vfmaddsh3_mask:
+ case X86::BI__builtin_ia32_vfmaddsh3_maskz:
+ case X86::BI__builtin_ia32_vfmaddsh3_mask3:
+ case X86::BI__builtin_ia32_vfmaddpd512_mask:
+ case X86::BI__builtin_ia32_vfmaddpd512_maskz:
+ case X86::BI__builtin_ia32_vfmaddpd512_mask3:
+ case X86::BI__builtin_ia32_vfmsubpd512_mask3:
+ case X86::BI__builtin_ia32_vfmaddps512_mask:
+ case X86::BI__builtin_ia32_vfmaddps512_maskz:
+ case X86::BI__builtin_ia32_vfmaddps512_mask3:
+ case X86::BI__builtin_ia32_vfmsubps512_mask3:
+ case X86::BI__builtin_ia32_vfmaddph512_mask:
+ case X86::BI__builtin_ia32_vfmaddph512_maskz:
+ case X86::BI__builtin_ia32_vfmaddph512_mask3:
+ case X86::BI__builtin_ia32_vfmsubph512_mask3:
+ case X86::BI__builtin_ia32_vfmaddsubpd512_mask:
+ case X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
+ case X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
+ case X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
+ case X86::BI__builtin_ia32_vfmaddsubps512_mask:
+ case X86::BI__builtin_ia32_vfmaddsubps512_maskz:
+ case X86::BI__builtin_ia32_vfmaddsubps512_mask3:
+ case X86::BI__builtin_ia32_vfmsubaddps512_mask3:
+ case X86::BI__builtin_ia32_vfmaddsubph512_mask:
+ case X86::BI__builtin_ia32_vfmaddsubph512_maskz:
+ case X86::BI__builtin_ia32_vfmaddsubph512_mask3:
+ case X86::BI__builtin_ia32_vfmsubaddph512_mask3:
+ case X86::BI__builtin_ia32_vfmaddcsh_mask:
+ case X86::BI__builtin_ia32_vfmaddcsh_round_mask:
+ case X86::BI__builtin_ia32_vfmaddcsh_round_mask3:
+ case X86::BI__builtin_ia32_vfmaddcph512_mask:
+ case X86::BI__builtin_ia32_vfmaddcph512_maskz:
+ case X86::BI__builtin_ia32_vfmaddcph512_mask3:
+ case X86::BI__builtin_ia32_vfcmaddcsh_mask:
+ case X86::BI__builtin_ia32_vfcmaddcsh_round_mask:
+ case X86::BI__builtin_ia32_vfcmaddcsh_round_mask3:
+ case X86::BI__builtin_ia32_vfcmaddcph512_mask:
+ case X86::BI__builtin_ia32_vfcmaddcph512_maskz:
+ case X86::BI__builtin_ia32_vfcmaddcph512_mask3:
+ case X86::BI__builtin_ia32_vfmulcsh_mask:
+ case X86::BI__builtin_ia32_vfmulcph512_mask:
+ case X86::BI__builtin_ia32_vfcmulcsh_mask:
+ case X86::BI__builtin_ia32_vfcmulcph512_mask:
+ ArgNum = 4;
+ HasRC = true;
+ break;
+ }
+
+ llvm::APSInt Result;
+
+ // We can't check the value of a dependent argument.
+ Expr *Arg = TheCall->getArg(ArgNum);
+ if (Arg->isTypeDependent() || Arg->isValueDependent())
+ return false;
+
+ // Check constant-ness first.
+ if (SemaRef.BuiltinConstantArg(TheCall, ArgNum, Result))
+ return true;
+
+  // Make sure the rounding mode is either ROUND_CUR_DIRECTION or has the
+  // ROUND_NO_EXC bit set. If the intrinsic has rounding control (bits 1:0),
+  // make sure it is only combined with ROUND_NO_EXC. If the intrinsic has no
+  // rounding control, allow ROUND_NO_EXC and ROUND_CUR_DIRECTION together.
+ if (Result == 4 /*ROUND_CUR_DIRECTION*/ || Result == 8 /*ROUND_NO_EXC*/ ||
+ (!HasRC && Result == 12 /*ROUND_CUR_DIRECTION|ROUND_NO_EXC*/) ||
+ (HasRC && Result.getZExtValue() >= 8 && Result.getZExtValue() <= 11))
+ return false;
+
+ return Diag(TheCall->getBeginLoc(), diag::err_x86_builtin_invalid_rounding)
+ << Arg->getSourceRange();
+}
+
+// Check if the gather/scatter scale is legal.
+bool SemaX86::CheckBuiltinGatherScatterScale(unsigned BuiltinID,
+ CallExpr *TheCall) {
+ unsigned ArgNum = 0;
+ switch (BuiltinID) {
+ default:
+ return false;
+ case X86::BI__builtin_ia32_gatherd_pd:
+ case X86::BI__builtin_ia32_gatherd_pd256:
+ case X86::BI__builtin_ia32_gatherq_pd:
+ case X86::BI__builtin_ia32_gatherq_pd256:
+ case X86::BI__builtin_ia32_gatherd_ps:
+ case X86::BI__builtin_ia32_gatherd_ps256:
+ case X86::BI__builtin_ia32_gatherq_ps:
+ case X86::BI__builtin_ia32_gatherq_ps256:
+ case X86::BI__builtin_ia32_gatherd_q:
+ case X86::BI__builtin_ia32_gatherd_q256:
+ case X86::BI__builtin_ia32_gatherq_q:
+ case X86::BI__builtin_ia32_gatherq_q256:
+ case X86::BI__builtin_ia32_gatherd_d:
+ case X86::BI__builtin_ia32_gatherd_d256:
+ case X86::BI__builtin_ia32_gatherq_d:
+ case X86::BI__builtin_ia32_gatherq_d256:
+ case X86::BI__builtin_ia32_gather3div2df:
+ case X86::BI__builtin_ia32_gather3div2di:
+ case X86::BI__builtin_ia32_gather3div4df:
+ case X86::BI__builtin_ia32_gather3div4di:
+ case X86::BI__builtin_ia32_gather3div4sf:
+ case X86::BI__builtin_ia32_gather3div4si:
+ case X86::BI__builtin_ia32_gather3div8sf:
+ case X86::BI__builtin_ia32_gather3div8si:
+ case X86::BI__builtin_ia32_gather3siv2df:
+ case X86::BI__builtin_ia32_gather3siv2di:
+ case X86::BI__builtin_ia32_gather3siv4df:
+ case X86::BI__builtin_ia32_gather3siv4di:
+ case X86::BI__builtin_ia32_gather3siv4sf:
+ case X86::BI__builtin_ia32_gather3siv4si:
+ case X86::BI__builtin_ia32_gather3siv8sf:
+ case X86::BI__builtin_ia32_gather3siv8si:
+ case X86::BI__builtin_ia32_gathersiv8df:
+ case X86::BI__builtin_ia32_gathersiv16sf:
+ case X86::BI__builtin_ia32_gatherdiv8df:
+ case X86::BI__builtin_ia32_gatherdiv16sf:
+ case X86::BI__builtin_ia32_gathersiv8di:
+ case X86::BI__builtin_ia32_gathersiv16si:
+ case X86::BI__builtin_ia32_gatherdiv8di:
+ case X86::BI__builtin_ia32_gatherdiv16si:
+ case X86::BI__builtin_ia32_scatterdiv2df:
+ case X86::BI__builtin_ia32_scatterdiv2di:
+ case X86::BI__builtin_ia32_scatterdiv4df:
+ case X86::BI__builtin_ia32_scatterdiv4di:
+ case X86::BI__builtin_ia32_scatterdiv4sf:
+ case X86::BI__builtin_ia32_scatterdiv4si:
+ case X86::BI__builtin_ia32_scatterdiv8sf:
+ case X86::BI__builtin_ia32_scatterdiv8si:
+ case X86::BI__builtin_ia32_scattersiv2df:
+ case X86::BI__builtin_ia32_scattersiv2di:
+ case X86::BI__builtin_ia32_scattersiv4df:
+ case X86::BI__builtin_ia32_scattersiv4di:
+ case X86::BI__builtin_ia32_scattersiv4sf:
+ case X86::BI__builtin_ia32_scattersiv4si:
+ case X86::BI__builtin_ia32_scattersiv8sf:
+ case X86::BI__builtin_ia32_scattersiv8si:
+ case X86::BI__builtin_ia32_scattersiv8df:
+ case X86::BI__builtin_ia32_scattersiv16sf:
+ case X86::BI__builtin_ia32_scatterdiv8df:
+ case X86::BI__builtin_ia32_scatterdiv16sf:
+ case X86::BI__builtin_ia32_scattersiv8di:
+ case X86::BI__builtin_ia32_scattersiv16si:
+ case X86::BI__builtin_ia32_scatterdiv8di:
+ case X86::BI__builtin_ia32_scatterdiv16si:
+ ArgNum = 4;
+ break;
+ }
+
+ llvm::APSInt Result;
+
+ // We can't check the value of a dependent argument.
+ Expr *Arg = TheCall->getArg(ArgNum);
+ if (Arg->isTypeDependent() || Arg->isValueDependent())
+ return false;
+
+ // Check constant-ness first.
+ if (SemaRef.BuiltinConstantArg(TheCall, ArgNum, Result))
+ return true;
+
+ if (Result == 1 || Result == 2 || Result == 4 || Result == 8)
+ return false;
+
+ return Diag(TheCall->getBeginLoc(), diag::err_x86_builtin_invalid_scale)
+ << Arg->getSourceRange();
+}
+
+enum { TileRegLow = 0, TileRegHigh = 7 };
+
+bool SemaX86::CheckBuiltinTileArgumentsRange(CallExpr *TheCall,
+ ArrayRef<int> ArgNums) {
+ for (int ArgNum : ArgNums) {
+ if (SemaRef.BuiltinConstantArgRange(TheCall, ArgNum, TileRegLow,
+ TileRegHigh))
+ return true;
+ }
+ return false;
+}
+
+bool SemaX86::CheckBuiltinTileDuplicate(CallExpr *TheCall,
+ ArrayRef<int> ArgNums) {
+  // Because the maximum number of tile registers is TileRegHigh + 1, we use a
+  // bitset with one bit per register to track which registers have been used.
+ std::bitset<TileRegHigh + 1> ArgValues;
+ for (int ArgNum : ArgNums) {
+ Expr *Arg = TheCall->getArg(ArgNum);
+ if (Arg->isTypeDependent() || Arg->isValueDependent())
+ continue;
+
+ llvm::APSInt Result;
+ if (SemaRef.BuiltinConstantArg(TheCall, ArgNum, Result))
+ return true;
+ int ArgExtValue = Result.getExtValue();
+ assert((ArgExtValue >= TileRegLow && ArgExtValue <= TileRegHigh) &&
+ "Incorrect tile register num.");
+ if (ArgValues.test(ArgExtValue))
+ return Diag(TheCall->getBeginLoc(),
+ diag::err_x86_builtin_tile_arg_duplicate)
+ << TheCall->getArg(ArgNum)->getSourceRange();
+ ArgValues.set(ArgExtValue);
+ }
+ return false;
+}
+
+bool SemaX86::CheckBuiltinTileRangeAndDuplicate(CallExpr *TheCall,
+ ArrayRef<int> ArgNums) {
+ return CheckBuiltinTileArgumentsRange(TheCall, ArgNums) ||
+ CheckBuiltinTileDuplicate(TheCall, ArgNums);
+}
+
+bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) {
+ switch (BuiltinID) {
+ default:
+ return false;
+ case X86::BI__builtin_ia32_tileloadd64:
+ case X86::BI__builtin_ia32_tileloaddt164:
+ case X86::BI__builtin_ia32_tilestored64:
+ case X86::BI__builtin_ia32_tilezero:
+ return CheckBuiltinTileArgumentsRange(TheCall, 0);
+ case X86::BI__builtin_ia32_tdpbssd:
+ case X86::BI__builtin_ia32_tdpbsud:
+ case X86::BI__builtin_ia32_tdpbusd:
+ case X86::BI__builtin_ia32_tdpbuud:
+ case X86::BI__builtin_ia32_tdpbf16ps:
+ case X86::BI__builtin_ia32_tdpfp16ps:
+ case X86::BI__builtin_ia32_tcmmimfp16ps:
+ case X86::BI__builtin_ia32_tcmmrlfp16ps:
+ return CheckBuiltinTileRangeAndDuplicate(TheCall, {0, 1, 2});
+ }
+}
+static bool isX86_32Builtin(unsigned BuiltinID) {
+ // These builtins only work on x86-32 targets.
+ switch (BuiltinID) {
+ case X86::BI__builtin_ia32_readeflags_u32:
+ case X86::BI__builtin_ia32_writeeflags_u32:
+ return true;
+ }
+
+ return false;
+}
+
+bool SemaX86::CheckBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
+ CallExpr *TheCall) {
+ // Check for 32-bit only builtins on a 64-bit target.
+ const llvm::Triple &TT = TI.getTriple();
+ if (TT.getArch() != llvm::Triple::x86 && isX86_32Builtin(BuiltinID))
+ return Diag(TheCall->getCallee()->getBeginLoc(),
+ diag::err_32_bit_builtin_64_bit_tgt);
+
+  // If the intrinsic has rounding or SAE, make sure it is valid.
+ if (CheckBuiltinRoundingOrSAE(BuiltinID, TheCall))
+ return true;
+
+  // If the intrinsic has a gather/scatter scale immediate, make sure it's valid.
+ if (CheckBuiltinGatherScatterScale(BuiltinID, TheCall))
+ return true;
+
+  // If the intrinsic has tile arguments, make sure they are valid.
+ if (CheckBuiltinTileArguments(BuiltinID, TheCall))
+ return true;
+
+ // For intrinsics which take an immediate value as part of the instruction,
+ // range check them here.
+ int i = 0, l = 0, u = 0;
+ switch (BuiltinID) {
+ default:
+ return false;
+ case X86::BI__builtin_ia32_vec_ext_v2si:
+ case X86::BI__builtin_ia32_vec_ext_v2di:
+ case X86::BI__builtin_ia32_vextractf128_pd256:
+ case X86::BI__builtin_ia32_vextractf128_ps256:
+ case X86::BI__builtin_ia32_vextractf128_si256:
+ case X86::BI__builtin_ia32_extract128i256:
+ case X86::BI__builtin_ia32_extractf64x4_mask:
+ case X86::BI__builtin_ia32_extracti64x4_mask:
+ case X86::BI__builtin_ia32_extractf32x8_mask:
+ case X86::BI__builtin_ia32_extracti32x8_mask:
+ case X86::BI__builtin_ia32_extractf64x2_256_mask:
+ case X86::BI__builtin_ia32_extracti64x2_256_mask:
+ case X86::BI__builtin_ia32_extractf32x4_256_mask:
+ case X86::BI__builtin_ia32_extracti32x4_256_mask:
+ i = 1;
+ l = 0;
+ u = 1;
+ break;
+ case X86::BI__builtin_ia32_vec_set_v2di:
+ case X86::BI__builtin_ia32_vinsertf128_pd256:
+ case X86::BI__builtin_ia32_vinsertf128_ps256:
+ case X86::BI__builtin_ia32_vinsertf128_si256:
+ case X86::BI__builtin_ia32_insert128i256:
+ case X86::BI__builtin_ia32_insertf32x8:
+ case X86::BI__builtin_ia32_inserti32x8:
+ case X86::BI__builtin_ia32_insertf64x4:
+ case X86::BI__builtin_ia32_inserti64x4:
+ case X86::BI__builtin_ia32_insertf64x2_256:
+ case X86::BI__builtin_ia32_inserti64x2_256:
+ case X86::BI__builtin_ia32_insertf32x4_256:
+ case X86::BI__builtin_ia32_inserti32x4_256:
+ i = 2;
+ l = 0;
+ u = 1;
+ break;
+ case X86::BI__builtin_ia32_vpermilpd:
+ case X86::BI__builtin_ia32_vec_ext_v4hi:
+ case X86::BI__builtin_ia32_vec_ext_v4si:
+ case X86::BI__builtin_ia32_vec_ext_v4sf:
+ case X86::BI__builtin_ia32_vec_ext_v4di:
+ case X86::BI__builtin_ia32_extractf32x4_mask:
+ case X86::BI__builtin_ia32_extracti32x4_mask:
+ case X86::BI__builtin_ia32_extractf64x2_512_mask:
+ case X86::BI__builtin_ia32_extracti64x2_512_mask:
+ i = 1;
+ l = 0;
+ u = 3;
+ break;
+ case X86::BI_mm_prefetch:
+ case X86::BI__builtin_ia32_vec_ext_v8hi:
+ case X86::BI__builtin_ia32_vec_ext_v8si:
+ i = 1;
+ l = 0;
+ u = 7;
+ break;
+ case X86::BI__builtin_ia32_sha1rnds4:
+ case X86::BI__builtin_ia32_blendpd:
+ case X86::BI__builtin_ia32_shufpd:
+ case X86::BI__builtin_ia32_vec_set_v4hi:
+ case X86::BI__builtin_ia32_vec_set_v4si:
+ case X86::BI__builtin_ia32_vec_set_v4di:
+ case X86::BI__builtin_ia32_shuf_f32x4_256:
+ case X86::BI__builtin_ia32_shuf_f64x2_256:
+ case X86::BI__builtin_ia32_shuf_i32x4_256:
+ case X86::BI__builtin_ia32_shuf_i64x2_256:
+ case X86::BI__builtin_ia32_insertf64x2_512:
+ case X86::BI__builtin_ia32_inserti64x2_512:
+ case X86::BI__builtin_ia32_insertf32x4:
+ case X86::BI__builtin_ia32_inserti32x4:
+ i = 2;
+ l = 0;
+ u = 3;
+ break;
+ case X86::BI__builtin_ia32_vpermil2pd:
+ case X86::BI__builtin_ia32_vpermil2pd256:
+ case X86::BI__builtin_ia32_vpermil2ps:
+ case X86::BI__builtin_ia32_vpermil2ps256:
+ i = 3;
+ l = 0;
+ u = 3;
+ break;
+ case X86::BI__builtin_ia32_cmpb128_mask:
+ case X86::BI__builtin_ia32_cmpw128_mask:
+ case X86::BI__builtin_ia32_cmpd128_mask:
+ case X86::BI__builtin_ia32_cmpq128_mask:
+ case X86::BI__builtin_ia32_cmpb256_mask:
+ case X86::BI__builtin_ia32_cmpw256_mask:
+ case X86::BI__builtin_ia32_cmpd256_mask:
+ case X86::BI__builtin_ia32_cmpq256_mask:
+ case X86::BI__builtin_ia32_cmpb512_mask:
+ case X86::BI__builtin_ia32_cmpw512_mask:
+ case X86::BI__builtin_ia32_cmpd512_mask:
+ case X86::BI__builtin_ia32_cmpq512_mask:
+ case X86::BI__builtin_ia32_ucmpb128_mask:
+ case X86::BI__builtin_ia32_ucmpw128_mask:
+ case X86::BI__builtin_ia32_ucmpd128_mask:
+ case X86::BI__builtin_ia32_ucmpq128_mask:
+ case X86::BI__builtin_ia32_ucmpb256_mask:
+ case X86::BI__builtin_ia32_ucmpw256_mask:
+ case X86::BI__builtin_ia32_ucmpd256_mask:
+ case X86::BI__builtin_ia32_ucmpq256_mask:
+ case X86::BI__builtin_ia32_ucmpb512_mask:
+ case X86::BI__builtin_ia32_ucmpw512_mask:
+ case X86::BI__builtin_ia32_ucmpd512_mask:
+ case X86::BI__builtin_ia32_ucmpq512_mask:
+ case X86::BI__builtin_ia32_vpcomub:
+ case X86::BI__builtin_ia32_vpcomuw:
+ case X86::BI__builtin_ia32_vpcomud:
+ case X86::BI__builtin_ia32_vpcomuq:
+ case X86::BI__builtin_ia32_vpcomb:
+ case X86::BI__builtin_ia32_vpcomw:
+ case X86::BI__builtin_ia32_vpcomd:
+ case X86::BI__builtin_ia32_vpcomq:
+ case X86::BI__builtin_ia32_vec_set_v8hi:
+ case X86::BI__builtin_ia32_vec_set_v8si:
+ i = 2;
+ l = 0;
+ u = 7;
+ break;
+ case X86::BI__builtin_ia32_vpermilpd256:
+ case X86::BI__builtin_ia32_roundps:
+ case X86::BI__builtin_ia32_roundpd:
+ case X86::BI__builtin_ia32_roundps256:
+ case X86::BI__builtin_ia32_roundpd256:
+ case X86::BI__builtin_ia32_getmantpd128_mask:
+ case X86::BI__builtin_ia32_getmantpd256_mask:
+ case X86::BI__builtin_ia32_getmantps128_mask:
+ case X86::BI__builtin_ia32_getmantps256_mask:
+ case X86::BI__builtin_ia32_getmantpd512_mask:
+ case X86::BI__builtin_ia32_getmantps512_mask:
+ case X86::BI__builtin_ia32_getmantph128_mask:
+ case X86::BI__builtin_ia32_getmantph256_mask:
+ case X86::BI__builtin_ia32_getmantph512_mask:
+ case X86::BI__builtin_ia32_vec_ext_v16qi:
+ case X86::BI__builtin_ia32_vec_ext_v16hi:
+ i = 1;
+ l = 0;
+ u = 15;
+ break;
+ case X86::BI__builtin_ia32_pblendd128:
+ case X86::BI__builtin_ia32_blendps:
+ case X86::BI__builtin_ia32_blendpd256:
+ case X86::BI__builtin_ia32_shufpd256:
+ case X86::BI__builtin_ia32_roundss:
+ case X86::BI__builtin_ia32_roundsd:
+ case X86::BI__builtin_ia32_rangepd128_mask:
+ case X86::BI__builtin_ia32_rangepd256_mask:
+ case X86::BI__builtin_ia32_rangepd512_mask:
+ case X86::BI__builtin_ia32_rangeps128_mask:
+ case X86::BI__builtin_ia32_rangeps256_mask:
+ case X86::BI__builtin_ia32_rangeps512_mask:
+ case X86::BI__builtin_ia32_getmantsd_round_mask:
+ case X86::BI__builtin_ia32_getmantss_round_mask:
+ case X86::BI__builtin_ia32_getmantsh_round_mask:
+ case X86::BI__builtin_ia32_vec_set_v16qi:
+ case X86::BI__builtin_ia32_vec_set_v16hi:
+ i = 2;
+ l = 0;
+ u = 15;
+ break;
+ case X86::BI__builtin_ia32_vec_ext_v32qi:
+ i = 1;
+ l = 0;
+ u = 31;
+ break;
+ case X86::BI__builtin_ia32_cmpps:
+ case X86::BI__builtin_ia32_cmpss:
+ case X86::BI__builtin_ia32_cmppd:
+ case X86::BI__builtin_ia32_cmpsd:
+ case X86::BI__builtin_ia32_cmpps256:
+ case X86::BI__builtin_ia32_cmppd256:
+ case X86::BI__builtin_ia32_cmpps128_mask:
+ case X86::BI__builtin_ia32_cmppd128_mask:
+ case X86::BI__builtin_ia32_cmpps256_mask:
+ case X86::BI__builtin_ia32_cmppd256_mask:
+ case X86::BI__builtin_ia32_cmpps512_mask:
+ case X86::BI__builtin_ia32_cmppd512_mask:
+ case X86::BI__builtin_ia32_cmpsd_mask:
+ case X86::BI__builtin_ia32_cmpss_mask:
+ case X86::BI__builtin_ia32_vec_set_v32qi:
+ i = 2;
+ l = 0;
+ u = 31;
+ break;
+ case X86::BI__builtin_ia32_permdf256:
+ case X86::BI__builtin_ia32_permdi256:
+ case X86::BI__builtin_ia32_permdf512:
+ case X86::BI__builtin_ia32_permdi512:
+ case X86::BI__builtin_ia32_vpermilps:
+ case X86::BI__builtin_ia32_vpermilps256:
+ case X86::BI__builtin_ia32_vpermilpd512:
+ case X86::BI__builtin_ia32_vpermilps512:
+ case X86::BI__builtin_ia32_pshufd:
+ case X86::BI__builtin_ia32_pshufd256:
+ case X86::BI__builtin_ia32_pshufd512:
+ case X86::BI__builtin_ia32_pshufhw:
+ case X86::BI__builtin_ia32_pshufhw256:
+ case X86::BI__builtin_ia32_pshufhw512:
+ case X86::BI__builtin_ia32_pshuflw:
+ case X86::BI__builtin_ia32_pshuflw256:
+ case X86::BI__builtin_ia32_pshuflw512:
+ case X86::BI__builtin_ia32_vcvtps2ph:
+ case X86::BI__builtin_ia32_vcvtps2ph_mask:
+ case X86::BI__builtin_ia32_vcvtps2ph256:
+ case X86::BI__builtin_ia32_vcvtps2ph256_mask:
+ case X86::BI__builtin_ia32_vcvtps2ph512_mask:
+ case X86::BI__builtin_ia32_rndscaleps_128_mask:
+ case X86::BI__builtin_ia32_rndscalepd_128_mask:
+ case X86::BI__builtin_ia32_rndscaleps_256_mask:
+ case X86::BI__builtin_ia32_rndscalepd_256_mask:
+ case X86::BI__builtin_ia32_rndscaleps_mask:
+ case X86::BI__builtin_ia32_rndscalepd_mask:
+ case X86::BI__builtin_ia32_rndscaleph_mask:
+ case X86::BI__builtin_ia32_reducepd128_mask:
+ case X86::BI__builtin_ia32_reducepd256_mask:
+ case X86::BI__builtin_ia32_reducepd512_mask:
+ case X86::BI__builtin_ia32_reduceps128_mask:
+ case X86::BI__builtin_ia32_reduceps256_mask:
+ case X86::BI__builtin_ia32_reduceps512_mask:
+ case X86::BI__builtin_ia32_reduceph128_mask:
+ case X86::BI__builtin_ia32_reduceph256_mask:
+ case X86::BI__builtin_ia32_reduceph512_mask:
+ case X86::BI__builtin_ia32_prold512:
+ case X86::BI__builtin_ia32_prolq512:
+ case X86::BI__builtin_ia32_prold128:
+ case X86::BI__builtin_ia32_prold256:
+ case X86::BI__builtin_ia32_prolq128:
+ case X86::BI__builtin_ia32_prolq256:
+ case X86::BI__builtin_ia32_prord512:
+ case X86::BI__builtin_ia32_prorq512:
+ case X86::BI__builtin_ia32_prord128:
+ case X86::BI__builtin_ia32_prord256:
+ case X86::BI__builtin_ia32_prorq128:
+ case X86::BI__builtin_ia32_prorq256:
+ case X86::BI__builtin_ia32_fpclasspd128_mask:
+ case X86::BI__builtin_ia32_fpclasspd256_mask:
+ case X86::BI__builtin_ia32_fpclassps128_mask:
+ case X86::BI__builtin_ia32_fpclassps256_mask:
+ case X86::BI__builtin_ia32_fpclassps512_mask:
+ case X86::BI__builtin_ia32_fpclasspd512_mask:
+ case X86::BI__builtin_ia32_fpclassph128_mask:
+ case X86::BI__builtin_ia32_fpclassph256_mask:
+ case X86::BI__builtin_ia32_fpclassph512_mask:
+ case X86::BI__builtin_ia32_fpclasssd_mask:
+ case X86::BI__builtin_ia32_fpclassss_mask:
+ case X86::BI__builtin_ia32_fpclasssh_mask:
+ case X86::BI__builtin_ia32_pslldqi128_byteshift:
+ case X86::BI__builtin_ia32_pslldqi256_byteshift:
+ case X86::BI__builtin_ia32_pslldqi512_byteshift:
+ case X86::BI__builtin_ia32_psrldqi128_byteshift:
+ case X86::BI__builtin_ia32_psrldqi256_byteshift:
+ case X86::BI__builtin_ia32_psrldqi512_byteshift:
+ case X86::BI__builtin_ia32_kshiftliqi:
+ case X86::BI__builtin_ia32_kshiftlihi:
+ case X86::BI__builtin_ia32_kshiftlisi:
+ case X86::BI__builtin_ia32_kshiftlidi:
+ case X86::BI__builtin_ia32_kshiftriqi:
+ case X86::BI__builtin_ia32_kshiftrihi:
+ case X86::BI__builtin_ia32_kshiftrisi:
+ case X86::BI__builtin_ia32_kshiftridi:
+ i = 1;
+ l = 0;
+ u = 255;
+ break;
+ case X86::BI__builtin_ia32_vperm2f128_pd256:
+ case X86::BI__builtin_ia32_vperm2f128_ps256:
+ case X86::BI__builtin_ia32_vperm2f128_si256:
+ case X86::BI__builtin_ia32_permti256:
+ case X86::BI__builtin_ia32_pblendw128:
+ case X86::BI__builtin_ia32_pblendw256:
+ case X86::BI__builtin_ia32_blendps256:
+ case X86::BI__builtin_ia32_pblendd256:
+ case X86::BI__builtin_ia32_palignr128:
+ case X86::BI__builtin_ia32_palignr256:
+ case X86::BI__builtin_ia32_palignr512:
+ case X86::BI__builtin_ia32_alignq512:
+ case X86::BI__builtin_ia32_alignd512:
+ case X86::BI__builtin_ia32_alignd128:
+ case X86::BI__builtin_ia32_alignd256:
+ case X86::BI__builtin_ia32_alignq128:
+ case X86::BI__builtin_ia32_alignq256:
+ case X86::BI__builtin_ia32_vcomisd:
+ case X86::BI__builtin_ia32_vcomiss:
+ case X86::BI__builtin_ia32_shuf_f32x4:
+ case X86::BI__builtin_ia32_shuf_f64x2:
+ case X86::BI__builtin_ia32_shuf_i32x4:
+ case X86::BI__builtin_ia32_shuf_i64x2:
+ case X86::BI__builtin_ia32_shufpd512:
+ case X86::BI__builtin_ia32_shufps:
+ case X86::BI__builtin_ia32_shufps256:
+ case X86::BI__builtin_ia32_shufps512:
+ case X86::BI__builtin_ia32_dbpsadbw128:
+ case X86::BI__builtin_ia32_dbpsadbw256:
+ case X86::BI__builtin_ia32_dbpsadbw512:
+ case X86::BI__builtin_ia32_vpshldd128:
+ case X86::BI__builtin_ia32_vpshldd256:
+ case X86::BI__builtin_ia32_vpshldd512:
+ case X86::BI__builtin_ia32_vpshldq128:
+ case X86::BI__builtin_ia32_vpshldq256:
+ case X86::BI__builtin_ia32_vpshldq512:
+ case X86::BI__builtin_ia32_vpshldw128:
+ case X86::BI__builtin_ia32_vpshldw256:
+ case X86::BI__builtin_ia32_vpshldw512:
+ case X86::BI__builtin_ia32_vpshrdd128:
+ case X86::BI__builtin_ia32_vpshrdd256:
+ case X86::BI__builtin_ia32_vpshrdd512:
+ case X86::BI__builtin_ia32_vpshrdq128:
+ case X86::BI__builtin_ia32_vpshrdq256:
+ case X86::BI__builtin_ia32_vpshrdq512:
+ case X86::BI__builtin_ia32_vpshrdw128:
+ case X86::BI__builtin_ia32_vpshrdw256:
+ case X86::BI__builtin_ia32_vpshrdw512:
+ i = 2;
+ l = 0;
+ u = 255;
+ break;
+ case X86::BI__builtin_ia32_fixupimmpd512_mask:
+ case X86::BI__builtin_ia32_fixupimmpd512_maskz:
+ case X86::BI__builtin_ia32_fixupimmps512_mask:
+ case X86::BI__builtin_ia32_fixupimmps512_maskz:
+ case X86::BI__builtin_ia32_fixupimmsd_mask:
+ case X86::BI__builtin_ia32_fixupimmsd_maskz:
+ case X86::BI__builtin_ia32_fixupimmss_mask:
+ case X86::BI__builtin_ia32_fixupimmss_maskz:
+ case X86::BI__builtin_ia32_fixupimmpd128_mask:
+ case X86::BI__builtin_ia32_fixupimmpd128_maskz:
+ case X86::BI__builtin_ia32_fixupimmpd256_mask:
+ case X86::BI__builtin_ia32_fixupimmpd256_maskz:
+ case X86::BI__builtin_ia32_fixupimmps128_mask:
+ case X86::BI__builtin_ia32_fixupimmps128_maskz:
+ case X86::BI__builtin_ia32_fixupimmps256_mask:
+ case X86::BI__builtin_ia32_fixupimmps256_maskz:
+ case X86::BI__builtin_ia32_pternlogd512_mask:
+ case X86::BI__builtin_ia32_pternlogd512_maskz:
+ case X86::BI__builtin_ia32_pternlogq512_mask:
+ case X86::BI__builtin_ia32_pternlogq512_maskz:
+ case X86::BI__builtin_ia32_pternlogd128_mask:
+ case X86::BI__builtin_ia32_pternlogd128_maskz:
+ case X86::BI__builtin_ia32_pternlogd256_mask:
+ case X86::BI__builtin_ia32_pternlogd256_maskz:
+ case X86::BI__builtin_ia32_pternlogq128_mask:
+ case X86::BI__builtin_ia32_pternlogq128_maskz:
+ case X86::BI__builtin_ia32_pternlogq256_mask:
+ case X86::BI__builtin_ia32_pternlogq256_maskz:
+ case X86::BI__builtin_ia32_vsm3rnds2:
+ i = 3;
+ l = 0;
+ u = 255;
+ break;
+ case X86::BI__builtin_ia32_reducesd_mask:
+ case X86::BI__builtin_ia32_reducess_mask:
+ case X86::BI__builtin_ia32_rndscalesd_round_mask:
+ case X86::BI__builtin_ia32_rndscaless_round_mask:
+ case X86::BI__builtin_ia32_rndscalesh_round_mask:
+ case X86::BI__builtin_ia32_reducesh_mask:
+ i = 4;
+ l = 0;
+ u = 255;
+ break;
+ case X86::BI__builtin_ia32_cmpccxadd32:
+ case X86::BI__builtin_ia32_cmpccxadd64:
+ i = 3;
+ l = 0;
+ u = 15;
+ break;
+ }
+
+  // Note that we don't force a hard error on the range check here, allowing
+  // template-generated or macro-generated dead code to potentially have
+  // out-of-range values. Such code still has to code-generate, but it doesn't
+  // have to make sense. We use a warning that defaults to an error.
+ return SemaRef.BuiltinConstantArgRange(TheCall, i, l, u,
+ /*RangeIsError*/ false);
+}
+
+} // namespace clang
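A rough sketch of what CheckBuiltinRoundingOrSAE ends up accepting and rejecting for a builtin with embedded rounding control; the <immintrin.h> spelling below is the usual user-facing wrapper, not something introduced by this patch.

    #include <immintrin.h>

    __m512d ok(__m512d v) {
      // 8 == _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC: a rounding mode
      // combined with SAE, which falls in the accepted [8, 11] range for
      // builtins that have rounding control (HasRC).
      return _mm512_sqrt_round_pd(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    }

    __m512d bad(__m512d v) {
      // A rounding mode without _MM_FROUND_NO_EXC is neither CUR_DIRECTION (4)
      // nor in [8, 11], so it is rejected with err_x86_builtin_invalid_rounding.
      return _mm512_sqrt_round_pd(v, _MM_FROUND_TO_NEAREST_INT);
    }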
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 6b53c24..dee335b 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -7337,7 +7337,7 @@ QualType TreeTransform<Derived>::TransformCountAttributedType(
if (getDerived().AlwaysRebuild() || InnerTy != OldTy->desugar() ||
OldCount != NewCount) {
// Currently, CountAttributedType can only wrap incomplete array types.
- Result = SemaRef.BuildCountAttributedArrayType(InnerTy, NewCount);
+ Result = SemaRef.BuildCountAttributedArrayOrPointerType(InnerTy, NewCount);
}
TLB.push<CountAttributedTypeLoc>(Result);
@@ -14114,6 +14114,13 @@ TreeTransform<Derived>::TransformCXXTemporaryObjectExpr(
if (TransformExprs(E->getArgs(), E->getNumArgs(), true, Args,
&ArgumentChanged))
return ExprError();
+
+ if (E->isListInitialization() && !E->isStdInitListInitialization()) {
+ ExprResult Res = RebuildInitList(E->getBeginLoc(), Args, E->getEndLoc());
+ if (Res.isInvalid())
+ return ExprError();
+ Args = {Res.get()};
+ }
}
if (!getDerived().AlwaysRebuild() &&
@@ -14125,12 +14132,9 @@ TreeTransform<Derived>::TransformCXXTemporaryObjectExpr(
return SemaRef.MaybeBindToTemporary(E);
}
- // FIXME: We should just pass E->isListInitialization(), but we're not
- // prepared to handle list-initialization without a child InitListExpr.
SourceLocation LParenLoc = T->getTypeLoc().getEndLoc();
return getDerived().RebuildCXXTemporaryObjectExpr(
- T, LParenLoc, Args, E->getEndLoc(),
- /*ListInitialization=*/LParenLoc.isInvalid());
+ T, LParenLoc, Args, E->getEndLoc(), E->isListInitialization());
}
template<typename Derived>
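The TransformCXXTemporaryObjectExpr change above keeps list-initialization intact when a template is instantiated, rebuilding the InitListExpr rather than inferring the flag from an invalid LParenLoc. A minimal sketch of the kind of source that goes through this path (illustrative only):

    struct Pair { int a, b; };

    template <typename T>
    Pair make() {
      // A temporary object written with braces; on instantiation the transform
      // now rebuilds {1, 2} as an InitListExpr and passes
      // E->isListInitialization() through unchanged.
      return Pair{1, 2};
    }

    Pair p = make<int>();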
diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
index a6254b7..61cc99d 100644
--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++ b/clang/lib/Serialization/ASTReaderDecl.cpp
@@ -2695,7 +2695,8 @@ void ASTDeclReader::VisitTemplateTypeParmDecl(TemplateTypeParmDecl *D) {
}
if (Record.readInt())
- D->setDefaultArgument(readTypeSourceInfo());
+ D->setDefaultArgument(Reader.getContext(),
+ Record.readTemplateArgumentLoc());
}
void ASTDeclReader::VisitNonTypeTemplateParmDecl(NonTypeTemplateParmDecl *D) {
@@ -2716,7 +2717,8 @@ void ASTDeclReader::VisitNonTypeTemplateParmDecl(NonTypeTemplateParmDecl *D) {
// Rest of NonTypeTemplateParmDecl.
D->ParameterPack = Record.readInt();
if (Record.readInt())
- D->setDefaultArgument(Record.readExpr());
+ D->setDefaultArgument(Reader.getContext(),
+ Record.readTemplateArgumentLoc());
}
}
diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp
index c2f1d1b..bbd16db 100644
--- a/clang/lib/Serialization/ASTWriterDecl.cpp
+++ b/clang/lib/Serialization/ASTWriterDecl.cpp
@@ -1899,7 +1899,7 @@ void ASTDeclWriter::VisitTemplateTypeParmDecl(TemplateTypeParmDecl *D) {
!D->defaultArgumentWasInherited();
Record.push_back(OwnsDefaultArg);
if (OwnsDefaultArg)
- Record.AddTypeSourceInfo(D->getDefaultArgumentInfo());
+ Record.AddTemplateArgumentLoc(D->getDefaultArgument());
if (!TC && !OwnsDefaultArg &&
D->getDeclContext() == D->getLexicalDeclContext() &&
@@ -1941,7 +1941,7 @@ void ASTDeclWriter::VisitNonTypeTemplateParmDecl(NonTypeTemplateParmDecl *D) {
!D->defaultArgumentWasInherited();
Record.push_back(OwnsDefaultArg);
if (OwnsDefaultArg)
- Record.AddStmt(D->getDefaultArgument());
+ Record.AddTemplateArgumentLoc(D->getDefaultArgument());
Code = serialization::DECL_NON_TYPE_TEMPLATE_PARM;
}
}
diff --git a/clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt b/clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt
index 4443ffd..cd5a3bd 100644
--- a/clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt
+++ b/clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt
@@ -96,13 +96,14 @@ add_clang_library(clangStaticAnalyzerCheckers
PointerSortingChecker.cpp
PointerSubChecker.cpp
PthreadLockChecker.cpp
- cert/PutenvWithAutoChecker.cpp
+ PutenvStackArrayChecker.cpp
RetainCountChecker/RetainCountChecker.cpp
RetainCountChecker/RetainCountDiagnostics.cpp
ReturnPointerRangeChecker.cpp
ReturnUndefChecker.cpp
ReturnValueChecker.cpp
RunLoopAutoreleaseLeakChecker.cpp
+ SetgidSetuidOrderChecker.cpp
SimpleStreamChecker.cpp
SmartPtrChecker.cpp
SmartPtrModeling.cpp
diff --git a/clang/lib/StaticAnalyzer/Checkers/cert/PutenvWithAutoChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/PutenvStackArrayChecker.cpp
index a82f7ca..d59cebf0 100644
--- a/clang/lib/StaticAnalyzer/Checkers/cert/PutenvWithAutoChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/PutenvStackArrayChecker.cpp
@@ -1,4 +1,4 @@
-//== PutenvWithAutoChecker.cpp --------------------------------- -*- C++ -*--=//
+//== PutenvStackArrayChecker.cpp ------------------------------- -*- C++ -*--=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,13 +6,13 @@
//
//===----------------------------------------------------------------------===//
//
-// This file defines PutenvWithAutoChecker which finds calls of ``putenv``
-// function with automatic variable as the argument.
+// This file defines PutenvStackArrayChecker, which finds calls to the
+// ``putenv`` function with an automatic (stack) array variable as the argument.
// https://wiki.sei.cmu.edu/confluence/x/6NYxBQ
//
//===----------------------------------------------------------------------===//
-#include "../AllocationState.h"
+#include "AllocationState.h"
#include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h"
#include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
#include "clang/StaticAnalyzer/Core/Checker.h"
@@ -26,9 +26,9 @@ using namespace clang;
using namespace ento;
namespace {
-class PutenvWithAutoChecker : public Checker<check::PostCall> {
+class PutenvStackArrayChecker : public Checker<check::PostCall> {
private:
- BugType BT{this, "'putenv' function should not be called with auto variables",
+ BugType BT{this, "'putenv' called with stack-allocated string",
categories::SecurityError};
const CallDescription Putenv{CDM::CLibrary, {"putenv"}, 1};
@@ -37,8 +37,8 @@ public:
};
} // namespace
-void PutenvWithAutoChecker::checkPostCall(const CallEvent &Call,
- CheckerContext &C) const {
+void PutenvStackArrayChecker::checkPostCall(const CallEvent &Call,
+ CheckerContext &C) const {
if (!Putenv.matches(Call))
return;
@@ -50,7 +50,7 @@ void PutenvWithAutoChecker::checkPostCall(const CallEvent &Call,
return;
StringRef ErrorMsg = "The 'putenv' function should not be called with "
- "arguments that have automatic storage";
+ "arrays that have automatic storage";
ExplodedNode *N = C.generateErrorNode();
auto Report = std::make_unique<PathSensitiveBugReport>(BT, ErrorMsg, N);
@@ -60,8 +60,10 @@ void PutenvWithAutoChecker::checkPostCall(const CallEvent &Call,
C.emitReport(std::move(Report));
}
-void ento::registerPutenvWithAuto(CheckerManager &Mgr) {
- Mgr.registerChecker<PutenvWithAutoChecker>();
+void ento::registerPutenvStackArray(CheckerManager &Mgr) {
+ Mgr.registerChecker<PutenvStackArrayChecker>();
}
-bool ento::shouldRegisterPutenvWithAuto(const CheckerManager &) { return true; }
+bool ento::shouldRegisterPutenvStackArray(const CheckerManager &) {
+ return true;
+}
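
A usage sketch (mine, mirroring the CERT POS34-C examples in the tests removed further below) of what the renamed checker flags and what it accepts:

#include <cstdio>

extern "C" int putenv(char *); // POSIX declaration, as in the removed tests

void bad(const char *value) {
  char env[1024]; // automatic (stack) array
  std::snprintf(env, sizeof(env), "TEST=%s", value);
  putenv(env);    // warned: array with automatic storage passed to 'putenv'
}

void good(const char *value) {
  static char env[1024]; // static storage
  std::snprintf(env, sizeof(env), "TEST=%s", value);
  putenv(env);    // not warned
}
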
diff --git a/clang/lib/StaticAnalyzer/Checkers/SetgidSetuidOrderChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/SetgidSetuidOrderChecker.cpp
new file mode 100644
index 0000000..dbe3fd3
--- /dev/null
+++ b/clang/lib/StaticAnalyzer/Checkers/SetgidSetuidOrderChecker.cpp
@@ -0,0 +1,196 @@
+//===-- SetgidSetuidOrderChecker.cpp - check privilege revocation calls ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a checker to detect possible reversed order of privilege
+// revocations when 'setgid' and 'setuid' are used.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h"
+#include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
+#include "clang/StaticAnalyzer/Core/Checker.h"
+#include "clang/StaticAnalyzer/Core/CheckerManager.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/CallDescription.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/ProgramStateTrait.h"
+
+using namespace clang;
+using namespace ento;
+
+namespace {
+
+enum SetPrivilegeFunctionKind { Irrelevant, Setuid, Setgid };
+
+class SetgidSetuidOrderChecker : public Checker<check::PostCall, eval::Assume> {
+ const BugType BT{this, "Possible wrong order of privilege revocation"};
+
+ const CallDescription SetuidDesc{CDM::CLibrary, {"setuid"}, 1};
+ const CallDescription SetgidDesc{CDM::CLibrary, {"setgid"}, 1};
+
+ const CallDescription GetuidDesc{CDM::CLibrary, {"getuid"}, 0};
+ const CallDescription GetgidDesc{CDM::CLibrary, {"getgid"}, 0};
+
+ const CallDescriptionSet OtherSetPrivilegeDesc{
+ {CDM::CLibrary, {"seteuid"}, 1}, {CDM::CLibrary, {"setegid"}, 1},
+ {CDM::CLibrary, {"setreuid"}, 2}, {CDM::CLibrary, {"setregid"}, 2},
+ {CDM::CLibrary, {"setresuid"}, 3}, {CDM::CLibrary, {"setresgid"}, 3}};
+
+public:
+ void checkPostCall(const CallEvent &Call, CheckerContext &C) const;
+ ProgramStateRef evalAssume(ProgramStateRef State, SVal Cond,
+ bool Assumption) const;
+
+private:
+ void processSetuid(ProgramStateRef State, const CallEvent &Call,
+ CheckerContext &C) const;
+ void processSetgid(ProgramStateRef State, const CallEvent &Call,
+ CheckerContext &C) const;
+ void processOther(ProgramStateRef State, const CallEvent &Call,
+ CheckerContext &C) const;
+ /// Check whether a function like \c getuid or \c getgid is called directly
+ /// as the first argument of the call \a Call.
+ bool isFunctionCalledInArg(const CallDescription &Desc,
+ const CallEvent &Call) const;
+ void emitReport(ProgramStateRef State, CheckerContext &C) const;
+};
+
+} // end anonymous namespace
+
+/// Store whether there was a call to 'setuid(getuid())' or 'setgid(getgid())'
+/// that is not followed by other privilege-changing functions.
+/// If the value \c Setuid is stored and a 'setgid(getgid())' call is found, we
+/// have found the bug to be reported. The value \c Setgid is stored too, to
+/// prevent warnings on a setgid-setuid-setgid sequence.
+REGISTER_TRAIT_WITH_PROGRAMSTATE(LastSetPrivilegeCall, SetPrivilegeFunctionKind)
+/// Store the symbol value of the last 'setuid(getuid())' call. This is used to
+/// detect if the result is compared to -1 and avoid warnings on that branch
+/// (which is the failure branch of the call), and for identification of note
+/// tags.
+REGISTER_TRAIT_WITH_PROGRAMSTATE(LastSetuidCallSVal, SymbolRef)
+
+void SetgidSetuidOrderChecker::checkPostCall(const CallEvent &Call,
+ CheckerContext &C) const {
+ ProgramStateRef State = C.getState();
+ if (SetuidDesc.matches(Call)) {
+ processSetuid(State, Call, C);
+ } else if (SetgidDesc.matches(Call)) {
+ processSetgid(State, Call, C);
+ } else if (OtherSetPrivilegeDesc.contains(Call)) {
+ processOther(State, Call, C);
+ }
+}
+
+ProgramStateRef SetgidSetuidOrderChecker::evalAssume(ProgramStateRef State,
+ SVal Cond,
+ bool Assumption) const {
+ SValBuilder &SVB = State->getStateManager().getSValBuilder();
+ SymbolRef LastSetuidSym = State->get<LastSetuidCallSVal>();
+ if (!LastSetuidSym)
+ return State;
+
+ // Check if the most recent call to 'setuid(getuid())' is assumed to be != 0.
+ // It should be only -1 at failure, but we want to accept a "!= 0" check too.
+ // (But now an invalid failure check like "!= 1" will be recognized as correct
+ // too. The "invalid failure check" is a different bug that is outside the
+ // scope of this checker.)
+ auto FailComparison =
+ SVB.evalBinOpNN(State, BO_NE, nonloc::SymbolVal(LastSetuidSym),
+ SVB.makeIntVal(0, /*isUnsigned=*/false),
+ SVB.getConditionType())
+ .getAs<DefinedOrUnknownSVal>();
+ if (!FailComparison)
+ return State;
+ if (auto IsFailBranch = State->assume(*FailComparison);
+ IsFailBranch.first && !IsFailBranch.second) {
+ // This is the 'setuid(getuid())' != 0 case.
+ // On this branch we do not want to emit warning.
+ State = State->set<LastSetPrivilegeCall>(Irrelevant);
+ State = State->set<LastSetuidCallSVal>(SymbolRef{});
+ }
+ return State;
+}
+
+void SetgidSetuidOrderChecker::processSetuid(ProgramStateRef State,
+ const CallEvent &Call,
+ CheckerContext &C) const {
+ bool IsSetuidWithGetuid = isFunctionCalledInArg(GetuidDesc, Call);
+ if (State->get<LastSetPrivilegeCall>() != Setgid && IsSetuidWithGetuid) {
+ SymbolRef RetSym = Call.getReturnValue().getAsSymbol();
+ State = State->set<LastSetPrivilegeCall>(Setuid);
+ State = State->set<LastSetuidCallSVal>(RetSym);
+ const NoteTag *Note = C.getNoteTag([this,
+ RetSym](PathSensitiveBugReport &BR) {
+ if (!BR.isInteresting(RetSym) || &BR.getBugType() != &this->BT)
+ return "";
+ return "Call to 'setuid' found here that removes superuser privileges";
+ });
+ C.addTransition(State, Note);
+ return;
+ }
+ State = State->set<LastSetPrivilegeCall>(Irrelevant);
+ State = State->set<LastSetuidCallSVal>(SymbolRef{});
+ C.addTransition(State);
+}
+
+void SetgidSetuidOrderChecker::processSetgid(ProgramStateRef State,
+ const CallEvent &Call,
+ CheckerContext &C) const {
+ bool IsSetgidWithGetgid = isFunctionCalledInArg(GetgidDesc, Call);
+ if (State->get<LastSetPrivilegeCall>() == Setuid) {
+ if (IsSetgidWithGetgid) {
+ State = State->set<LastSetPrivilegeCall>(Irrelevant);
+ emitReport(State, C);
+ return;
+ }
+ State = State->set<LastSetPrivilegeCall>(Irrelevant);
+ } else {
+ State = State->set<LastSetPrivilegeCall>(IsSetgidWithGetgid ? Setgid
+ : Irrelevant);
+ }
+ State = State->set<LastSetuidCallSVal>(SymbolRef{});
+ C.addTransition(State);
+}
+
+void SetgidSetuidOrderChecker::processOther(ProgramStateRef State,
+ const CallEvent &Call,
+ CheckerContext &C) const {
+ State = State->set<LastSetuidCallSVal>(SymbolRef{});
+ State = State->set<LastSetPrivilegeCall>(Irrelevant);
+ C.addTransition(State);
+}
+
+bool SetgidSetuidOrderChecker::isFunctionCalledInArg(
+ const CallDescription &Desc, const CallEvent &Call) const {
+ if (const auto *CallInArg0 =
+ dyn_cast<CallExpr>(Call.getArgExpr(0)->IgnoreParenImpCasts()))
+ return Desc.matchesAsWritten(*CallInArg0);
+ return false;
+}
+
+void SetgidSetuidOrderChecker::emitReport(ProgramStateRef State,
+ CheckerContext &C) const {
+ if (ExplodedNode *N = C.generateNonFatalErrorNode(State)) {
+ llvm::StringLiteral Msg =
+ "A 'setgid(getgid())' call following a 'setuid(getuid())' "
+ "call is likely to fail; probably the order of these "
+ "statements is wrong";
+ auto Report = std::make_unique<PathSensitiveBugReport>(BT, Msg, N);
+ Report->markInteresting(State->get<LastSetuidCallSVal>());
+ C.emitReport(std::move(Report));
+ }
+}
+
+void ento::registerSetgidSetuidOrderChecker(CheckerManager &mgr) {
+ mgr.registerChecker<SetgidSetuidOrderChecker>();
+}
+
+bool ento::shouldRegisterSetgidSetuidOrderChecker(const CheckerManager &mgr) {
+ return true;
+}
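
For orientation, a sketch (mine, not part of the patch) of the ordering bug the new checker reports, next to the accepted order:

#include <unistd.h>

void dropPrivilegesWrong() {
  // Once setuid(getuid()) has dropped superuser privileges, the later
  // setgid(getgid()) call is likely to fail -- this sequence is reported.
  setuid(getuid());
  setgid(getgid()); // warning: possible wrong order of privilege revocation
}

void dropPrivilegesRight() {
  setgid(getgid()); // revoke group privileges while still privileged
  setuid(getuid()); // then revoke user privileges -- no warning
}
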
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
index 5c797d5..49bbff1 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
@@ -271,6 +271,43 @@ public:
TrivialFunctionAnalysisVisitor(CacheTy &Cache) : Cache(Cache) {}
+ bool IsFunctionTrivial(const Decl *D) {
+ auto CacheIt = Cache.find(D);
+ if (CacheIt != Cache.end())
+ return CacheIt->second;
+
+ // Treat a recursive function call as trivial until proven otherwise.
+ auto [RecursiveIt, IsNew] = RecursiveFn.insert(std::make_pair(D, true));
+ if (!IsNew)
+ return RecursiveIt->second;
+
+ bool Result = [&]() {
+ if (auto *CtorDecl = dyn_cast<CXXConstructorDecl>(D)) {
+ for (auto *CtorInit : CtorDecl->inits()) {
+ if (!Visit(CtorInit->getInit()))
+ return false;
+ }
+ }
+ const Stmt *Body = D->getBody();
+ if (!Body)
+ return false;
+ return Visit(Body);
+ }();
+
+ if (!Result) {
+ // D and its mutually recursive callers are all non-trivial.
+ for (auto &It : RecursiveFn)
+ It.second = false;
+ }
+ RecursiveIt = RecursiveFn.find(D);
+ assert(RecursiveIt != RecursiveFn.end());
+ Result = RecursiveIt->second;
+ RecursiveFn.erase(RecursiveIt);
+ Cache[D] = Result;
+
+ return Result;
+ }
+
bool VisitStmt(const Stmt *S) {
// All statements are non-trivial unless overriden later.
// Don't even recurse into children by default.
@@ -368,7 +405,7 @@ public:
Name == "bitwise_cast" || Name.find("__builtin") == 0)
return true;
- return TrivialFunctionAnalysis::isTrivialImpl(Callee, Cache);
+ return IsFunctionTrivial(Callee);
}
bool
@@ -403,7 +440,7 @@ public:
return true;
// Recursively descend into the callee to confirm that it's trivial as well.
- return TrivialFunctionAnalysis::isTrivialImpl(Callee, Cache);
+ return IsFunctionTrivial(Callee);
}
bool VisitCXXOperatorCallExpr(const CXXOperatorCallExpr *OCE) {
@@ -413,7 +450,7 @@ public:
if (!Callee)
return false;
// Recursively descend into the callee to confirm that it's trivial as well.
- return TrivialFunctionAnalysis::isTrivialImpl(Callee, Cache);
+ return IsFunctionTrivial(Callee);
}
bool VisitCXXDefaultArgExpr(const CXXDefaultArgExpr *E) {
@@ -439,7 +476,7 @@ public:
}
// Recursively descend into the callee to confirm that it's trivial.
- return TrivialFunctionAnalysis::isTrivialImpl(CE->getConstructor(), Cache);
+ return IsFunctionTrivial(CE->getConstructor());
}
bool VisitCXXNewExpr(const CXXNewExpr *NE) { return VisitChildren(NE); }
@@ -513,36 +550,13 @@ public:
private:
CacheTy &Cache;
+ CacheTy RecursiveFn;
};
bool TrivialFunctionAnalysis::isTrivialImpl(
const Decl *D, TrivialFunctionAnalysis::CacheTy &Cache) {
- // If the function isn't in the cache, conservatively assume that
- // it's not trivial until analysis completes. This makes every recursive
- // function non-trivial. This also guarantees that each function
- // will be scanned at most once.
- auto [It, IsNew] = Cache.insert(std::make_pair(D, false));
- if (!IsNew)
- return It->second;
-
TrivialFunctionAnalysisVisitor V(Cache);
-
- if (auto *CtorDecl = dyn_cast<CXXConstructorDecl>(D)) {
- for (auto *CtorInit : CtorDecl->inits()) {
- if (!V.Visit(CtorInit->getInit()))
- return false;
- }
- }
-
- const Stmt *Body = D->getBody();
- if (!Body)
- return false;
-
- bool Result = V.Visit(Body);
- if (Result)
- Cache[D] = true;
-
- return Result;
+ return V.IsFunctionTrivial(D);
}
bool TrivialFunctionAnalysis::isTrivialImpl(
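
The new IsFunctionTrivial logic above amounts to a small fixed-point memoization: a recursive call is optimistically assumed trivial while its analysis is in flight, and if the body turns out to be non-trivial, the whole active recursion chain is downgraded together. A standalone sketch of that idea under hypothetical names (not the checker's real API):

#include <functional>
#include <map>

struct TrivialityMemo {
  std::map<int, bool> Cache;      // finished results
  std::map<int, bool> InProgress; // optimistic results for the active chain

  bool isTrivial(int Fn, const std::function<bool(int)> &AnalyzeBody) {
    if (auto Found = Cache.find(Fn); Found != Cache.end())
      return Found->second;
    auto [It, IsNew] = InProgress.insert({Fn, true});
    if (!IsNew)
      return It->second;           // recursive call: assume trivial for now
    bool Result = AnalyzeBody(Fn); // may recurse back into isTrivial()
    if (!Result)
      for (auto &Entry : InProgress)
        Entry.second = false;      // the whole active chain is non-trivial
    Result = InProgress[Fn];
    InProgress.erase(Fn);
    Cache[Fn] = Result;
    return Result;
  }
};
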
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLocalVarsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLocalVarsChecker.cpp
index 0d9710a..274da0b 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLocalVarsChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLocalVarsChecker.cpp
@@ -135,7 +135,19 @@ public:
bool shouldVisitImplicitCode() const { return false; }
bool VisitVarDecl(VarDecl *V) {
- Checker->visitVarDecl(V);
+ auto *Init = V->getInit();
+ if (Init && V->isLocalVarDecl())
+ Checker->visitVarDecl(V, Init);
+ return true;
+ }
+
+ bool VisitBinaryOperator(const BinaryOperator *BO) {
+ if (BO->isAssignmentOp()) {
+ if (auto *VarRef = dyn_cast<DeclRefExpr>(BO->getLHS())) {
+ if (auto *V = dyn_cast<VarDecl>(VarRef->getDecl()))
+ Checker->visitVarDecl(V, BO->getRHS());
+ }
+ }
return true;
}
@@ -174,7 +186,7 @@ public:
visitor.TraverseDecl(const_cast<TranslationUnitDecl *>(TUD));
}
- void visitVarDecl(const VarDecl *V) const {
+ void visitVarDecl(const VarDecl *V, const Expr *Value) const {
if (shouldSkipVarDecl(V))
return;
@@ -184,12 +196,8 @@ public:
std::optional<bool> IsUncountedPtr = isUncountedPtr(ArgType);
if (IsUncountedPtr && *IsUncountedPtr) {
- const Expr *const InitExpr = V->getInit();
- if (!InitExpr)
- return; // FIXME: later on we might warn on uninitialized vars too
-
if (tryToFindPtrOrigin(
- InitExpr, /*StopAtFirstRefCountedObj=*/false,
+ Value, /*StopAtFirstRefCountedObj=*/false,
[&](const clang::Expr *InitArgOrigin, bool IsSafe) {
if (!InitArgOrigin)
return true;
@@ -232,34 +240,46 @@ public:
}))
return;
- reportBug(V);
+ reportBug(V, Value);
}
}
bool shouldSkipVarDecl(const VarDecl *V) const {
assert(V);
- if (!V->isLocalVarDecl())
- return true;
-
- if (BR->getSourceManager().isInSystemHeader(V->getLocation()))
- return true;
-
- return false;
+ return BR->getSourceManager().isInSystemHeader(V->getLocation());
}
- void reportBug(const VarDecl *V) const {
+ void reportBug(const VarDecl *V, const Expr *Value) const {
assert(V);
SmallString<100> Buf;
llvm::raw_svector_ostream Os(Buf);
- Os << "Local variable ";
- printQuotedQualifiedName(Os, V);
- Os << " is uncounted and unsafe.";
-
- PathDiagnosticLocation BSLoc(V->getLocation(), BR->getSourceManager());
- auto Report = std::make_unique<BasicBugReport>(Bug, Os.str(), BSLoc);
- Report->addRange(V->getSourceRange());
- BR->emitReport(std::move(Report));
+ if (dyn_cast<ParmVarDecl>(V)) {
+ Os << "Assignment to an uncounted parameter ";
+ printQuotedQualifiedName(Os, V);
+ Os << " is unsafe.";
+
+ PathDiagnosticLocation BSLoc(Value->getExprLoc(), BR->getSourceManager());
+ auto Report = std::make_unique<BasicBugReport>(Bug, Os.str(), BSLoc);
+ Report->addRange(Value->getSourceRange());
+ BR->emitReport(std::move(Report));
+ } else {
+ if (V->hasLocalStorage())
+ Os << "Local variable ";
+ else if (V->isStaticLocal())
+ Os << "Static local variable ";
+ else if (V->hasGlobalStorage())
+ Os << "Global variable ";
+ else
+ Os << "Variable ";
+ printQuotedQualifiedName(Os, V);
+ Os << " is uncounted and unsafe.";
+
+ PathDiagnosticLocation BSLoc(V->getLocation(), BR->getSourceManager());
+ auto Report = std::make_unique<BasicBugReport>(Bug, Os.str(), BSLoc);
+ Report->addRange(V->getSourceRange());
+ BR->emitReport(std::move(Report));
+ }
}
};
} // namespace
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
index 0b1edf3..793f3a6 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
@@ -1970,33 +1970,45 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred,
ExplodedNodeSet Tmp;
StmtNodeBuilder Bldr2(PreVisit, Tmp, *currBldrCtx);
- const Expr *ArgE;
- if (const auto *DefE = dyn_cast<CXXDefaultArgExpr>(S))
+ bool HasRewrittenInit = false;
+ const Expr *ArgE = nullptr;
+ if (const auto *DefE = dyn_cast<CXXDefaultArgExpr>(S)) {
ArgE = DefE->getExpr();
- else if (const auto *DefE = dyn_cast<CXXDefaultInitExpr>(S))
+ HasRewrittenInit = DefE->hasRewrittenInit();
+ } else if (const auto *DefE = dyn_cast<CXXDefaultInitExpr>(S)) {
ArgE = DefE->getExpr();
- else
+ HasRewrittenInit = DefE->hasRewrittenInit();
+ } else
llvm_unreachable("unknown constant wrapper kind");
- bool IsTemporary = false;
- if (const auto *MTE = dyn_cast<MaterializeTemporaryExpr>(ArgE)) {
- ArgE = MTE->getSubExpr();
- IsTemporary = true;
- }
+ if (HasRewrittenInit) {
+ for (auto *N : PreVisit) {
+ ProgramStateRef state = N->getState();
+ const LocationContext *LCtx = N->getLocationContext();
+ state = state->BindExpr(S, LCtx, state->getSVal(ArgE, LCtx));
+ Bldr2.generateNode(S, N, state);
+ }
+ } else {
+ // If it's not rewritten, the contents of these expressions are not
+ // actually part of the current function, so we fall back to constant
+ // evaluation.
+ bool IsTemporary = false;
+ if (const auto *MTE = dyn_cast<MaterializeTemporaryExpr>(ArgE)) {
+ ArgE = MTE->getSubExpr();
+ IsTemporary = true;
+ }
+
+ std::optional<SVal> ConstantVal = svalBuilder.getConstantVal(ArgE);
+ const LocationContext *LCtx = Pred->getLocationContext();
+ for (auto *I : PreVisit) {
+ ProgramStateRef State = I->getState();
+ State = State->BindExpr(S, LCtx, ConstantVal.value_or(UnknownVal()));
+ if (IsTemporary)
+ State = createTemporaryRegionIfNeeded(State, LCtx, cast<Expr>(S),
+ cast<Expr>(S));
- std::optional<SVal> ConstantVal = svalBuilder.getConstantVal(ArgE);
- if (!ConstantVal)
- ConstantVal = UnknownVal();
-
- const LocationContext *LCtx = Pred->getLocationContext();
- for (const auto I : PreVisit) {
- ProgramStateRef State = I->getState();
- State = State->BindExpr(S, LCtx, *ConstantVal);
- if (IsTemporary)
- State = createTemporaryRegionIfNeeded(State, LCtx,
- cast<Expr>(S),
- cast<Expr>(S));
- Bldr2.generateNode(S, I, State);
+ Bldr2.generateNode(S, I, State);
+ }
}
getCheckerManager().runCheckersForPostStmt(Dst, Tmp, S, *this);
diff --git a/clang/test/AST/Interp/arrays.cpp b/clang/test/AST/Interp/arrays.cpp
index e936ec6..dd5064d 100644
--- a/clang/test/AST/Interp/arrays.cpp
+++ b/clang/test/AST/Interp/arrays.cpp
@@ -54,6 +54,10 @@ constexpr int derefPtr(const int *d) {
}
static_assert(derefPtr(data) == 5, "");
+/// Make sure we can refer to the one-past-the-end element
+/// and then return back to the end of the array.
+static_assert((&data[5])[-1] == 1, "");
+
constexpr int storePtr() {
int b[] = {1,2,3,4};
int *c = b;
diff --git a/clang/test/AST/Interp/builtin-functions.cpp b/clang/test/AST/Interp/builtin-functions.cpp
index fbe76ab..0a17106 100644
--- a/clang/test/AST/Interp/builtin-functions.cpp
+++ b/clang/test/AST/Interp/builtin-functions.cpp
@@ -900,7 +900,7 @@ namespace shufflevector {
static_assert(vectorShuffle6[7] == 7, "");// ref-error {{not an integral constant expression}}
constexpr vector4char vectorShuffleFail1 = __builtin_shufflevector( // both-error {{must be initialized by a constant expression}}\
- // ref-error {{index for __builtin_shufflevector not within the bounds of the input vectors; index of -1 found at position 0 not permitted in a constexpr context.}}
+ // ref-error {{index for __builtin_shufflevector not within the bounds of the input vectors; index of -1 found at position 0 is not permitted in a constexpr context}}
vector4charConst1,
vector4charConst2, -1, -1, -1, -1);
}
diff --git a/clang/test/AST/Interp/c.c b/clang/test/AST/Interp/c.c
index 2a75457..f4c7bf1 100644
--- a/clang/test/AST/Interp/c.c
+++ b/clang/test/AST/Interp/c.c
@@ -278,3 +278,15 @@ void addrlabelexpr(void) {
a0: ;
static void *ps[] = { &&a0 }; // pedantic-warning {{use of GNU address-of-label extension}}
}
+
+extern void cv2;
+void *foo5 (void)
+{
+ return &cv2; // pedantic-warning{{address of an expression of type 'void'}}
+}
+
+__attribute__((weak)) const unsigned int test10_bound = 10;
+char test10_global[test10_bound]; // all-error {{variable length array declaration not allowed at file scope}}
+void test10(void) {
+ char test10_local[test10_bound] = "help"; // all-error {{variable-sized object may not be initialized}}
+}
diff --git a/clang/test/AST/Interp/cxx03.cpp b/clang/test/AST/Interp/cxx03.cpp
index b6aaf08..70ae413 100644
--- a/clang/test/AST/Interp/cxx03.cpp
+++ b/clang/test/AST/Interp/cxx03.cpp
@@ -24,3 +24,8 @@ namespace NonLValueMemberExpr {
const int &TT1::subobj_init = PODType().value;
}
+
+void LambdaAccessingADummy() {
+ int d;
+ int a9[1] = {[d = 0] = 1}; // both-error {{is not an integral constant expression}}
+}
diff --git a/clang/test/AST/Interp/cxx98.cpp b/clang/test/AST/Interp/cxx98.cpp
index be81735..e68e4db 100644
--- a/clang/test/AST/Interp/cxx98.cpp
+++ b/clang/test/AST/Interp/cxx98.cpp
@@ -50,3 +50,7 @@ _Static_assert(c0_test == 0, "");
int a = 0; // both-note {{declared here}}
_Static_assert(a == 0, ""); // both-error {{static assertion expression is not an integral constant expression}} \
// both-note {{read of non-const variable 'a' is not allowed in a constant expression}}
+
+struct SelfReference { SelfReference &r; };
+extern SelfReference self_reference_1;
+SelfReference self_reference_2 = {self_reference_1};
diff --git a/clang/test/AST/Interp/eval-order.cpp b/clang/test/AST/Interp/eval-order.cpp
index 695a43c..aaf2b74 100644
--- a/clang/test/AST/Interp/eval-order.cpp
+++ b/clang/test/AST/Interp/eval-order.cpp
@@ -1,8 +1,7 @@
-// RUN: %clang_cc1 -std=c++1z -verify %s -fcxx-exceptions -triple=x86_64-linux-gnu
-// RUN: %clang_cc1 -std=c++1z -verify %s -fcxx-exceptions -triple=x86_64-linux-gnu -fexperimental-new-constant-interpreter
+// RUN: %clang_cc1 -std=c++1z -verify=ref,both %s -fcxx-exceptions -triple=x86_64-linux-gnu
+// RUN: %clang_cc1 -std=c++1z -verify=expected,both %s -fcxx-exceptions -triple=x86_64-linux-gnu -fexperimental-new-constant-interpreter
// ref-no-diagnostics
-// expected-no-diagnostics
/// Check that assignment operators evaluate their operands right-to-left.
/// Copied from test/SemaCXX/constant-expression-cxx1z.cpp
@@ -46,7 +45,7 @@ namespace EvalOrder {
}
template <typename T> constexpr T &&b(T &&v) {
if (!done_a)
- throw "wrong";
+ throw "wrong"; // expected-note 7{{not valid}}
done_b = true;
return (T &&)v;
}
@@ -76,21 +75,30 @@ namespace EvalOrder {
// SEQ(A(&ud)->*B(&UserDefined::n)); FIXME
// Rule 4: a(b1, b2, b3)
- // SEQ(A(f)(B(1), B(2), B(3))); FIXME
+ SEQ(A(f)(B(1), B(2), B(3))); // expected-error {{not an integral constant expression}} FIXME \
+ // expected-note 2{{in call to}}
// Rule 5: b = a, b @= a
- // SEQ(B(lvalue<int>().get()) = A(0)); FIXME
- // SEQ(B(lvalue<UserDefined>().get()) = A(ud)); FIXME
+ SEQ(B(lvalue<int>().get()) = A(0)); // expected-error {{not an integral constant expression}} FIXME \
+ // expected-note 2{{in call to}}
+ SEQ(B(lvalue<UserDefined>().get()) = A(ud)); // expected-error {{not an integral constant expression}} FIXME \
+ // expected-note 2{{in call to}}
SEQ(B(lvalue<int>().get()) += A(0));
- // SEQ(B(lvalue<UserDefined>().get()) += A(ud)); FIXME
- // SEQ(B(lvalue<NonMember>().get()) += A(nm)); FIXME
+ SEQ(B(lvalue<UserDefined>().get()) += A(ud)); // expected-error {{not an integral constant expression}} FIXME \
+ // expected-note 2{{in call to}}
+
+ SEQ(B(lvalue<NonMember>().get()) += A(nm)); // expected-error {{not an integral constant expression}} FIXME \
+ // expected-note 2{{in call to}}
+
// Rule 6: a[b]
constexpr int arr[3] = {};
SEQ(A(arr)[B(0)]);
SEQ(A(+arr)[B(0)]);
- // SEQ(A(0)[B(arr)]); FIXME
- // SEQ(A(0)[B(+arr)]); FIXME
+ SEQ(A(0)[B(arr)]); // expected-error {{not an integral constant expression}} FIXME \
+ // expected-note 2{{in call to}}
+ SEQ(A(0)[B(+arr)]); // expected-error {{not an integral constant expression}} FIXME \
+ // expected-note 2{{in call to}}
SEQ(A(ud)[B(0)]);
// Rule 7: a << b
diff --git a/clang/test/AST/Interp/objc.mm b/clang/test/AST/Interp/objc.mm
new file mode 100644
index 0000000..44b74d1
--- /dev/null
+++ b/clang/test/AST/Interp/objc.mm
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify=expected,both %s
+// RUN: %clang_cc1 -verify=ref,both %s
+
+@interface A {
+ int a;
+ static_assert(a, ""); // both-error {{static assertion expression is not an integral constant expression}}
+}
+@end
diff --git a/clang/test/AST/Interp/records.cpp b/clang/test/AST/Interp/records.cpp
index 3a5ecd2..97ac3e9 100644
--- a/clang/test/AST/Interp/records.cpp
+++ b/clang/test/AST/Interp/records.cpp
@@ -1335,8 +1335,6 @@ namespace UnnamedBitFields {
static_assert(a.c == 'a', "");
}
-/// FIXME: This still doesn't work in the new interpreter because
-/// we lack type information for dummy pointers.
namespace VirtualBases {
/// This used to crash.
namespace One {
@@ -1346,7 +1344,7 @@ namespace VirtualBases {
};
class B : public virtual A {
public:
- int getX() { return x; } // ref-note {{declared here}}
+ int getX() { return x; } // both-note {{declared here}}
};
class DV : virtual public B{};
@@ -1354,7 +1352,7 @@ namespace VirtualBases {
void foo() {
DV b;
int a[b.getX()]; // both-warning {{variable length arrays}} \
- // ref-note {{non-constexpr function 'getX' cannot be used}}
+ // both-note {{non-constexpr function 'getX' cannot be used}}
}
}
diff --git a/clang/test/AST/Interp/unions.cpp b/clang/test/AST/Interp/unions.cpp
new file mode 100644
index 0000000..293a198
--- /dev/null
+++ b/clang/test/AST/Interp/unions.cpp
@@ -0,0 +1,67 @@
+// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify=expected,both %s
+// RUN: %clang_cc1 -verify=ref,both %s
+
+union U {
+ int a;
+ int b;
+};
+
+constexpr U a = {12};
+static_assert(a.a == 12, "");
+static_assert(a.b == 0, ""); // both-error {{not an integral constant expression}} \
+ // both-note {{read of member 'b' of union with active member 'a'}}
+union U1 {
+ int i;
+ float f = 3.0f;
+};
+constexpr U1 u1{};
+static_assert(u1.f == 3.0, "");
+static_assert(u1.i == 1, ""); // both-error {{not an integral constant expression}} \
+ // both-note {{read of member 'i' of union with active member 'f'}}
+
+
+
+union A {
+ int a;
+ double d;
+};
+constexpr A aa = {1, 2.0}; // both-error {{excess elements in union initializer}}
+constexpr A ab = {.d = 1.0};
+static_assert(ab.d == 1.0, "");
+static_assert(ab.a == 1, ""); // both-error {{not an integral constant expression}} \
+ // both-note {{read of member 'a' of union with active member 'd'}}
+
+
+namespace Empty {
+ union E {};
+ constexpr E e{};
+}
+
+namespace SimpleStore {
+ union A {
+ int a;
+ int b;
+ };
+ constexpr int foo() {
+ A a{.b = 4};
+ a.b = 10;
+ return a.b;
+ }
+ static_assert(foo() == 10, "");
+
+ constexpr int empty() {
+ A a{}; /// Just test that this works.
+ return 10;
+ }
+ static_assert(empty() == 10, "");
+}
+
+namespace ZeroInit {
+ struct S { int m; };
+ union Z {
+ float f;
+ };
+
+ constexpr Z z{};
+ static_assert(z.f == 0.0, "");
+}
diff --git a/clang/test/AST/ast-dump-decl.cpp b/clang/test/AST/ast-dump-decl.cpp
index 554cdcf83..e062d4f 100644
--- a/clang/test/AST/ast-dump-decl.cpp
+++ b/clang/test/AST/ast-dump-decl.cpp
@@ -459,7 +459,7 @@ namespace testClassTemplateDecl {
// CHECK: ClassTemplateDecl 0x{{.+}} <{{.+}}:[[@LINE-148]]:3, col:31> col:31 TestTemplateDefaultNonType{{$}}
// CHECK-NEXT: |-NonTypeTemplateParmDecl 0x{{.+}} <col:12, col:20> col:16 'int' depth 0 index 0 I{{$}}
-// CHECK-NEXT: | `-TemplateArgument expr{{$}}
+// CHECK-NEXT: | `-TemplateArgument <col:20> expr{{$}}
// CHECK-NEXT: | `-IntegerLiteral 0x{{.+}} <col:20> 'int' 42{{$}}
// CHECK-NEXT: `-CXXRecordDecl 0x{{.+}} <col:24, col:31> col:31 struct TestTemplateDefaultNonType{{$}}
@@ -671,7 +671,7 @@ namespace TestNonTypeTemplateParmDecl {
// CHECK: NamespaceDecl{{.*}} TestNonTypeTemplateParmDecl
// CHECK-NEXT: FunctionTemplateDecl
// CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} 'int' depth 0 index 0 I
-// CHECK-NEXT: TemplateArgument expr
+// CHECK-NEXT: TemplateArgument {{.*}} expr
// CHECK-NEXT: IntegerLiteral{{.*}} 'int' 1
// CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} 'int' depth 0 index 1 ... J
diff --git a/clang/test/AST/ast-dump-default-init-json.cpp b/clang/test/AST/ast-dump-default-init-json.cpp
index 1058b4e..f4949a9 100644
--- a/clang/test/AST/ast-dump-default-init-json.cpp
+++ b/clang/test/AST/ast-dump-default-init-json.cpp
@@ -789,10 +789,10 @@ void test() {
// CHECK-NEXT: "valueCategory": "lvalue",
// CHECK-NEXT: "extendingDecl": {
// CHECK-NEXT: "id": "0x{{.*}}",
-// CHECK-NEXT: "kind": "FieldDecl",
-// CHECK-NEXT: "name": "a",
+// CHECK-NEXT: "kind": "VarDecl",
+// CHECK-NEXT: "name": "b",
// CHECK-NEXT: "type": {
-// CHECK-NEXT: "qualType": "const A &"
+// CHECK-NEXT: "qualType": "B"
// CHECK-NEXT: }
// CHECK-NEXT: },
// CHECK-NEXT: "storageDuration": "automatic",
diff --git a/clang/test/AST/ast-dump-default-init.cpp b/clang/test/AST/ast-dump-default-init.cpp
index 15b29f0..26864fb 100644
--- a/clang/test/AST/ast-dump-default-init.cpp
+++ b/clang/test/AST/ast-dump-default-init.cpp
@@ -13,7 +13,7 @@ void test() {
}
// CHECK: -CXXDefaultInitExpr 0x{{[^ ]*}} <{{.*}}> 'const A' lvalue has rewritten init
// CHECK-NEXT: `-ExprWithCleanups 0x{{[^ ]*}} <{{.*}}> 'const A' lvalue
-// CHECK-NEXT: `-MaterializeTemporaryExpr 0x{{[^ ]*}} <{{.*}}> 'const A' lvalue extended by Field 0x{{[^ ]*}} 'a' 'const A &'
+// CHECK-NEXT: `-MaterializeTemporaryExpr 0x{{[^ ]*}} <{{.*}}> 'const A' lvalue extended by Var 0x{{[^ ]*}} 'b' 'B'
// CHECK-NEXT: `-ImplicitCastExpr 0x{{[^ ]*}} <{{.*}}> 'const A' <NoOp>
// CHECK-NEXT: `-CXXFunctionalCastExpr 0x{{[^ ]*}} <{{.*}}> 'A' functional cast to A <NoOp>
// CHECK-NEXT: `-InitListExpr 0x{{[^ ]*}} <{{.*}}> 'A'
diff --git a/clang/test/AST/ast-dump-expr-json.cpp b/clang/test/AST/ast-dump-expr-json.cpp
index 4b7365e..dd2fe1f 100644
--- a/clang/test/AST/ast-dump-expr-json.cpp
+++ b/clang/test/AST/ast-dump-expr-json.cpp
@@ -2333,7 +2333,7 @@ void TestNonADLCall3() {
// CHECK-NEXT: "kind": "FunctionDecl",
// CHECK-NEXT: "name": "operator delete",
// CHECK-NEXT: "type": {
-// CHECK-NEXT: "qualType": "void (void *) noexcept"
+// CHECK-NEXT: "qualType": "void (void *, unsigned long) noexcept"
// CHECK-NEXT: }
// CHECK-NEXT: },
// CHECK-NEXT: "inner": [
diff --git a/clang/test/AST/ast-dump-expr.cpp b/clang/test/AST/ast-dump-expr.cpp
index 6048681..f9e9ee9 100644
--- a/clang/test/AST/ast-dump-expr.cpp
+++ b/clang/test/AST/ast-dump-expr.cpp
@@ -164,7 +164,7 @@ void UnaryExpressions(int *p) {
// CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} <col:8> 'int *' lvalue ParmVar 0x{{[^ ]*}} 'p' 'int *'
::delete p;
- // CHECK: CXXDeleteExpr 0x{{[^ ]*}} <line:[[@LINE-1]]:3, col:12> 'void' global Function 0x{{[^ ]*}} 'operator delete' 'void (void *) noexcept'
+ // CHECK: CXXDeleteExpr 0x{{[^ ]*}} <line:[[@LINE-1]]:3, col:12> 'void' global Function 0x{{[^ ]*}} 'operator delete' 'void (void *, unsigned long) noexcept'
// CHECK-NEXT: ImplicitCastExpr
// CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} <col:12> 'int *' lvalue ParmVar 0x{{[^ ]*}} 'p' 'int *'
diff --git a/clang/test/AST/ast-dump-stmt-json.cpp b/clang/test/AST/ast-dump-stmt-json.cpp
index 667a12a..a473d17 100644
--- a/clang/test/AST/ast-dump-stmt-json.cpp
+++ b/clang/test/AST/ast-dump-stmt-json.cpp
@@ -994,7 +994,7 @@ void TestDependentGenericSelectionExpr(Ty T) {
// CHECK-NEXT: "kind": "FunctionDecl",
// CHECK-NEXT: "name": "operator delete",
// CHECK-NEXT: "type": {
-// CHECK-NEXT: "qualType": "void (void *) noexcept"
+// CHECK-NEXT: "qualType": "void (void *, unsigned long) noexcept"
// CHECK-NEXT: }
// CHECK-NEXT: },
// CHECK-NEXT: "inner": [
@@ -1369,7 +1369,7 @@ void TestDependentGenericSelectionExpr(Ty T) {
// CHECK-NEXT: "kind": "FunctionDecl",
// CHECK-NEXT: "name": "operator delete",
// CHECK-NEXT: "type": {
-// CHECK-NEXT: "qualType": "void (void *) noexcept"
+// CHECK-NEXT: "qualType": "void (void *, unsigned long) noexcept"
// CHECK-NEXT: }
// CHECK-NEXT: },
// CHECK-NEXT: "inner": [
@@ -1722,7 +1722,6 @@ void TestDependentGenericSelectionExpr(Ty T) {
// CHECK-NEXT: "end": {}
// CHECK-NEXT: },
// CHECK-NEXT: "isImplicit": true,
-// CHECK-NEXT: "isUsed": true,
// CHECK-NEXT: "name": "operator delete",
// CHECK-NEXT: "mangledName": "_ZdlPv",
// CHECK-NEXT: "type": {
@@ -1819,6 +1818,126 @@ void TestDependentGenericSelectionExpr(Ty T) {
// CHECK-NEXT: },
// CHECK-NEXT: "isImplicit": true,
// CHECK-NEXT: "isUsed": true,
+// CHECK-NEXT: "name": "operator delete",
+// CHECK-NEXT: "mangledName": "_ZdlPvm",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "void (void *, unsigned long) noexcept"
+// CHECK-NEXT: },
+// CHECK-NEXT: "inner": [
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "ParmVarDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "void *"
+// CHECK-NEXT: }
+// CHECK-NEXT: },
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "ParmVarDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "unsigned long"
+// CHECK-NEXT: }
+// CHECK-NEXT: },
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "VisibilityAttr",
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "implicit": true,
+// CHECK-NEXT: "visibility": "default"
+// CHECK-NEXT: }
+// CHECK-NEXT: ]
+// CHECK-NEXT: }
+
+// CHECK-NOT: {{^}}Dumping
+// CHECK: "kind": "FunctionDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "name": "operator delete",
+// CHECK-NEXT: "mangledName": "_ZdlPvmSt11align_val_t",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "void (void *, unsigned long, std::align_val_t) noexcept"
+// CHECK-NEXT: },
+// CHECK-NEXT: "inner": [
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "ParmVarDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "void *"
+// CHECK-NEXT: }
+// CHECK-NEXT: },
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "ParmVarDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "unsigned long"
+// CHECK-NEXT: }
+// CHECK-NEXT: },
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "ParmVarDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "std::align_val_t"
+// CHECK-NEXT: }
+// CHECK-NEXT: },
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "VisibilityAttr",
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "implicit": true,
+// CHECK-NEXT: "visibility": "default"
+// CHECK-NEXT: }
+// CHECK-NEXT: ]
+// CHECK-NEXT: }
+
+// CHECK-NOT: {{^}}Dumping
+// CHECK: "kind": "FunctionDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "isUsed": true,
// CHECK-NEXT: "name": "operator delete[]",
// CHECK-NEXT: "mangledName": "_ZdaPv",
// CHECK-NEXT: "type": {
@@ -1907,6 +2026,125 @@ void TestDependentGenericSelectionExpr(Ty T) {
// CHECK-NOT: {{^}}Dumping
+// CHECK: "kind": "FunctionDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "name": "operator delete[]",
+// CHECK-NEXT: "mangledName": "_ZdaPvm",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "void (void *, unsigned long) noexcept"
+// CHECK-NEXT: },
+// CHECK-NEXT: "inner": [
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "ParmVarDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "void *"
+// CHECK-NEXT: }
+// CHECK-NEXT: },
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "ParmVarDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "unsigned long"
+// CHECK-NEXT: }
+// CHECK-NEXT: },
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "VisibilityAttr",
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "implicit": true,
+// CHECK-NEXT: "visibility": "default"
+// CHECK-NEXT: }
+// CHECK-NEXT: ]
+// CHECK-NEXT: }
+
+// CHECK-NOT: {{^}}Dumping
+// CHECK: "kind": "FunctionDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "name": "operator delete[]",
+// CHECK-NEXT: "mangledName": "_ZdaPvmSt11align_val_t",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "void (void *, unsigned long, std::align_val_t) noexcept"
+// CHECK-NEXT: },
+// CHECK-NEXT: "inner": [
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "ParmVarDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "void *"
+// CHECK-NEXT: }
+// CHECK-NEXT: },
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "ParmVarDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "unsigned long"
+// CHECK-NEXT: }
+// CHECK-NEXT: },
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "ParmVarDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "std::align_val_t"
+// CHECK-NEXT: }
+// CHECK-NEXT: },
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "VisibilityAttr",
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "implicit": true,
+// CHECK-NEXT: "visibility": "default"
+// CHECK-NEXT: }
+// CHECK-NEXT: ]
+// CHECK-NEXT: }
+
+// CHECK-NOT: {{^}}Dumping
// CHECK: "kind": "FunctionTemplateDecl",
// CHECK-NEXT: "loc": {
// CHECK-NEXT: "offset": 598,
diff --git a/clang/test/AST/attr-counted-by-late-parsed-struct-ptrs.c b/clang/test/AST/attr-counted-by-late-parsed-struct-ptrs.c
new file mode 100644
index 0000000..a585a45
--- /dev/null
+++ b/clang/test/AST/attr-counted-by-late-parsed-struct-ptrs.c
@@ -0,0 +1,45 @@
+// RUN: %clang_cc1 -fexperimental-late-parse-attributes %s -ast-dump | FileCheck %s
+
+#define __counted_by(f) __attribute__((counted_by(f)))
+
+struct size_known {
+ int field;
+};
+
+//==============================================================================
+// __counted_by on struct member pointer in decl attribute position
+//==============================================================================
+
+struct on_member_pointer_complete_ty {
+ struct size_known *buf __counted_by(count);
+ int count;
+};
+// CHECK-LABEL: struct on_member_pointer_complete_ty definition
+// CHECK-NEXT: |-FieldDecl {{.*}} buf 'struct size_known * __counted_by(count)':'struct size_known *'
+// CHECK-NEXT: `-FieldDecl {{.*}} referenced count 'int'
+
+struct on_pointer_anon_count {
+ struct size_known *buf __counted_by(count);
+ struct {
+ int count;
+ };
+};
+
+// CHECK-LABEL: struct on_pointer_anon_count definition
+// CHECK-NEXT: |-FieldDecl {{.*}} buf 'struct size_known * __counted_by(count)':'struct size_known *'
+// CHECK-NEXT: |-RecordDecl {{.*}} struct definition
+// CHECK-NEXT: | `-FieldDecl {{.*}} count 'int'
+// CHECK-NEXT: |-FieldDecl {{.*}} implicit 'struct on_pointer_anon_count::(anonymous at {{.*}})'
+// CHECK-NEXT: `-IndirectFieldDecl {{.*}} implicit referenced count 'int'
+// CHECK-NEXT: |-Field {{.*}} '' 'struct on_pointer_anon_count::(anonymous at {{.*}})'
+// CHECK-NEXT: `-Field {{.*}} 'count' 'int'
+
+//==============================================================================
+// __counted_by on struct member pointer in type attribute position
+//==============================================================================
+// TODO: Correctly parse counted_by as a type attribute. Currently it is parsed
+// as a declaration attribute and is **not** late parsed resulting in the `count`
+// field being unavailable.
+//
+// See `clang/test/Sema/attr-counted-by-late-parsed-struct-ptrs.c` for test
+// cases.
diff --git a/clang/test/AST/attr-counted-by-struct-ptrs.c b/clang/test/AST/attr-counted-by-struct-ptrs.c
new file mode 100644
index 0000000..79a453d
--- /dev/null
+++ b/clang/test/AST/attr-counted-by-struct-ptrs.c
@@ -0,0 +1,117 @@
+// RUN: %clang_cc1 %s -ast-dump | FileCheck %s
+
+#define __counted_by(f) __attribute__((counted_by(f)))
+
+struct size_unknown;
+struct size_known {
+ int field;
+};
+
+//==============================================================================
+// __counted_by on struct member pointer in decl attribute position
+//==============================================================================
+
+// CHECK-LABEL: RecordDecl {{.+}} struct on_member_pointer_complete_ty definition
+// CHECK-NEXT: |-FieldDecl {{.+}} referenced count 'int'
+// CHECK-NEXT: `-FieldDecl {{.+}} buf 'struct size_known * __counted_by(count)':'struct size_known *'
+struct on_member_pointer_complete_ty {
+ int count;
+ struct size_known * buf __counted_by(count);
+};
+
+// CHECK-LABEL: RecordDecl {{.+}} struct on_pointer_anon_buf definition
+// CHECK-NEXT: |-FieldDecl {{.+}} referenced count 'int'
+// CHECK-NEXT: |-RecordDecl {{.+}} struct definition
+// CHECK-NEXT: | `-FieldDecl {{.+}} buf 'struct size_known * __counted_by(count)':'struct size_known *'
+// CHECK-NEXT: |-FieldDecl {{.+}} implicit 'struct on_pointer_anon_buf::(anonymous at [[ANON_STRUCT_PATH:.+]])'
+// CHECK-NEXT: `-IndirectFieldDecl {{.+}} implicit buf 'struct size_known * __counted_by(count)':'struct size_known *'
+// CHECK-NEXT: |-Field {{.+}} '' 'struct on_pointer_anon_buf::(anonymous at [[ANON_STRUCT_PATH]])'
+// CHECK-NEXT: `-Field {{.+}} 'buf' 'struct size_known * __counted_by(count)':'struct size_known *'
+struct on_pointer_anon_buf {
+ int count;
+ struct {
+ struct size_known *buf __counted_by(count);
+ };
+};
+
+struct on_pointer_anon_count {
+ struct {
+ int count;
+ };
+ struct size_known *buf __counted_by(count);
+};
+
+//==============================================================================
+// __counted_by on struct member pointer in type attribute position
+//==============================================================================
+// TODO: Correctly parse counted_by as a type attribute. Currently it is parsed
+// as a declaration attribute
+
+// CHECK-LABEL: RecordDecl {{.+}} struct on_member_pointer_complete_ty_ty_pos definition
+// CHECK-NEXT: |-FieldDecl {{.+}} referenced count 'int'
+// CHECK-NEXT: `-FieldDecl {{.+}} buf 'struct size_known * __counted_by(count)':'struct size_known *'
+struct on_member_pointer_complete_ty_ty_pos {
+ int count;
+ struct size_known *__counted_by(count) buf;
+};
+
+// TODO: This should be forbidden but isn't due to counted_by being treated as a
+// declaration attribute. The attribute ends up on the outermost pointer
+// (allowed by sema) even though syntactically it's supposed to be on the inner
+// pointer (which would not be allowed by sema, since the pointee is a function type).
+// CHECK-LABEL: RecordDecl {{.+}} struct on_member_pointer_fn_ptr_ty_ty_pos_inner definition
+// CHECK-NEXT: |-FieldDecl {{.+}} referenced count 'int'
+// CHECK-NEXT: `-FieldDecl {{.+}} fn_ptr 'void (** __counted_by(count))(void)':'void (**)(void)'
+struct on_member_pointer_fn_ptr_ty_ty_pos_inner {
+ int count;
+ void (* __counted_by(count) * fn_ptr)(void);
+};
+
+// FIXME: The generated AST here is wrong. The attribute should be on the inner
+// pointer.
+// CHECK-LABEL: RecordDecl {{.+}} struct on_nested_pointer_inner definition
+// CHECK-NEXT: |-FieldDecl {{.+}} referenced count 'int'
+// CHECK-NEXT: `-FieldDecl {{.+}} buf 'struct size_known ** __counted_by(count)':'struct size_known **'
+struct on_nested_pointer_inner {
+ int count;
+ // TODO: This should be disallowed because in the `-fbounds-safety` model
+ // `__counted_by` can only be nested when used in function parameters.
+ struct size_known *__counted_by(count) *buf;
+};
+
+// CHECK-LABEL: RecordDecl {{.+}} struct on_nested_pointer_outer definition
+// CHECK-NEXT: |-FieldDecl {{.+}} referenced count 'int'
+// CHECK-NEXT: `-FieldDecl {{.+}} buf 'struct size_known ** __counted_by(count)':'struct size_known **'
+struct on_nested_pointer_outer {
+ int count;
+ struct size_known **__counted_by(count) buf;
+};
+
+// CHECK-LABEL: RecordDecl {{.+}} struct on_pointer_anon_buf_ty_pos definition
+// CHECK-NEXT: |-FieldDecl {{.+}} referenced count 'int'
+// CHECK-NEXT: |-RecordDecl {{.+}} struct definition
+// CHECK-NEXT: | `-FieldDecl {{.+}} buf 'struct size_known * __counted_by(count)':'struct size_known *'
+// CHECK-NEXT: |-FieldDecl {{.+}} implicit 'struct on_pointer_anon_buf_ty_pos::(anonymous at [[ANON_STRUCT_PATH2:.+]])'
+// CHECK-NEXT: `-IndirectFieldDecl {{.+}} implicit buf 'struct size_known * __counted_by(count)':'struct size_known *'
+// CHECK-NEXT: |-Field {{.+}} '' 'struct on_pointer_anon_buf_ty_pos::(anonymous at [[ANON_STRUCT_PATH2]])'
+// CHECK-NEXT: `-Field {{.+}} 'buf' 'struct size_known * __counted_by(count)':'struct size_known *'
+struct on_pointer_anon_buf_ty_pos {
+ int count;
+ struct {
+ struct size_known * __counted_by(count) buf;
+ };
+};
+
+// CHECK-LABEL: RecordDecl {{.+}} struct on_pointer_anon_count_ty_pos definition
+// CHECK-NEXT: |-RecordDecl {{.+}} struct definition
+// CHECK-NEXT: | `-FieldDecl {{.+}} count 'int'
+// CHECK-NEXT: |-FieldDecl {{.+}} implicit 'struct on_pointer_anon_count_ty_pos::(anonymous at [[ANON_STRUCT_PATH3:.+]])'
+// CHECK-NEXT: |-IndirectFieldDecl {{.+}} implicit referenced count 'int'
+// CHECK-NEXT: | |-Field {{.+}} '' 'struct on_pointer_anon_count_ty_pos::(anonymous at [[ANON_STRUCT_PATH3]])'
+// CHECK-NEXT: | `-Field {{.+}} 'count' 'int'
+struct on_pointer_anon_count_ty_pos {
+ struct {
+ int count;
+ };
+ struct size_known *__counted_by(count) buf;
+};
diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-local-vars.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-local-vars.cpp
index 632a82e..2577687 100644
--- a/clang/test/Analysis/Checkers/WebKit/uncounted-local-vars.cpp
+++ b/clang/test/Analysis/Checkers/WebKit/uncounted-local-vars.cpp
@@ -216,3 +216,76 @@ void foo() {
}
} // namespace conditional_op
+
+namespace local_assignment_basic {
+
+RefCountable *provide_ref_cntbl();
+
+void foo(RefCountable* a) {
+ RefCountable* b = a;
+ // expected-warning@-1{{Local variable 'b' is uncounted and unsafe [alpha.webkit.UncountedLocalVarsChecker]}}
+ if (b->trivial())
+ b = provide_ref_cntbl();
+}
+
+void bar(RefCountable* a) {
+ RefCountable* b;
+ // expected-warning@-1{{Local variable 'b' is uncounted and unsafe [alpha.webkit.UncountedLocalVarsChecker]}}
+ b = provide_ref_cntbl();
+}
+
+void baz() {
+ RefPtr a = provide_ref_cntbl();
+ {
+ RefCountable* b = a.get();
+ // expected-warning@-1{{Local variable 'b' is uncounted and unsafe [alpha.webkit.UncountedLocalVarsChecker]}}
+ b = provide_ref_cntbl();
+ }
+}
+
+} // namespace local_assignment_basic
+
+namespace local_assignment_to_parameter {
+
+RefCountable *provide_ref_cntbl();
+void someFunction();
+
+void foo(RefCountable* a) {
+ a = provide_ref_cntbl();
+ // expected-warning@-1{{Assignment to an uncounted parameter 'a' is unsafe [alpha.webkit.UncountedLocalVarsChecker]}}
+ someFunction();
+ a->method();
+}
+
+} // namespace local_assignment_to_parameter
+
+namespace local_assignment_to_static_local {
+
+RefCountable *provide_ref_cntbl();
+void someFunction();
+
+void foo() {
+ static RefCountable* a = nullptr;
+ // expected-warning@-1{{Static local variable 'a' is uncounted and unsafe [alpha.webkit.UncountedLocalVarsChecker]}}
+ a = provide_ref_cntbl();
+ someFunction();
+ a->method();
+}
+
+} // namespace local_assignment_to_static_local
+
+namespace local_assignment_to_global {
+
+RefCountable *provide_ref_cntbl();
+void someFunction();
+
+RefCountable* g_a = nullptr;
+// expected-warning@-1{{Global variable 'local_assignment_to_global::g_a' is uncounted and unsafe [alpha.webkit.UncountedLocalVarsChecker]}}
+
+void foo() {
+ g_a = provide_ref_cntbl();
+ someFunction();
+ g_a->method();
+}
+
+} // namespace local_assignment_to_global
diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp
index 9698663..a98c6eb9c 100644
--- a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp
+++ b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp
@@ -231,6 +231,18 @@ public:
void method();
void someFunction();
int otherFunction();
+ unsigned recursiveTrivialFunction(int n) { return !n ? 1 : recursiveTrivialFunction(n - 1); }
+ unsigned recursiveComplexFunction(int n) { return !n ? otherFunction() : recursiveComplexFunction(n - 1); }
+ unsigned mutuallyRecursiveFunction1(int n) { return n < 0 ? 1 : (n % 2 ? mutuallyRecursiveFunction2(n - 2) : mutuallyRecursiveFunction1(n - 1)); }
+ unsigned mutuallyRecursiveFunction2(int n) { return n < 0 ? 1 : (n % 3 ? mutuallyRecursiveFunction2(n - 3) : mutuallyRecursiveFunction1(n - 2)); }
+ unsigned mutuallyRecursiveFunction3(int n) { return n < 0 ? 1 : (n % 5 ? mutuallyRecursiveFunction3(n - 5) : mutuallyRecursiveFunction4(n - 3)); }
+ unsigned mutuallyRecursiveFunction4(int n) { return n < 0 ? 1 : (n % 7 ? otherFunction() : mutuallyRecursiveFunction3(n - 3)); }
+ unsigned recursiveFunction5(unsigned n) { return n > 100 ? 2 : (n % 2 ? recursiveFunction5(n + 1) : recursiveFunction6(n + 2)); }
+ unsigned recursiveFunction6(unsigned n) { return n > 100 ? 3 : (n % 2 ? recursiveFunction6(n % 7) : recursiveFunction7(n % 5)); }
+ unsigned recursiveFunction7(unsigned n) { return n > 100 ? 5 : recursiveFunction7(n * 5); }
+
+ void mutuallyRecursive8() { mutuallyRecursive9(); someFunction(); }
+ void mutuallyRecursive9() { mutuallyRecursive8(); }
int trivial1() { return 123; }
float trivial2() { return 0.3; }
@@ -498,6 +510,24 @@ public:
RefCounted::singleton().trivial18(); // no-warning
RefCounted::singleton().someFunction(); // no-warning
+ getFieldTrivial().recursiveTrivialFunction(7); // no-warning
+ getFieldTrivial().recursiveComplexFunction(9);
+ // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
+ getFieldTrivial().mutuallyRecursiveFunction1(11); // no-warning
+ getFieldTrivial().mutuallyRecursiveFunction2(13); // no-warning
+ getFieldTrivial().mutuallyRecursiveFunction3(17);
+ // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
+ getFieldTrivial().mutuallyRecursiveFunction4(19);
+ // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
+ getFieldTrivial().recursiveFunction5(23); // no-warning
+ getFieldTrivial().recursiveFunction6(29); // no-warning
+ getFieldTrivial().recursiveFunction7(31); // no-warning
+
+ getFieldTrivial().mutuallyRecursive8();
+ // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
+ getFieldTrivial().mutuallyRecursive9();
+ // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
+
getFieldTrivial().someFunction();
// expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
getFieldTrivial().nonTrivial1();
diff --git a/clang/test/Analysis/cert/pos34-c-fp-suppression.cpp b/clang/test/Analysis/cert/pos34-c-fp-suppression.cpp
deleted file mode 100644
index d982fcb..0000000
--- a/clang/test/Analysis/cert/pos34-c-fp-suppression.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-// RUN: %clang_analyze_cc1 \
-// RUN: -analyzer-checker=alpha.security.cert.pos.34c\
-// RUN: -verify %s
-
-#include "../Inputs/system-header-simulator.h"
-void free(void *memblock);
-void *malloc(size_t size);
-int putenv(char *);
-int rand();
-
-namespace test_auto_var_used_good {
-
-extern char *ex;
-int test_extern() {
- return putenv(ex); // no-warning: extern storage class.
-}
-
-void foo(void) {
- char *buffer = (char *)"huttah!";
- if (rand() % 2 == 0) {
- buffer = (char *)malloc(5);
- strcpy(buffer, "woot");
- }
- putenv(buffer);
-}
-
-void bar(void) {
- char *buffer = (char *)malloc(5);
- strcpy(buffer, "woot");
-
- if (rand() % 2 == 0) {
- free(buffer);
- buffer = (char *)"blah blah blah";
- }
- putenv(buffer);
-}
-
-void baz() {
- char env[] = "NAME=value";
- // TODO: False Positive
- putenv(env);
- // expected-warning@-1 {{The 'putenv' function should not be called with arguments that have automatic storage}}
-
- /*
- DO SOMETHING
- */
-
- putenv((char *)"NAME=anothervalue");
-}
-
-} // namespace test_auto_var_used_good
diff --git a/clang/test/Analysis/cert/pos34-c.cpp b/clang/test/Analysis/cert/pos34-c.cpp
deleted file mode 100644
index f2bd7b3..0000000
--- a/clang/test/Analysis/cert/pos34-c.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-// RUN: %clang_analyze_cc1 \
-// RUN: -analyzer-checker=alpha.security.cert.pos.34c\
-// RUN: -verify %s
-
-// Examples from the CERT rule's page.
-// https://wiki.sei.cmu.edu/confluence/x/6NYxBQ
-
-#include "../Inputs/system-header-simulator.h"
-void free(void *memblock);
-void *malloc(size_t size);
-int putenv(char *);
-int snprintf(char *str, size_t size, const char *format, ...);
-
-namespace test_auto_var_used_bad {
-
-int volatile_memory1(const char *var) {
- char env[1024];
- int retval = snprintf(env, sizeof(env), "TEST=%s", var);
- if (retval < 0 || (size_t)retval >= sizeof(env)) {
- /* Handle error */
- }
-
- return putenv(env);
- // expected-warning@-1 {{The 'putenv' function should not be called with arguments that have automatic storage}}
-}
-
-} // namespace test_auto_var_used_bad
-
-namespace test_auto_var_used_good {
-
-int test_static(const char *var) {
- static char env[1024];
-
- int retval = snprintf(env, sizeof(env), "TEST=%s", var);
- if (retval < 0 || (size_t)retval >= sizeof(env)) {
- /* Handle error */
- }
-
- return putenv(env);
-}
-
-int test_heap_memory(const char *var) {
- static char *oldenv;
- const char *env_format = "TEST=%s";
- const size_t len = strlen(var) + strlen(env_format);
- char *env = (char *)malloc(len);
- if (env == NULL) {
- return -1;
- }
- if (putenv(env) != 0) { // no-warning: env was dynamically allocated.
- free(env);
- return -1;
- }
- if (oldenv != NULL) {
- free(oldenv); /* avoid memory leak */
- }
- oldenv = env;
- return 0;
-}
-
-} // namespace test_auto_var_used_good
diff --git a/clang/test/Analysis/cxx-uninitialized-object.cpp b/clang/test/Analysis/cxx-uninitialized-object.cpp
index e3fa8ae..aee0dae 100644
--- a/clang/test/Analysis/cxx-uninitialized-object.cpp
+++ b/clang/test/Analysis/cxx-uninitialized-object.cpp
@@ -1114,27 +1114,27 @@ void fCXX11MemberInitTest1() {
CXX11MemberInitTest1();
}
+#ifdef PEDANTIC
struct CXX11MemberInitTest2 {
struct RecordType {
- // TODO: we'd expect the note: {{uninitialized field 'this->rec.a'}}
- int a; // no-note
- // TODO: we'd expect the note: {{uninitialized field 'this->rec.b'}}
- int b; // no-note
+ int a; // expected-note {{uninitialized field 'this->a'}}
+ int b; // expected-note {{uninitialized field 'this->b'}}
RecordType(int) {}
};
- RecordType rec = RecordType(int());
+ RecordType rec = RecordType(int()); // expected-warning {{2 uninitialized fields}}
int dontGetFilteredByNonPedanticMode = 0;
CXX11MemberInitTest2() {}
};
void fCXX11MemberInitTest2() {
- // TODO: we'd expect the warning: {{2 uninitializeds field}}
CXX11MemberInitTest2(); // no-warning
}
+#endif // PEDANTIC
+
//===----------------------------------------------------------------------===//
// "Esoteric" primitive type tests.
//===----------------------------------------------------------------------===//
diff --git a/clang/test/Analysis/cxxnewexpr-callback.cpp b/clang/test/Analysis/cxxnewexpr-callback.cpp
index fe7a9ff..7df58cf 100644
--- a/clang/test/Analysis/cxxnewexpr-callback.cpp
+++ b/clang/test/Analysis/cxxnewexpr-callback.cpp
@@ -9,7 +9,7 @@ void free(void *);
} // namespace std
void *operator new(size_t size) { return std::malloc(size); }
-void operator delete(void *ptr) { std::free(ptr); }
+void operator delete(void *ptr, size_t size) { std::free(ptr); }
struct S {
S() {}
@@ -49,7 +49,7 @@ void test() {
// CHECK-NEXT: PostCall (operator delete)
}
-void operator delete(void *ptr) {
+void operator delete(void *ptr, size_t size) {
std::free(ptr);
// CHECK-NO-INLINE-NEXT: PreCall (std::free)
// CHECK-NO-INLINE-NEXT: PostCall (std::free)
diff --git a/clang/test/Analysis/lifetime-extended-regions.cpp b/clang/test/Analysis/lifetime-extended-regions.cpp
index 4e98bd4..524f4e0 100644
--- a/clang/test/Analysis/lifetime-extended-regions.cpp
+++ b/clang/test/Analysis/lifetime-extended-regions.cpp
@@ -120,11 +120,11 @@ void aggregateWithReferences() {
clang_analyzer_dump(viaReference); // expected-warning-re {{&lifetime_extended_object{RefAggregate, viaReference, S{{[0-9]+}}} }}
clang_analyzer_dump(viaReference.rx); // expected-warning-re {{&lifetime_extended_object{int, viaReference, S{{[0-9]+}}} }}
clang_analyzer_dump(viaReference.ry); // expected-warning-re {{&lifetime_extended_object{Composite, viaReference, S{{[0-9]+}}} }}
-
- // clang does not currently implement extending lifetime of object bound to reference members of aggregates,
- // that are created from default member initializer (see `warn_unsupported_lifetime_extension` from `-Wdangling`)
- RefAggregate defaultInitExtended{i}; // clang-bug does not extend `Composite`
- clang_analyzer_dump(defaultInitExtended.ry); // expected-warning {{Unknown }}
+
+ // The lifetime of the object bound to reference members of aggregates
+ // created from a default member initializer is now extended.
+ RefAggregate defaultInitExtended{i};
+ clang_analyzer_dump(defaultInitExtended.ry); // expected-warning-re {{&lifetime_extended_object{Composite, defaultInitExtended, S{{[0-9]+}}} }}
}
void lambda() {
diff --git a/clang/test/Analysis/putenv-stack-array.c b/clang/test/Analysis/putenv-stack-array.c
new file mode 100644
index 0000000..fbbf932
--- /dev/null
+++ b/clang/test/Analysis/putenv-stack-array.c
@@ -0,0 +1,70 @@
+// RUN: %clang_analyze_cc1 \
+// RUN: -analyzer-checker=alpha.security.PutenvStackArray \
+// RUN: -verify %s
+
+#include "Inputs/system-header-simulator.h"
+void free(void *);
+void *malloc(size_t);
+int putenv(char *);
+int snprintf(char *, size_t, const char *, ...);
+
+int test_auto_var(const char *var) {
+ char env[1024];
+ (void)snprintf(env, sizeof(env), "TEST=%s", var);
+ return putenv(env); // expected-warning{{The 'putenv' function should not be called with arrays that have automatic storage}}
+}
+
+int test_static_var(const char *var) {
+ static char env[1024];
+ (void)snprintf(env, sizeof(env), "TEST=%s", var);
+ return putenv(env); // no-warning: static array is used
+}
+
+void test_heap_memory(const char *var) {
+ const char *env_format = "TEST=%s";
+ const size_t len = strlen(var) + strlen(env_format);
+ char *env = (char *)malloc(len);
+ if (env == NULL)
+ return;
+ if (putenv(env) != 0) // no-warning: env was dynamically allocated.
+ free(env);
+}
+
+typedef struct {
+ int A;
+ char Env[1024];
+} Mem;
+
+int test_auto_var_struct() {
+ Mem mem;
+ return putenv(mem.Env); // expected-warning{{The 'putenv' function should not be called with}}
+}
+
+int test_auto_var_subarray() {
+ char env[1024];
+ return putenv(env + 100); // expected-warning{{The 'putenv' function should not be called with}}
+}
+
+int test_constant() {
+ char *env = "TEST";
+ return putenv(env); // no-warning: data is not on the stack
+}
+
+extern char *ext_env;
+int test_extern() {
+ return putenv(ext_env); // no-warning: extern storage class.
+}
+
+void test_auto_var_reset() {
+ char env[] = "NAME=value";
+ putenv(env); // expected-warning{{The 'putenv' function should not be called with}}
+ // ... (do something)
+ // Even cases like this are likely a bug:
+ // Once a string has been passed to putenv, it should not be deallocated
+ // (or reused) at all, because reading the environment variable returns a
+ // pointer into that string.
+ // In this case, if another (or the same) thread reads variable "NAME"
+ // at this point and does not copy the returned string, the data may
+ // become invalid.
+ putenv((char *)"NAME=anothervalue");
+}
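For context (not part of the patch): putenv keeps the caller's pointer in the environment rather than copying the string, which is why both the stack-array case and the reuse case above are problematic. A hedged sketch of an alternative, assuming a POSIX environment where setenv is available (on common implementations setenv copies its arguments, so a stack buffer may safely be reused after the call):

#include <stdio.h>
#include <stdlib.h>

/* Sets TEST=<var> via setenv instead of putenv; the stack buffer only
   holds the value temporarily and is safe to reuse afterwards. */
static int set_test_var(const char *var) {
  char value[1024];
  int n = snprintf(value, sizeof(value), "%s", var);
  if (n < 0 || (size_t)n >= sizeof(value))
    return -1; /* formatting error or truncation */
  return setenv("TEST", value, /*overwrite=*/1);
}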
diff --git a/clang/test/Analysis/setgid-setuid-order-notes.c b/clang/test/Analysis/setgid-setuid-order-notes.c
new file mode 100644
index 0000000..0340241
--- /dev/null
+++ b/clang/test/Analysis/setgid-setuid-order-notes.c
@@ -0,0 +1,73 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,security.SetgidSetuidOrder -analyzer-output=text -verify %s
+
+typedef int uid_t;
+typedef int gid_t;
+
+int setuid(uid_t);
+int setgid(gid_t);
+
+uid_t getuid();
+gid_t getgid();
+
+
+
+void test_note_1() {
+ if (setuid(getuid()) == -1) // expected-note{{Assuming the condition is false}} \
+ // expected-note{{Taking false branch}}
+ return;
+ if (setuid(getuid()) == -1) // expected-note{{Call to 'setuid' found here that removes superuser privileges}} \
+ // expected-note{{Assuming the condition is false}} \
+ // expected-note{{Taking false branch}}
+ return;
+ if (setgid(getgid()) == -1) // expected-warning{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}} \
+ // expected-note{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}}
+ return;
+}
+
+void test_note_2() {
+ if (setuid(getuid()) == -1) // expected-note{{Call to 'setuid' found here that removes superuser privileges}} \
+ // expected-note 2 {{Assuming the condition is false}} \
+ // expected-note 2 {{Taking false branch}}
+ return;
+ if (setgid(getgid()) == -1) // expected-warning{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}} \
+ // expected-note{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}} \
+ // expected-note{{Assuming the condition is false}} \
+ // expected-note{{Taking false branch}}
+ return;
+ if (setuid(getuid()) == -1) // expected-note{{Call to 'setuid' found here that removes superuser privileges}} \
+ // expected-note{{Assuming the condition is false}} \
+ // expected-note{{Taking false branch}}
+ return;
+ if (setgid(getgid()) == -1) // expected-warning{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}} \
+ // expected-note{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}}
+ return;
+}
+
+int f_setuid() {
+ return setuid(getuid()); // expected-note{{Call to 'setuid' found here that removes superuser privileges}}
+}
+
+int f_setgid() {
+ return setgid(getgid()); // expected-warning{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}} \
+ // expected-note{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}}
+}
+
+void test_note_3() {
+ if (f_setuid() == -1) // expected-note{{Assuming the condition is false}} \
+ // expected-note{{Calling 'f_setuid'}} \
+ // expected-note{{Returning from 'f_setuid'}} \
+ // expected-note{{Taking false branch}}
+ return;
+ if (f_setgid() == -1) // expected-note{{Calling 'f_setgid'}}
+ return;
+}
+
+void test_note_4() {
+ if (setuid(getuid()) == 0) { // expected-note{{Assuming the condition is true}} \
+ // expected-note{{Call to 'setuid' found here that removes superuser privileges}} \
+ // expected-note{{Taking true branch}}
+ if (setgid(getgid()) == 0) { // expected-warning{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}} \
+ // expected-note{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}}
+ }
+ }
+}
diff --git a/clang/test/Analysis/setgid-setuid-order.c b/clang/test/Analysis/setgid-setuid-order.c
new file mode 100644
index 0000000..1c411aa
--- /dev/null
+++ b/clang/test/Analysis/setgid-setuid-order.c
@@ -0,0 +1,257 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,security.SetgidSetuidOrder -verify %s
+
+typedef int uid_t;
+typedef int gid_t;
+
+int setuid(uid_t);
+int setgid(gid_t);
+int seteuid(uid_t);
+int setegid(gid_t);
+int setreuid(uid_t, uid_t);
+int setregid(gid_t, gid_t);
+int setresuid(uid_t, uid_t, uid_t);
+int setresgid(gid_t, gid_t, gid_t);
+
+uid_t getuid();
+gid_t getgid();
+
+
+
+void correct_order() {
+ // A correct revocation sequence starts here.
+ if (setgid(getgid()) == -1)
+ return;
+ if (setuid(getuid()) == -1)
+ return;
+ // No warning for the following setgid statement.
+ // The previous setgid and setuid calls are a correct privilege revocation
+ // sequence. The checker does not care about the following statements (except
+ // if a wrong setuid-setgid sequence follows again).
+ if (setgid(getgid()) == -1)
+ return;
+}
+
+void incorrect_after_correct() {
+ if (setgid(getgid()) == -1)
+ return;
+ if (setuid(getuid()) == -1)
+ return;
+ // Incorrect sequence starts here.
+ if (setuid(getuid()) == -1)
+ return;
+ if (setgid(getgid()) == -1) // expected-warning{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}}
+ return;
+}
+
+void incorrect_order() {
+ if (setuid(getuid()) == -1)
+ return;
+ if (setgid(getgid()) == -1) // expected-warning{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}}
+ return;
+ if (setgid(getgid()) == -1)
+ return;
+}
+
+void warn_at_second_time() {
+ if (setuid(getuid()) == -1)
+ return;
+ if (setgid(getgid()) == -1) // expected-warning{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}}
+ return;
+ if (setuid(getuid()) == -1)
+ return;
+ if (setgid(getgid()) == -1) // expected-warning{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}}
+ return;
+}
+
+uid_t f_uid();
+gid_t f_gid();
+
+void setuid_other() {
+ if (setuid(f_uid()) == -1)
+ return;
+ if (setgid(getgid()) == -1)
+ return;
+}
+
+void setgid_other() {
+ if (setuid(getuid()) == -1)
+ return;
+ if (setgid(f_gid()) == -1)
+ return;
+ if (setgid(getgid()) == -1)
+ return;
+}
+
+void setuid_other_between() {
+ if (setuid(getuid()) == -1)
+ return;
+ if (setuid(f_uid()) == -1)
+ return;
+ if (setgid(getgid()) == -1)
+ return;
+}
+
+void setgid_with_getuid() {
+ if (setuid(getuid()) == -1)
+ return;
+ // add a clang-tidy check for this case?
+ if (setgid(getuid()) == -1)
+ return;
+}
+
+void setuid_with_getgid() {
+ // add a clang-tidy check for this case?
+ if (setuid(getgid()) == -1)
+ return;
+ if (setgid(getgid()) == -1)
+ return;
+}
+
+int f_setuid() {
+ return setuid(getuid());
+}
+
+int f_setgid() {
+ return setgid(getgid()); // expected-warning{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}}
+}
+
+void function_calls() {
+ if (f_setuid() == -1)
+ return;
+ if (f_setgid() == -1)
+ return;
+}
+
+void seteuid_between() {
+ if (setuid(getuid()) == -1)
+ return;
+ if (seteuid(getuid()) == -1)
+ return;
+ if (setgid(getgid()) == -1)
+ return;
+}
+
+void setegid_between() {
+ if (setuid(getuid()) == -1)
+ return;
+ if (setegid(getgid()) == -1)
+ return;
+ if (setgid(getgid()) == -1)
+ return;
+}
+
+void setreuid_between() {
+ if (setuid(getuid()) == -1)
+ return;
+ if (setreuid(getuid(), getuid()) == -1)
+ return;
+ if (setgid(getgid()) == -1)
+ return;
+}
+
+void setregid_between() {
+ if (setuid(getuid()) == -1)
+ return;
+ if (setregid(getgid(), getgid()) == -1)
+ return;
+ if (setgid(getgid()) == -1)
+ return;
+}
+
+void setresuid_between() {
+ if (setuid(getuid()) == -1)
+ return;
+ if (setresuid(getuid(), getuid(), getuid()) == -1)
+ return;
+ if (setgid(getgid()) == -1)
+ return;
+}
+
+void setresgid_between() {
+ if (setuid(getuid()) == -1)
+ return;
+ if (setresgid(getgid(), getgid(), getgid()) == -1)
+ return;
+ if (setgid(getgid()) == -1)
+ return;
+}
+
+void getgid_getuid_between() {
+ if (setuid(getuid()) == -1)
+ return;
+ (void)getgid();
+ (void)getuid();
+ if (setgid(getgid()) == -1) // expected-warning{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}}
+ return;
+}
+
+void stored_getgid_getuid() {
+ // possible future improvement: detect this case
+ uid_t u = getuid();
+ gid_t g = getgid();
+ if (setuid(u) == -1)
+ return;
+ if (setgid(g) == -1) // no warning
+ return;
+}
+
+void f_extern();
+
+void other_unknown_function_between() {
+ if (setuid(getuid()) == -1)
+ return;
+ f_extern();
+ if (setgid(getgid()) == -1) // expected-warning{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}}
+ return;
+}
+
+void setuid_error_case() {
+ if (setuid(getuid()) == -1) {
+ // No warning if we know that the first setuid call has failed.
+ (void)setgid(getgid());
+ return;
+ }
+ (void)setgid(getgid()); // expected-warning{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}}
+}
+
+void setuid_success_case() {
+ if (setuid(getuid()) == 0) {
+ if (setgid(getgid()) == 0) { // expected-warning{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}}
+ }
+ }
+}
+
+void incorrect_order_compare_zero() {
+ if (setuid(getuid()) != 0)
+ return;
+ (void)setgid(getgid()); // expected-warning{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}}
+}
+
+void setuid_error_case_compare_zero() {
+ if (setuid(getuid()) != 0) {
+ // No warning if we know that the first setuid call has failed.
+ (void)setgid(getgid());
+ return;
+ }
+}
+
+void incorrect_order_compare_other() {
+ if (setuid(getuid()) == -2) {
+ // This is a case for improvement:
+ // The checker does not recognize that this is an invalid error check,
+ // but that is really a different kind of bug, unrelated to this checker.
+ (void)setgid(getgid()); // warning should appear here
+ return;
+ }
+ if (setgid(getgid()) == -2) { // expected-warning{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}}
+ return;
+ }
+}
+
+const int FAIL = -1;
+
+void incorrect_order_compare_var() {
+ if (setuid(getuid()) == FAIL)
+ return;
+ (void)setgid(getgid()); // expected-warning{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}}
+}
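For context (not part of the patch): the checker encodes the rule that group privileges must be dropped before user privileges, because once setuid(getuid()) has given up the elevated user ID the process may no longer be permitted to change its group IDs, so a following setgid(getgid()) is likely to fail. A hedged sketch of the conventional full revocation order, assuming a Linux/BSD-style environment (setgroups is not part of POSIX):

#include <sys/types.h>
#include <unistd.h>
#include <grp.h>

/* Drop privileges in the order the checker expects:
   supplementary groups first, then the primary group, then the user. */
static int drop_privileges(uid_t uid, gid_t gid) {
  if (setgroups(0, (const gid_t *)0) != 0)
    return -1;
  if (setgid(gid) != 0)
    return -1;
  if (setuid(uid) != 0)
    return -1;
  return 0;
}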
diff --git a/clang/test/CXX/basic/basic.stc/basic.stc.dynamic/basic.stc.dynamic.deallocation/p2.cpp b/clang/test/CXX/basic/basic.stc/basic.stc.dynamic/basic.stc.dynamic.deallocation/p2.cpp
index 9e3210c..706549f 100644
--- a/clang/test/CXX/basic/basic.stc/basic.stc.dynamic/basic.stc.dynamic.deallocation/p2.cpp
+++ b/clang/test/CXX/basic/basic.stc/basic.stc.dynamic/basic.stc.dynamic.deallocation/p2.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++1z -fsized-deallocation -fexceptions -verify %s
+// RUN: %clang_cc1 -std=c++1z -fexceptions -verify %s
using size_t = decltype(sizeof(0));
diff --git a/clang/test/CXX/drs/cwg16xx.cpp b/clang/test/CXX/drs/cwg16xx.cpp
index cf6b45c..82ef871 100644
--- a/clang/test/CXX/drs/cwg16xx.cpp
+++ b/clang/test/CXX/drs/cwg16xx.cpp
@@ -483,8 +483,6 @@ namespace cwg1696 { // cwg1696: 7
const A &a = A(); // #cwg1696-D1-a
};
D1 d1 = {}; // #cwg1696-d1
- // since-cxx14-warning@-1 {{lifetime extension of temporary created by aggregate initialization using a default member initializer is not yet supported; lifetime of temporary will end at the end of the full-expression}}
- // since-cxx14-note@#cwg1696-D1-a {{initializing field 'a' with default member initializer}}
struct D2 {
const A &a = A(); // #cwg1696-D2-a
diff --git a/clang/test/CXX/drs/cwg18xx.cpp b/clang/test/CXX/drs/cwg18xx.cpp
index 3561507..b71a81b 100644
--- a/clang/test/CXX/drs/cwg18xx.cpp
+++ b/clang/test/CXX/drs/cwg18xx.cpp
@@ -206,19 +206,28 @@ namespace cwg1814 { // cwg1814: yes
#endif
}
-namespace cwg1815 { // cwg1815: no
+namespace cwg1815 { // cwg1815: 19
#if __cplusplus >= 201402L
- // FIXME: needs codegen test
- struct A { int &&r = 0; }; // #cwg1815-A
+ struct A { int &&r = 0; };
A a = {};
- // since-cxx14-warning@-1 {{lifetime extension of temporary created by aggregate initialization using a default member initializer is not yet supported; lifetime of temporary will end at the end of the full-expression}} FIXME
- // since-cxx14-note@#cwg1815-A {{initializing field 'r' with default member initializer}}
struct B { int &&r = 0; }; // #cwg1815-B
// since-cxx14-error@-1 {{reference member 'r' binds to a temporary object whose lifetime would be shorter than the lifetime of the constructed object}}
// since-cxx14-note@#cwg1815-B {{initializing field 'r' with default member initializer}}
// since-cxx14-note@#cwg1815-b {{in implicit default constructor for 'cwg1815::B' first required here}}
B b; // #cwg1815-b
+
+#if __cplusplus >= 201703L
+ struct C { const int &r = 0; };
+ constexpr C c = {}; // OK, since cwg1815
+ static_assert(c.r == 0);
+
+ constexpr int f() {
+ A a = {}; // OK, since cwg1815
+ return a.r;
+ }
+ static_assert(f() == 0);
+#endif
#endif
}
diff --git a/clang/test/CXX/drs/cwg28xx.cpp b/clang/test/CXX/drs/cwg28xx.cpp
index 696cd1b..8469a06 100644
--- a/clang/test/CXX/drs/cwg28xx.cpp
+++ b/clang/test/CXX/drs/cwg28xx.cpp
@@ -109,3 +109,74 @@ struct A {
#endif
} // namespace cwg2858
+
+namespace cwg2881 { // cwg2881: 19 tentatively ready 2024-04-19
+
+#if __cplusplus >= 202302L
+
+template <typename T> struct A : T {};
+template <typename T> struct B : T {};
+template <typename T> struct C : virtual T { C(T t) : T(t) {} };
+template <typename T> struct D : virtual T { D(T t) : T(t) {} };
+
+template <typename Ts>
+struct O1 : A<Ts>, B<Ts> {
+ using A<Ts>::operator();
+ using B<Ts>::operator();
+};
+
+template <typename Ts> struct O2 : protected Ts { // expected-note {{declared protected here}}
+ using Ts::operator();
+ O2(Ts ts) : Ts(ts) {}
+};
+
+template <typename Ts> struct O3 : private Ts { // expected-note {{declared private here}}
+ using Ts::operator();
+ O3(Ts ts) : Ts(ts) {}
+};
+
+// Not ambiguous because of virtual inheritance.
+template <typename Ts>
+struct O4 : C<Ts>, D<Ts> {
+ using C<Ts>::operator();
+ using D<Ts>::operator();
+ O4(Ts t) : Ts(t), C<Ts>(t), D<Ts>(t) {}
+};
+
+// This still has a public path to the lambda, and it's also not
+// ambiguous because of virtual inheritance.
+template <typename Ts>
+struct O5 : private C<Ts>, D<Ts> {
+ using C<Ts>::operator();
+ using D<Ts>::operator();
+ O5(Ts t) : Ts(t), C<Ts>(t), D<Ts>(t) {}
+};
+
+// This is only invalid if we call T's call operator.
+template <typename T, typename U>
+struct O6 : private T, U { // expected-note {{declared private here}}
+ using T::operator();
+ using U::operator();
+ O6(T t, U u) : T(t), U(u) {}
+};
+
+void f() {
+ int x;
+ auto L1 = [=](this auto&& self) { (void) &x; };
+ auto L2 = [&](this auto&& self) { (void) &x; };
+ O1<decltype(L1)>{L1, L1}(); // expected-error {{inaccessible due to ambiguity}}
+ O1<decltype(L2)>{L2, L2}(); // expected-error {{inaccessible due to ambiguity}}
+ O2{L1}(); // expected-error {{must derive publicly from the lambda}}
+ O3{L1}(); // expected-error {{must derive publicly from the lambda}}
+ O4{L1}();
+ O5{L1}();
+ O6 o{L1, L2};
+ o.decltype(L1)::operator()(); // expected-error {{must derive publicly from the lambda}}
+ o.decltype(L1)::operator()(); // No error here because we've already diagnosed this method.
+ o.decltype(L2)::operator()();
+}
+
+#endif
+
+} // namespace cwg2881
+
diff --git a/clang/test/CXX/drs/cwg292.cpp b/clang/test/CXX/drs/cwg292.cpp
index b05d3b92..a7bcbe6f 100644
--- a/clang/test/CXX/drs/cwg292.cpp
+++ b/clang/test/CXX/drs/cwg292.cpp
@@ -1,10 +1,10 @@
-// RUN: %clang_cc1 -std=c++98 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
-// RUN: %clang_cc1 -std=c++11 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
-// RUN: %clang_cc1 -std=c++14 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
-// RUN: %clang_cc1 -std=c++17 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
-// RUN: %clang_cc1 -std=c++20 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
-// RUN: %clang_cc1 -std=c++23 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
-// RUN: %clang_cc1 -std=c++2c %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++98 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,CXX98-11
+// RUN: %clang_cc1 -std=c++11 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,CXX98-11
+// RUN: %clang_cc1 -std=c++14 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX14
+// RUN: %clang_cc1 -std=c++17 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX14
+// RUN: %clang_cc1 -std=c++20 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX14
+// RUN: %clang_cc1 -std=c++23 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX14
+// RUN: %clang_cc1 -std=c++2c %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX14
namespace cwg292 { // cwg292: 2.9
@@ -23,7 +23,8 @@ void f() {
// CHECK: invoke {{.*}} i32 @cwg292::g()()
// CHECK-NEXT: to {{.*}} unwind label %lpad
// CHECK-LABEL: lpad:
-// CHECK: call void @operator delete(void*)(ptr {{.*}} %[[CALL]])
+// CXX98-11: call void @operator delete(void*)(ptr {{.*}} %[[CALL]])
+// SINCE-CXX14: call void @operator delete(void*, unsigned long)(ptr {{.*}} %[[CALL]], i64 noundef 1)
// CHECK-LABEL: eh.resume:
// CHECK-LABEL: }
diff --git a/clang/test/CXX/expr/expr.unary/expr.new/p14.cpp b/clang/test/CXX/expr/expr.unary/expr.new/p14.cpp
index 6537cdc..d0b24c8 100644
--- a/clang/test/CXX/expr/expr.unary/expr.new/p14.cpp
+++ b/clang/test/CXX/expr/expr.unary/expr.new/p14.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++1z -fsized-deallocation -fexceptions %s -verify
+// RUN: %clang_cc1 -std=c++1z -fexceptions %s -verify
using size_t = decltype(sizeof(0));
namespace std { enum class align_val_t : size_t {}; }
diff --git a/clang/test/CXX/expr/expr.unary/expr.sizeof/p5-0x.cpp b/clang/test/CXX/expr/expr.unary/expr.sizeof/p5-0x.cpp
index afd8ef0..19f9080 100644
--- a/clang/test/CXX/expr/expr.unary/expr.sizeof/p5-0x.cpp
+++ b/clang/test/CXX/expr/expr.unary/expr.sizeof/p5-0x.cpp
@@ -33,6 +33,6 @@ template<int Value> struct count_ints_2 {
template<typename ...Types> // expected-note{{parameter pack 'Types' declared here}}
struct count_types_2 {
static const unsigned value = sizeof... Type; // expected-error{{missing parentheses around the size of parameter pack 'Type'}} \
- // expected-error{{Type' does not refer to the name of a parameter pack; did you mean 'Types'?}}
+ // expected-error{{'Type' does not refer to the name of a parameter pack; did you mean 'Types'?}}
};
diff --git a/clang/test/CXX/special/class.temporary/p6.cpp b/clang/test/CXX/special/class.temporary/p6.cpp
index 5554363..a6d2adf 100644
--- a/clang/test/CXX/special/class.temporary/p6.cpp
+++ b/clang/test/CXX/special/class.temporary/p6.cpp
@@ -269,6 +269,40 @@ void init_capture_init_list() {
// CHECK: }
}
+void check_dr1815() { // dr1815: yes
+#if __cplusplus >= 201402L
+
+ struct A {
+ int &&r = 0;
+ ~A() {}
+ };
+
+ struct B {
+ A &&a = A{};
+ ~B() {}
+ };
+ B a = {};
+
+ // CHECK: call {{.*}}block_scope_begin_function
+ extern void block_scope_begin_function();
+ extern void block_scope_end_function();
+ block_scope_begin_function();
+ {
+ // CHECK: call void @_ZZ12check_dr1815vEN1BD1Ev
+ // CHECK: call void @_ZZ12check_dr1815vEN1AD1Ev
+ B b = {};
+ }
+ // CHECK: call {{.*}}block_scope_end_function
+ block_scope_end_function();
+
+ // CHECK: call {{.*}}some_other_function
+ extern void some_other_function();
+ some_other_function();
+ // CHECK: call void @_ZZ12check_dr1815vEN1BD1Ev
+ // CHECK: call void @_ZZ12check_dr1815vEN1AD1Ev
+#endif
+}
+
namespace P2718R0 {
namespace basic {
template <typename E> using T2 = std::list<E>;
diff --git a/clang/test/ClangScanDeps/response-file-clang-cl.c b/clang/test/ClangScanDeps/response-file-clang-cl.c
new file mode 100644
index 0000000..b543231
--- /dev/null
+++ b/clang/test/ClangScanDeps/response-file-clang-cl.c
@@ -0,0 +1,56 @@
+// Check that the scanner can adjust arguments by reading .rsp files in advance.
+
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+
+// First run the tests with a .cdb
+// RUN: sed -e "s|DIR|%/t|g" %t/cdb.json.template > %t/cdb.json
+// RUN: sed -e "s|DIR|%/t|g" %t/args_nested.template > %t/args_nested.rsp
+
+// RUN: cp %t/args_compilation.rsp %t/args.rsp
+// RUN: clang-scan-deps --compilation-database %t/cdb.json > %t/deps.json
+// RUN: cat %t/deps.json | sed 's:\\\\\?:/:g' | FileCheck -DPREFIX=%/t %s
+
+// RUN: cp %t/args_preprocess.rsp %t/args.rsp
+// RUN: clang-scan-deps --compilation-database %t/cdb.json > %t/deps.json
+// RUN: cat %t/deps.json | sed 's:\\\\\?:/:g' | FileCheck -DPREFIX=%/t %s
+
+
+// Now run the tests again with an in-place compilation database
+// RUN: cd %t
+
+// RUN: cp args_compilation.rsp args.rsp
+// RUN: clang-scan-deps -o deps.json -- %clang_cl @args.rsp
+// RUN: cat deps.json | sed 's:\\\\\?:/:g' | FileCheck -DPREFIX=%/t %s
+
+// RUN: cp args_preprocess.rsp args.rsp
+// RUN: clang-scan-deps -o deps.json -- %clang_cl @args.rsp
+// RUN: cat deps.json | sed 's:\\\\\?:/:g' | FileCheck -DPREFIX=%/t %s
+
+// Here we ensure that we got a qualified .obj with its full path, since that's what we're passing with /Fo
+// CHECK: [[PREFIX]]/tu.obj:
+
+//--- cdb.json.template
+[{
+ "file": "DIR/tu.cpp",
+ "directory": "DIR",
+ "command": "clang-cl @DIR/args.rsp"
+}]
+
+//--- args_compilation.rsp
+@args_nested.rsp
+/c
+
+//--- args_preprocess.rsp
+@args_nested.rsp
+/E
+
+//--- args_nested.template
+/I include
+tu.cpp
+/FoDIR/tu.obj
+
+//--- include/header.h
+
+//--- tu.cpp
+#include "header.h"
diff --git a/clang/test/CodeGen/SystemZ/sync-builtins-i128-8Al.c b/clang/test/CodeGen/SystemZ/sync-builtins-i128-8Al.c
index 76c9c0e..c678e9a 100644
--- a/clang/test/CodeGen/SystemZ/sync-builtins-i128-8Al.c
+++ b/clang/test/CodeGen/SystemZ/sync-builtins-i128-8Al.c
@@ -7,21 +7,21 @@
__int128 Ptr __attribute__((aligned(8)));
__int128 f1() {
-// CHECK: warning: __sync builtin operation MUST have natural alignment (consider using __atomic). [-Wsync-alignment]
+// CHECK: warning: __sync builtin operation must have natural alignment (consider using __atomic)
return __sync_fetch_and_add(&Ptr, 1);
}
__int128 f2() {
-// CHECK: warning: __sync builtin operation MUST have natural alignment (consider using __atomic). [-Wsync-alignment]
+// CHECK: warning: __sync builtin operation must have natural alignment (consider using __atomic)
return __sync_sub_and_fetch(&Ptr, 1);
}
__int128 f3() {
-// CHECK: warning: __sync builtin operation MUST have natural alignment (consider using __atomic). [-Wsync-alignment]
+// CHECK: warning: __sync builtin operation must have natural alignment (consider using __atomic)
return __sync_val_compare_and_swap(&Ptr, 0, 1);
}
void f4() {
-// CHECK: warning: __sync builtin operation MUST have natural alignment (consider using __atomic). [-Wsync-alignment]
+// CHECK: warning: __sync builtin operation must have natural alignment (consider using __atomic)
__sync_lock_release(&Ptr);
}
diff --git a/clang/test/CodeGen/X86/avx512er-builtins.c b/clang/test/CodeGen/X86/avx512er-builtins.c
deleted file mode 100644
index 11ec6aa..0000000
--- a/clang/test/CodeGen/X86/avx512er-builtins.c
+++ /dev/null
@@ -1,347 +0,0 @@
-// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512f -target-feature +avx512er -emit-llvm -o - -Wall | FileCheck %s
-
-
-#include <immintrin.h>
-
-__m512d test_mm512_rsqrt28_round_pd(__m512d a) {
- // CHECK-LABEL: @test_mm512_rsqrt28_round_pd
- // CHECK: @llvm.x86.avx512.rsqrt28.pd
- return _mm512_rsqrt28_round_pd(a, _MM_FROUND_NO_EXC);
-}
-
-__m512d test_mm512_mask_rsqrt28_round_pd(__m512d s, __mmask8 m, __m512d a) {
- // CHECK-LABEL: @test_mm512_mask_rsqrt28_round_pd
- // CHECK: @llvm.x86.avx512.rsqrt28.pd
- return _mm512_mask_rsqrt28_round_pd(s, m, a, _MM_FROUND_NO_EXC);
-}
-
-__m512d test_mm512_maskz_rsqrt28_round_pd(__mmask8 m, __m512d a) {
- // CHECK-LABEL: @test_mm512_maskz_rsqrt28_round_pd
- // CHECK: @llvm.x86.avx512.rsqrt28.pd
- return _mm512_maskz_rsqrt28_round_pd(m, a, _MM_FROUND_NO_EXC);
-}
-
-__m512d test_mm512_rsqrt28_pd(__m512d a) {
- // CHECK-LABEL: @test_mm512_rsqrt28_pd
- // CHECK: @llvm.x86.avx512.rsqrt28.pd
- return _mm512_rsqrt28_pd(a);
-}
-
-__m512d test_mm512_mask_rsqrt28_pd(__m512d s, __mmask8 m, __m512d a) {
- // CHECK-LABEL: @test_mm512_mask_rsqrt28_pd
- // CHECK: @llvm.x86.avx512.rsqrt28.pd
- return _mm512_mask_rsqrt28_pd(s, m, a);
-}
-
-__m512d test_mm512_maskz_rsqrt28_pd(__mmask8 m, __m512d a) {
- // CHECK-LABEL: @test_mm512_maskz_rsqrt28_pd
- // CHECK: @llvm.x86.avx512.rsqrt28.pd
- return _mm512_maskz_rsqrt28_pd(m, a);
-}
-
-__m512 test_mm512_rsqrt28_round_ps(__m512 a) {
- // CHECK-LABEL: @test_mm512_rsqrt28_round_ps
- // CHECK: @llvm.x86.avx512.rsqrt28.ps
- return _mm512_rsqrt28_round_ps(a, _MM_FROUND_NO_EXC);
-}
-
-__m512 test_mm512_mask_rsqrt28_round_ps(__m512 s, __mmask16 m, __m512 a) {
- // CHECK-LABEL: @test_mm512_mask_rsqrt28_round_ps
- // CHECK: @llvm.x86.avx512.rsqrt28.ps
- return _mm512_mask_rsqrt28_round_ps(s, m, a, _MM_FROUND_NO_EXC);
-}
-
-__m512 test_mm512_maskz_rsqrt28_round_ps(__mmask16 m, __m512 a) {
- // CHECK-LABEL: @test_mm512_maskz_rsqrt28_round_ps
- // CHECK: @llvm.x86.avx512.rsqrt28.ps
- return _mm512_maskz_rsqrt28_round_ps(m, a, _MM_FROUND_NO_EXC);
-}
-
-__m512 test_mm512_rsqrt28_ps(__m512 a) {
- // CHECK-LABEL: @test_mm512_rsqrt28_ps
- // CHECK: @llvm.x86.avx512.rsqrt28.ps
- return _mm512_rsqrt28_ps(a);
-}
-
-__m512 test_mm512_mask_rsqrt28_ps(__m512 s, __mmask16 m, __m512 a) {
- // CHECK-LABEL: @test_mm512_mask_rsqrt28_ps
- // CHECK: @llvm.x86.avx512.rsqrt28.ps
- return _mm512_mask_rsqrt28_ps(s, m, a);
-}
-
-__m512 test_mm512_maskz_rsqrt28_ps(__mmask16 m, __m512 a) {
- // CHECK-LABEL: @test_mm512_maskz_rsqrt28_ps
- // CHECK: @llvm.x86.avx512.rsqrt28.ps
- return _mm512_maskz_rsqrt28_ps(m, a);
-}
-
-__m128 test_mm_rsqrt28_round_ss(__m128 a, __m128 b) {
- // CHECK-LABEL: @test_mm_rsqrt28_round_ss
- // CHECK: @llvm.x86.avx512.rsqrt28.ss
- return _mm_rsqrt28_round_ss(a, b, _MM_FROUND_NO_EXC);
-}
-
-__m128 test_mm_mask_rsqrt28_round_ss(__m128 s, __mmask16 m, __m128 a, __m128 b) {
- // CHECK-LABEL: @test_mm_mask_rsqrt28_round_ss
- // CHECK: @llvm.x86.avx512.rsqrt28.ss
- return _mm_mask_rsqrt28_round_ss(s, m, a, b, _MM_FROUND_NO_EXC);
-}
-
-__m128 test_mm_maskz_rsqrt28_round_ss(__mmask16 m, __m128 a, __m128 b) {
- // CHECK-LABEL: @test_mm_maskz_rsqrt28_round_ss
- // CHECK: @llvm.x86.avx512.rsqrt28.ss
- return _mm_maskz_rsqrt28_round_ss(m, a, b, _MM_FROUND_NO_EXC);
-}
-
-__m128 test_mm_rsqrt28_ss(__m128 a, __m128 b) {
- // CHECK-LABEL: @test_mm_rsqrt28_ss
- // CHECK: @llvm.x86.avx512.rsqrt28.ss
- return _mm_rsqrt28_ss(a, b);
-}
-
-__m128 test_mm_mask_rsqrt28_ss(__m128 s, __mmask16 m, __m128 a, __m128 b) {
- // CHECK-LABEL: @test_mm_mask_rsqrt28_ss
- // CHECK: @llvm.x86.avx512.rsqrt28.ss
- return _mm_mask_rsqrt28_ss(s, m, a, b);
-}
-
-__m128 test_mm_maskz_rsqrt28_ss(__mmask16 m, __m128 a, __m128 b) {
- // CHECK-LABEL: @test_mm_maskz_rsqrt28_ss
- // CHECK: @llvm.x86.avx512.rsqrt28.ss
- return _mm_maskz_rsqrt28_ss(m, a, b);
-}
-
-__m128d test_mm_rsqrt28_round_sd(__m128d a, __m128d b) {
- // CHECK-LABEL: @test_mm_rsqrt28_round_sd
- // CHECK: @llvm.x86.avx512.rsqrt28.sd
- return _mm_rsqrt28_round_sd(a, b, _MM_FROUND_NO_EXC);
-}
-
-__m128d test_mm_mask_rsqrt28_round_sd(__m128d s, __mmask8 m, __m128d a, __m128d b) {
- // CHECK-LABEL: @test_mm_mask_rsqrt28_round_sd
- // CHECK: @llvm.x86.avx512.rsqrt28.sd
- return _mm_mask_rsqrt28_round_sd(s, m, a, b, _MM_FROUND_NO_EXC);
-}
-
-__m128d test_mm_maskz_rsqrt28_round_sd(__mmask8 m, __m128d a, __m128d b) {
- // CHECK-LABEL: @test_mm_maskz_rsqrt28_round_sd
- // CHECK: @llvm.x86.avx512.rsqrt28.sd
- return _mm_maskz_rsqrt28_round_sd(m, a, b, _MM_FROUND_NO_EXC);
-}
-
-__m512d test_mm512_rcp28_round_pd(__m512d a) {
- // CHECK-LABEL: @test_mm512_rcp28_round_pd
- // CHECK: @llvm.x86.avx512.rcp28.pd
- return _mm512_rcp28_round_pd(a, _MM_FROUND_NO_EXC);
-}
-
-__m512d test_mm512_mask_rcp28_round_pd(__m512d s, __mmask8 m, __m512d a) {
- // CHECK-LABEL: @test_mm512_mask_rcp28_round_pd
- // CHECK: @llvm.x86.avx512.rcp28.pd
- return _mm512_mask_rcp28_round_pd(s, m, a, _MM_FROUND_NO_EXC);
-}
-
-__m512d test_mm512_maskz_rcp28_round_pd(__mmask8 m, __m512d a) {
- // CHECK-LABEL: @test_mm512_maskz_rcp28_round_pd
- // CHECK: @llvm.x86.avx512.rcp28.pd
- return _mm512_maskz_rcp28_round_pd(m, a, _MM_FROUND_NO_EXC);
-}
-
-__m512d test_mm512_rcp28_pd(__m512d a) {
- // CHECK-LABEL: @test_mm512_rcp28_pd
- // CHECK: @llvm.x86.avx512.rcp28.pd
- return _mm512_rcp28_pd(a);
-}
-
-__m512d test_mm512_mask_rcp28_pd(__m512d s, __mmask8 m, __m512d a) {
- // CHECK-LABEL: @test_mm512_mask_rcp28_pd
- // CHECK: @llvm.x86.avx512.rcp28.pd
- return _mm512_mask_rcp28_pd(s, m, a);
-}
-
-__m512d test_mm512_maskz_rcp28_pd(__mmask8 m, __m512d a) {
- // CHECK-LABEL: @test_mm512_maskz_rcp28_pd
- // CHECK: @llvm.x86.avx512.rcp28.pd
- return _mm512_maskz_rcp28_pd(m, a);
-}
-
-__m512 test_mm512_rcp28_round_ps(__m512 a) {
- // CHECK-LABEL: @test_mm512_rcp28_round_ps
- // CHECK: @llvm.x86.avx512.rcp28.ps
- return _mm512_rcp28_round_ps(a, _MM_FROUND_NO_EXC);
-}
-
-__m512 test_mm512_mask_rcp28_round_ps(__m512 s, __mmask16 m, __m512 a) {
- // CHECK-LABEL: @test_mm512_mask_rcp28_round_ps
- // CHECK: @llvm.x86.avx512.rcp28.ps
- return _mm512_mask_rcp28_round_ps(s, m, a, _MM_FROUND_NO_EXC);
-}
-
-__m512 test_mm512_maskz_rcp28_round_ps(__mmask16 m, __m512 a) {
- // CHECK-LABEL: @test_mm512_maskz_rcp28_round_ps
- // CHECK: @llvm.x86.avx512.rcp28.ps
- return _mm512_maskz_rcp28_round_ps(m, a, _MM_FROUND_NO_EXC);
-}
-
-__m512 test_mm512_rcp28_ps(__m512 a) {
- // CHECK-LABEL: @test_mm512_rcp28_ps
- // CHECK: @llvm.x86.avx512.rcp28.ps
- return _mm512_rcp28_ps(a);
-}
-
-__m512 test_mm512_mask_rcp28_ps(__m512 s, __mmask16 m, __m512 a) {
- // CHECK-LABEL: @test_mm512_mask_rcp28_ps
- // CHECK: @llvm.x86.avx512.rcp28.ps
- return _mm512_mask_rcp28_ps(s, m, a);
-}
-
-__m512 test_mm512_maskz_rcp28_ps(__mmask16 m, __m512 a) {
- // CHECK-LABEL: @test_mm512_maskz_rcp28_ps
- // CHECK: @llvm.x86.avx512.rcp28.ps
- return _mm512_maskz_rcp28_ps(m, a);
-}
-
-__m128 test_mm_rcp28_round_ss(__m128 a, __m128 b) {
- // CHECK-LABEL: @test_mm_rcp28_round_ss
- // CHECK: @llvm.x86.avx512.rcp28.ss
- return _mm_rcp28_round_ss(a, b, _MM_FROUND_NO_EXC);
-}
-
-__m128 test_mm_mask_rcp28_round_ss(__m128 s, __mmask16 m, __m128 a, __m128 b) {
- // CHECK-LABEL: @test_mm_mask_rcp28_round_ss
- // CHECK: @llvm.x86.avx512.rcp28.ss
- return _mm_mask_rcp28_round_ss(s, m, a, b, _MM_FROUND_NO_EXC);
-}
-
-__m128 test_mm_maskz_rcp28_round_ss(__mmask16 m, __m128 a, __m128 b) {
- // CHECK-LABEL: @test_mm_maskz_rcp28_round_ss
- // CHECK: @llvm.x86.avx512.rcp28.ss
- return _mm_maskz_rcp28_round_ss(m, a, b, _MM_FROUND_NO_EXC);
-}
-
-__m128 test_mm_rcp28_ss(__m128 a, __m128 b) {
- // CHECK-LABEL: @test_mm_rcp28_ss
- // CHECK: @llvm.x86.avx512.rcp28.ss
- return _mm_rcp28_ss(a, b);
-}
-
-__m128 test_mm_mask_rcp28_ss(__m128 s, __mmask16 m, __m128 a, __m128 b) {
- // CHECK-LABEL: @test_mm_mask_rcp28_ss
- // CHECK: @llvm.x86.avx512.rcp28.ss
- return _mm_mask_rcp28_ss(s, m, a, b);
-}
-
-__m128 test_mm_maskz_rcp28_ss(__mmask16 m, __m128 a, __m128 b) {
- // CHECK-LABEL: @test_mm_maskz_rcp28_ss
- // CHECK: @llvm.x86.avx512.rcp28.ss
- return _mm_maskz_rcp28_ss(m, a, b);
-}
-
-__m128d test_mm_rcp28_round_sd(__m128d a, __m128d b) {
- // CHECK-LABEL: @test_mm_rcp28_round_sd
- // CHECK: @llvm.x86.avx512.rcp28.sd
- return _mm_rcp28_round_sd(a, b, _MM_FROUND_NO_EXC);
-}
-
-__m128d test_mm_mask_rcp28_round_sd(__m128d s, __mmask8 m, __m128d a, __m128d b) {
- // CHECK-LABEL: @test_mm_mask_rcp28_round_sd
- // CHECK: @llvm.x86.avx512.rcp28.sd
- return _mm_mask_rcp28_round_sd(s, m, a, b, _MM_FROUND_NO_EXC);
-}
-
-__m128d test_mm_maskz_rcp28_round_sd(__mmask8 m, __m128d a, __m128d b) {
- // CHECK-LABEL: @test_mm_maskz_rcp28_round_sd
- // CHECK: @llvm.x86.avx512.rcp28.sd
- return _mm_maskz_rcp28_round_sd(m, a, b, _MM_FROUND_NO_EXC);
-}
-
-__m128d test_mm_rcp28_sd(__m128d a, __m128d b) {
- // CHECK-LABEL: @test_mm_rcp28_sd
- // CHECK: @llvm.x86.avx512.rcp28.sd
- return _mm_rcp28_sd(a, b);
-}
-
-__m128d test_mm_mask_rcp28_sd(__m128d s, __mmask8 m, __m128d a, __m128d b) {
- // CHECK-LABEL: @test_mm_mask_rcp28_sd
- // CHECK: @llvm.x86.avx512.rcp28.sd
- return _mm_mask_rcp28_sd(s, m, a, b);
-}
-
-__m128d test_mm_maskz_rcp28_sd(__mmask8 m, __m128d a, __m128d b) {
- // CHECK-LABEL: @test_mm_maskz_rcp28_sd
- // CHECK: @llvm.x86.avx512.rcp28.sd
- return _mm_maskz_rcp28_sd(m, a, b);
-}
-
-__m512d test_mm512_exp2a23_round_pd(__m512d a) {
- // CHECK-LABEL: @test_mm512_exp2a23_round_pd
- // CHECK: @llvm.x86.avx512.exp2.pd
- return _mm512_exp2a23_round_pd(a, _MM_FROUND_NO_EXC);
-}
-
-__m512d test_mm512_mask_exp2a23_round_pd(__m512d s, __mmask8 m, __m512d a) {
- // CHECK-LABEL: @test_mm512_mask_exp2a23_round_pd
- // CHECK: @llvm.x86.avx512.exp2.pd
- return _mm512_mask_exp2a23_round_pd(s, m, a, _MM_FROUND_NO_EXC);
-}
-
-__m512d test_mm512_maskz_exp2a23_round_pd(__mmask8 m, __m512d a) {
- // CHECK-LABEL: @test_mm512_maskz_exp2a23_round_pd
- // CHECK: @llvm.x86.avx512.exp2.pd
- return _mm512_maskz_exp2a23_round_pd(m, a, _MM_FROUND_NO_EXC);
-}
-
-__m512d test_mm512_exp2a23_pd(__m512d a) {
- // CHECK-LABEL: @test_mm512_exp2a23_pd
- // CHECK: @llvm.x86.avx512.exp2.pd
- return _mm512_exp2a23_pd(a);
-}
-
-__m512d test_mm512_mask_exp2a23_pd(__m512d s, __mmask8 m, __m512d a) {
- // CHECK-LABEL: @test_mm512_mask_exp2a23_pd
- // CHECK: @llvm.x86.avx512.exp2.pd
- return _mm512_mask_exp2a23_pd(s, m, a);
-}
-
-__m512d test_mm512_maskz_exp2a23_pd(__mmask8 m, __m512d a) {
- // CHECK-LABEL: @test_mm512_maskz_exp2a23_pd
- // CHECK: @llvm.x86.avx512.exp2.pd
- return _mm512_maskz_exp2a23_pd(m, a);
-}
-
-__m512 test_mm512_exp2a23_round_ps(__m512 a) {
- // CHECK-LABEL: @test_mm512_exp2a23_round_ps
- // CHECK: @llvm.x86.avx512.exp2.ps
- return _mm512_exp2a23_round_ps(a, _MM_FROUND_NO_EXC);
-}
-
-__m512 test_mm512_mask_exp2a23_round_ps(__m512 s, __mmask16 m, __m512 a) {
- // CHECK-LABEL: @test_mm512_mask_exp2a23_round_ps
- // CHECK: @llvm.x86.avx512.exp2.ps
- return _mm512_mask_exp2a23_round_ps(s, m, a, _MM_FROUND_NO_EXC);
-}
-
-__m512 test_mm512_maskz_exp2a23_round_ps(__mmask16 m, __m512 a) {
- // CHECK-LABEL: @test_mm512_maskz_exp2a23_round_ps
- // CHECK: @llvm.x86.avx512.exp2.ps
- return _mm512_maskz_exp2a23_round_ps(m, a, _MM_FROUND_NO_EXC);
-}
-
-__m512 test_mm512_exp2a23_ps(__m512 a) {
- // CHECK-LABEL: @test_mm512_exp2a23_ps
- // CHECK: @llvm.x86.avx512.exp2.ps
- return _mm512_exp2a23_ps(a);
-}
-
-__m512 test_mm512_mask_exp2a23_ps(__m512 s, __mmask16 m, __m512 a) {
- // CHECK-LABEL: @test_mm512_mask_exp2a23_ps
- // CHECK: @llvm.x86.avx512.exp2.ps
- return _mm512_mask_exp2a23_ps(s, m, a);
-}
-
-__m512 test_mm512_maskz_exp2a23_ps(__mmask16 m, __m512 a) {
- // CHECK-LABEL: @test_mm512_maskz_exp2a23_ps
- // CHECK: @llvm.x86.avx512.exp2.ps
- return _mm512_maskz_exp2a23_ps(m, a);
-}
-
diff --git a/clang/test/CodeGen/X86/avx512pf-builtins.c b/clang/test/CodeGen/X86/avx512pf-builtins.c
deleted file mode 100644
index 3a117ed..0000000
--- a/clang/test/CodeGen/X86/avx512pf-builtins.c
+++ /dev/null
@@ -1,100 +0,0 @@
-// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512pf -emit-llvm -o - -Wall | FileCheck %s
-
-
-#include <immintrin.h>
-
-void test_mm512_mask_prefetch_i32gather_pd(__m256i index, __mmask8 mask, void const *addr) {
- // CHECK-LABEL: @test_mm512_mask_prefetch_i32gather_pd
- // CHECK: @llvm.x86.avx512.gatherpf.dpd
- return _mm512_mask_prefetch_i32gather_pd(index, mask, addr, 2, _MM_HINT_T0);
-}
-
-void test_mm512_prefetch_i32gather_pd(__m256i index, void const *addr) {
- // CHECK-LABEL: @test_mm512_prefetch_i32gather_pd
- // CHECK: @llvm.x86.avx512.gatherpf.dpd
- return _mm512_prefetch_i32gather_pd(index, addr, 2, _MM_HINT_T0);
-}
-
-void test_mm512_mask_prefetch_i32gather_ps(__m512i index, __mmask16 mask, void const *addr) {
- // CHECK-LABEL: @test_mm512_mask_prefetch_i32gather_ps
- // CHECK: @llvm.x86.avx512.gatherpf.dps
- return _mm512_mask_prefetch_i32gather_ps(index, mask, addr, 2, _MM_HINT_T0);
-}
-
-void test_mm512_prefetch_i32gather_ps(__m512i index, void const *addr) {
- // CHECK-LABEL: @test_mm512_prefetch_i32gather_ps
- // CHECK: @llvm.x86.avx512.gatherpf.dps
- return _mm512_prefetch_i32gather_ps(index, addr, 2, _MM_HINT_T0);
-}
-
-void test_mm512_mask_prefetch_i64gather_pd(__m512i index, __mmask8 mask, void const *addr) {
- // CHECK-LABEL: @test_mm512_mask_prefetch_i64gather_pd
- // CHECK: @llvm.x86.avx512.gatherpf.qpd
- return _mm512_mask_prefetch_i64gather_pd(index, mask, addr, 2, _MM_HINT_T0);
-}
-
-void test_mm512_prefetch_i64gather_pd(__m512i index, void const *addr) {
- // CHECK-LABEL: @test_mm512_prefetch_i64gather_pd
- // CHECK: @llvm.x86.avx512.gatherpf.qpd
- return _mm512_prefetch_i64gather_pd(index, addr, 2, _MM_HINT_T0);
-}
-
-void test_mm512_mask_prefetch_i64gather_ps(__m512i index, __mmask8 mask, void const *addr) {
- // CHECK-LABEL: @test_mm512_mask_prefetch_i64gather_ps
- // CHECK: @llvm.x86.avx512.gatherpf.qps
- return _mm512_mask_prefetch_i64gather_ps(index, mask, addr, 2, _MM_HINT_T0);
-}
-
-void test_mm512_prefetch_i64gather_ps(__m512i index, void const *addr) {
- // CHECK-LABEL: @test_mm512_prefetch_i64gather_ps
- // CHECK: @llvm.x86.avx512.gatherpf.qps
- return _mm512_prefetch_i64gather_ps(index, addr, 2, _MM_HINT_T0);
-}
-
-void test_mm512_prefetch_i32scatter_pd(void *addr, __m256i index) {
- // CHECK-LABEL: @test_mm512_prefetch_i32scatter_pd
- // CHECK: @llvm.x86.avx512.scatterpf.dpd.512
- return _mm512_prefetch_i32scatter_pd(addr, index, 1, _MM_HINT_T1);
-}
-
-void test_mm512_mask_prefetch_i32scatter_pd(void *addr, __mmask8 mask, __m256i index) {
- // CHECK-LABEL: @test_mm512_mask_prefetch_i32scatter_pd
- // CHECK: @llvm.x86.avx512.scatterpf.dpd.512
- return _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, 1, _MM_HINT_T1);
-}
-
-void test_mm512_prefetch_i32scatter_ps(void *addr, __m512i index) {
- // CHECK-LABEL: @test_mm512_prefetch_i32scatter_ps
- // CHECK: @llvm.x86.avx512.scatterpf.dps.512
- return _mm512_prefetch_i32scatter_ps(addr, index, 1, _MM_HINT_T1);
-}
-
-void test_mm512_mask_prefetch_i32scatter_ps(void *addr, __mmask16 mask, __m512i index) {
- // CHECK-LABEL: @test_mm512_mask_prefetch_i32scatter_ps
- // CHECK: @llvm.x86.avx512.scatterpf.dps.512
- return _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, 1, _MM_HINT_T1);
-}
-
-void test_mm512_prefetch_i64scatter_pd(void *addr, __m512i index) {
- // CHECK-LABEL: @test_mm512_prefetch_i64scatter_pd
- // CHECK: @llvm.x86.avx512.scatterpf.qpd.512
- return _mm512_prefetch_i64scatter_pd(addr, index, 1, _MM_HINT_T1);
-}
-
-void test_mm512_mask_prefetch_i64scatter_pd(void *addr, __mmask16 mask, __m512i index) {
- // CHECK-LABEL: @test_mm512_mask_prefetch_i64scatter_pd
- // CHECK: @llvm.x86.avx512.scatterpf.qpd.512
- return _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, 1, _MM_HINT_T1);
-}
-
-void test_mm512_prefetch_i64scatter_ps(void *addr, __m512i index) {
- // CHECK-LABEL: @test_mm512_prefetch_i64scatter_ps
- // CHECK: @llvm.x86.avx512.scatterpf.qps.512
- return _mm512_prefetch_i64scatter_ps(addr, index, 1, _MM_HINT_T1);
-}
-
-void test_mm512_mask_prefetch_i64scatter_ps(void *addr, __mmask16 mask, __m512i index) {
- // CHECK-LABEL: @test_mm512_mask_prefetch_i64scatter_ps
- // CHECK: @llvm.x86.avx512.scatterpf.qps.512
- return _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, 1, _MM_HINT_T1);
-}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c
index c442d2c..d894e98 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c
@@ -2,12 +2,14 @@
// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-#include <arm_sme.h>
+#include <arm_sve.h>
#if defined __ARM_FEATURE_SME
#define MODE_ATTR __arm_streaming
@@ -16,7 +18,7 @@
#endif
#ifdef SVE_OVERLOADED_FORMS
-// A simple used,unused... macro, long enough to represent any SVE builtin.§
+// A simple used,unused... macro, long enough to represent any SVE builtin.
#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
#else
#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret-bfloat.c
index bf2cd23..41208bf 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret-bfloat.c
@@ -4,6 +4,10 @@
// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x2 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE2
// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x3 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE3
// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x4 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE4
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x2 -triple aarch64 -target-feature +sme -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE2
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x3 -triple aarch64 -target-feature +sme -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE3
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x4 -triple aarch64 -target-feature +sme -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE4
// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x2 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE2
// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x3 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE3
@@ -18,9 +22,16 @@
// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -DTUPLE=x4 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE4
// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
#include <arm_sve.h>
+#if defined __ARM_FEATURE_SME
+#define MODE_ATTR __arm_streaming
+#else
+#define MODE_ATTR
+#endif
+
#ifdef TUPLE
#define TYPE_1(base,tuple) base ## tuple ## _t
#define TYPE_0(base,tuple) TYPE_1(base,tuple)
@@ -81,7 +92,7 @@
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svint8) test_svreinterpret_s8_bf16(TYPE(svbfloat16) op) {
+TYPE(svint8) test_svreinterpret_s8_bf16(TYPE(svbfloat16) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_s8, _bf16)(op);
}
@@ -125,7 +136,7 @@ TYPE(svint8) test_svreinterpret_s8_bf16(TYPE(svbfloat16) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svint16) test_svreinterpret_s16_bf16(TYPE(svbfloat16) op) {
+TYPE(svint16) test_svreinterpret_s16_bf16(TYPE(svbfloat16) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_s16, _bf16)(op);
}
@@ -169,7 +180,7 @@ TYPE(svint16) test_svreinterpret_s16_bf16(TYPE(svbfloat16) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svint32) test_svreinterpret_s32_bf16(TYPE(svbfloat16) op) {
+TYPE(svint32) test_svreinterpret_s32_bf16(TYPE(svbfloat16) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_s32, _bf16)(op);
}
// CHECK-LABEL: @test_svreinterpret_s64_bf16(
@@ -212,7 +223,7 @@ TYPE(svint32) test_svreinterpret_s32_bf16(TYPE(svbfloat16) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svint64) test_svreinterpret_s64_bf16(TYPE(svbfloat16) op) {
+TYPE(svint64) test_svreinterpret_s64_bf16(TYPE(svbfloat16) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_s64, _bf16)(op);
}
@@ -256,7 +267,7 @@ TYPE(svint64) test_svreinterpret_s64_bf16(TYPE(svbfloat16) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svuint8) test_svreinterpret_u8_bf16(TYPE(svbfloat16) op) {
+TYPE(svuint8) test_svreinterpret_u8_bf16(TYPE(svbfloat16) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_u8, _bf16)(op);
}
@@ -300,7 +311,7 @@ TYPE(svuint8) test_svreinterpret_u8_bf16(TYPE(svbfloat16) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svuint16) test_svreinterpret_u16_bf16(TYPE(svbfloat16) op) {
+TYPE(svuint16) test_svreinterpret_u16_bf16(TYPE(svbfloat16) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_u16, _bf16)(op);
}
@@ -344,7 +355,7 @@ TYPE(svuint16) test_svreinterpret_u16_bf16(TYPE(svbfloat16) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svuint32) test_svreinterpret_u32_bf16(TYPE(svbfloat16) op) {
+TYPE(svuint32) test_svreinterpret_u32_bf16(TYPE(svbfloat16) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_u32, _bf16)(op);
}
@@ -388,7 +399,7 @@ TYPE(svuint32) test_svreinterpret_u32_bf16(TYPE(svbfloat16) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svuint64) test_svreinterpret_u64_bf16(TYPE(svbfloat16) op) {
+TYPE(svuint64) test_svreinterpret_u64_bf16(TYPE(svbfloat16) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_u64, _bf16)(op);
}
@@ -432,7 +443,7 @@ TYPE(svuint64) test_svreinterpret_u64_bf16(TYPE(svbfloat16) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 32 x bfloat>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
//
-TYPE(svbfloat16) test_svreinterpret_bf16_s8(TYPE(svint8) op) {
+TYPE(svbfloat16) test_svreinterpret_bf16_s8(TYPE(svint8) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_bf16, _s8)(op);
}
@@ -476,7 +487,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_s8(TYPE(svint8) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 32 x bfloat>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
//
-TYPE(svbfloat16) test_svreinterpret_bf16_s16(TYPE(svint16) op) {
+TYPE(svbfloat16) test_svreinterpret_bf16_s16(TYPE(svint16) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_bf16, _s16)(op);
}
@@ -520,7 +531,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_s16(TYPE(svint16) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 32 x bfloat>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
//
-TYPE(svbfloat16) test_svreinterpret_bf16_s32(TYPE(svint32) op) {
+TYPE(svbfloat16) test_svreinterpret_bf16_s32(TYPE(svint32) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_bf16, _s32)(op);
}
@@ -564,7 +575,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_s32(TYPE(svint32) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 32 x bfloat>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
//
-TYPE(svbfloat16) test_svreinterpret_bf16_s64(TYPE(svint64) op) {
+TYPE(svbfloat16) test_svreinterpret_bf16_s64(TYPE(svint64) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_bf16, _s64)(op);
}
@@ -608,7 +619,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_s64(TYPE(svint64) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 32 x bfloat>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
//
-TYPE(svbfloat16) test_svreinterpret_bf16_u8(TYPE(svuint8) op) {
+TYPE(svbfloat16) test_svreinterpret_bf16_u8(TYPE(svuint8) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_bf16, _u8)(op);
}
@@ -652,7 +663,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_u8(TYPE(svuint8) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 32 x bfloat>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
//
-TYPE(svbfloat16) test_svreinterpret_bf16_u16(TYPE(svuint16) op) {
+TYPE(svbfloat16) test_svreinterpret_bf16_u16(TYPE(svuint16) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_bf16, _u16)(op);
}
@@ -696,7 +707,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_u16(TYPE(svuint16) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 32 x bfloat>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
//
-TYPE(svbfloat16) test_svreinterpret_bf16_u32(TYPE(svuint32) op) {
+TYPE(svbfloat16) test_svreinterpret_bf16_u32(TYPE(svuint32) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_bf16, _u32)(op);
}
@@ -740,7 +751,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_u32(TYPE(svuint32) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 32 x bfloat>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
//
-TYPE(svbfloat16) test_svreinterpret_bf16_u64(TYPE(svuint64) op) {
+TYPE(svbfloat16) test_svreinterpret_bf16_u64(TYPE(svuint64) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_bf16, _u64)(op);
}
@@ -776,7 +787,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_u64(TYPE(svuint64) op) {
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 32 x bfloat> [[OP:%.*]]
//
-TYPE(svbfloat16) test_svreinterpret_bf16_bf16(TYPE(svbfloat16) op) {
+TYPE(svbfloat16) test_svreinterpret_bf16_bf16(TYPE(svbfloat16) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_bf16, _bf16)(op);
}
@@ -820,7 +831,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_bf16(TYPE(svbfloat16) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 32 x bfloat>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
//
-TYPE(svbfloat16) test_svreinterpret_bf16_f16(TYPE(svfloat16) op) {
+TYPE(svbfloat16) test_svreinterpret_bf16_f16(TYPE(svfloat16) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_bf16, _f16)(op);
}
@@ -864,7 +875,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_f16(TYPE(svfloat16) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 32 x bfloat>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
//
-TYPE(svbfloat16) test_svreinterpret_bf16_f32(TYPE(svfloat32) op) {
+TYPE(svbfloat16) test_svreinterpret_bf16_f32(TYPE(svfloat32) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_bf16, _f32)(op);
}
@@ -908,7 +919,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_f32(TYPE(svfloat32) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 32 x bfloat>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
//
-TYPE(svbfloat16) test_svreinterpret_bf16_f64(TYPE(svfloat64) op) {
+TYPE(svbfloat16) test_svreinterpret_bf16_f64(TYPE(svfloat64) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_bf16, _f64)(op);
}
@@ -952,7 +963,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_f64(TYPE(svfloat64) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 16 x float>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x float> [[TMP0]]
//
-TYPE(svfloat32) test_svreinterpret_f32_bf16(TYPE(svbfloat16) op) {
+TYPE(svfloat32) test_svreinterpret_f32_bf16(TYPE(svbfloat16) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_f32, _bf16)(op);
}
@@ -996,7 +1007,7 @@ TYPE(svfloat32) test_svreinterpret_f32_bf16(TYPE(svbfloat16) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 32 x half>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x half> [[TMP0]]
//
-TYPE(svfloat16) test_svreinterpret_f16_bf16(TYPE(svbfloat16) op) {
+TYPE(svfloat16) test_svreinterpret_f16_bf16(TYPE(svbfloat16) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_f16, _bf16)(op);
}
@@ -1040,6 +1051,6 @@ TYPE(svfloat16) test_svreinterpret_f16_bf16(TYPE(svbfloat16) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 8 x double>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x double> [[TMP0]]
//
-TYPE(svfloat64) test_svreinterpret_f64_bf16(TYPE(svbfloat16) op) {
+TYPE(svfloat64) test_svreinterpret_f64_bf16(TYPE(svbfloat16) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_f64, _bf16)(op);
}
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret.c
index 3d9d5c3..e61bbf3 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret.c
@@ -4,6 +4,10 @@
// RUN: %clang_cc1 -DTUPLE=x2 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE2
// RUN: %clang_cc1 -DTUPLE=x3 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE3
// RUN: %clang_cc1 -DTUPLE=x4 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE4
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DTUPLE=x2 -triple aarch64 -target-feature +sme -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE2
+// RUN: %clang_cc1 -DTUPLE=x3 -triple aarch64 -target-feature +sme -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE3
+// RUN: %clang_cc1 -DTUPLE=x4 -triple aarch64 -target-feature +sme -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE4
// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
// RUN: %clang_cc1 -DTUPLE=x2 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE2
// RUN: %clang_cc1 -DTUPLE=x3 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE3
@@ -17,9 +21,16 @@
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -DTUPLE=x3 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE3
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -DTUPLE=x4 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE4
// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
#include <arm_sve.h>
+#if defined __ARM_FEATURE_SME
+#define MODE_ATTR __arm_streaming
+#else
+#define MODE_ATTR
+#endif
+
#ifdef TUPLE
#define TYPE_1(base,tuple) base ## tuple ## _t
#define TYPE_0(base,tuple) TYPE_1(base,tuple)
@@ -72,7 +83,7 @@
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[OP:%.*]]
//
-TYPE(svint8) test_svreinterpret_s8_s8(TYPE(svint8) op)
+TYPE(svint8) test_svreinterpret_s8_s8(TYPE(svint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s8,_s8)(op);
}
@@ -117,7 +128,7 @@ TYPE(svint8) test_svreinterpret_s8_s8(TYPE(svint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svint8) test_svreinterpret_s8_s16(TYPE(svint16) op)
+TYPE(svint8) test_svreinterpret_s8_s16(TYPE(svint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s8,_s16)(op);
}
@@ -162,7 +173,7 @@ TYPE(svint8) test_svreinterpret_s8_s16(TYPE(svint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svint8) test_svreinterpret_s8_s32(TYPE(svint32) op)
+TYPE(svint8) test_svreinterpret_s8_s32(TYPE(svint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s8,_s32)(op);
}
@@ -207,7 +218,7 @@ TYPE(svint8) test_svreinterpret_s8_s32(TYPE(svint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svint8) test_svreinterpret_s8_s64(TYPE(svint64) op)
+TYPE(svint8) test_svreinterpret_s8_s64(TYPE(svint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s8,_s64)(op);
}
@@ -244,7 +255,7 @@ TYPE(svint8) test_svreinterpret_s8_s64(TYPE(svint64) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[OP:%.*]]
//
-TYPE(svint8) test_svreinterpret_s8_u8(TYPE(svuint8) op)
+TYPE(svint8) test_svreinterpret_s8_u8(TYPE(svuint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s8,_u8)(op);
}
@@ -289,7 +300,7 @@ TYPE(svint8) test_svreinterpret_s8_u8(TYPE(svuint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svint8) test_svreinterpret_s8_u16(TYPE(svuint16) op)
+TYPE(svint8) test_svreinterpret_s8_u16(TYPE(svuint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s8,_u16)(op);
}
@@ -335,7 +346,7 @@ TYPE(svint8) test_svreinterpret_s8_u16(TYPE(svuint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svint8) test_svreinterpret_s8_u32(TYPE(svuint32) op)
+TYPE(svint8) test_svreinterpret_s8_u32(TYPE(svuint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s8,_u32)(op);
}
@@ -381,7 +392,7 @@ TYPE(svint8) test_svreinterpret_s8_u32(TYPE(svuint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svint8) test_svreinterpret_s8_u64(TYPE(svuint64) op)
+TYPE(svint8) test_svreinterpret_s8_u64(TYPE(svuint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s8,_u64)(op);
}
@@ -426,7 +437,7 @@ TYPE(svint8) test_svreinterpret_s8_u64(TYPE(svuint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svint8) test_svreinterpret_s8_f16(TYPE(svfloat16) op)
+TYPE(svint8) test_svreinterpret_s8_f16(TYPE(svfloat16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s8,_f16)(op);
}
@@ -471,7 +482,7 @@ TYPE(svint8) test_svreinterpret_s8_f16(TYPE(svfloat16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svint8) test_svreinterpret_s8_f32(TYPE(svfloat32) op)
+TYPE(svint8) test_svreinterpret_s8_f32(TYPE(svfloat32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s8,_f32)(op);
}
@@ -516,7 +527,7 @@ TYPE(svint8) test_svreinterpret_s8_f32(TYPE(svfloat32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svint8) test_svreinterpret_s8_f64(TYPE(svfloat64) op)
+TYPE(svint8) test_svreinterpret_s8_f64(TYPE(svfloat64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s8,_f64)(op);
}
@@ -561,7 +572,7 @@ TYPE(svint8) test_svreinterpret_s8_f64(TYPE(svfloat64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svint16) test_svreinterpret_s16_s8(TYPE(svint8) op)
+TYPE(svint16) test_svreinterpret_s16_s8(TYPE(svint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s16,_s8)(op);
}
@@ -598,7 +609,7 @@ TYPE(svint16) test_svreinterpret_s16_s8(TYPE(svint8) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[OP:%.*]]
//
-TYPE(svint16) test_svreinterpret_s16_s16(TYPE(svint16) op)
+TYPE(svint16) test_svreinterpret_s16_s16(TYPE(svint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s16,_s16)(op);
}
@@ -643,7 +654,7 @@ TYPE(svint16) test_svreinterpret_s16_s16(TYPE(svint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svint16) test_svreinterpret_s16_s32(TYPE(svint32) op)
+TYPE(svint16) test_svreinterpret_s16_s32(TYPE(svint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s16,_s32)(op);
}
@@ -688,7 +699,7 @@ TYPE(svint16) test_svreinterpret_s16_s32(TYPE(svint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svint16) test_svreinterpret_s16_s64(TYPE(svint64) op)
+TYPE(svint16) test_svreinterpret_s16_s64(TYPE(svint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s16,_s64)(op);
}
@@ -733,7 +744,7 @@ TYPE(svint16) test_svreinterpret_s16_s64(TYPE(svint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svint16) test_svreinterpret_s16_u8(TYPE(svuint8) op)
+TYPE(svint16) test_svreinterpret_s16_u8(TYPE(svuint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s16,_u8)(op);
}
@@ -770,7 +781,7 @@ TYPE(svint16) test_svreinterpret_s16_u8(TYPE(svuint8) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[OP:%.*]]
//
-TYPE(svint16) test_svreinterpret_s16_u16(TYPE(svuint16) op)
+TYPE(svint16) test_svreinterpret_s16_u16(TYPE(svuint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s16,_u16)(op);
}
@@ -815,7 +826,7 @@ TYPE(svint16) test_svreinterpret_s16_u16(TYPE(svuint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svint16) test_svreinterpret_s16_u32(TYPE(svuint32) op)
+TYPE(svint16) test_svreinterpret_s16_u32(TYPE(svuint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s16,_u32)(op);
}
@@ -860,7 +871,7 @@ TYPE(svint16) test_svreinterpret_s16_u32(TYPE(svuint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svint16) test_svreinterpret_s16_u64(TYPE(svuint64) op)
+TYPE(svint16) test_svreinterpret_s16_u64(TYPE(svuint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s16,_u64)(op);
}
@@ -905,7 +916,7 @@ TYPE(svint16) test_svreinterpret_s16_u64(TYPE(svuint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svint16) test_svreinterpret_s16_f16(TYPE(svfloat16) op)
+TYPE(svint16) test_svreinterpret_s16_f16(TYPE(svfloat16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s16,_f16)(op);
}
@@ -950,7 +961,7 @@ TYPE(svint16) test_svreinterpret_s16_f16(TYPE(svfloat16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svint16) test_svreinterpret_s16_f32(TYPE(svfloat32) op)
+TYPE(svint16) test_svreinterpret_s16_f32(TYPE(svfloat32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s16,_f32)(op);
}
@@ -995,7 +1006,7 @@ TYPE(svint16) test_svreinterpret_s16_f32(TYPE(svfloat32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svint16) test_svreinterpret_s16_f64(TYPE(svfloat64) op)
+TYPE(svint16) test_svreinterpret_s16_f64(TYPE(svfloat64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s16,_f64)(op);
}
@@ -1040,7 +1051,7 @@ TYPE(svint16) test_svreinterpret_s16_f64(TYPE(svfloat64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svint32) test_svreinterpret_s32_s8(TYPE(svint8) op)
+TYPE(svint32) test_svreinterpret_s32_s8(TYPE(svint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s32,_s8)(op);
}
@@ -1085,7 +1096,7 @@ TYPE(svint32) test_svreinterpret_s32_s8(TYPE(svint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svint32) test_svreinterpret_s32_s16(TYPE(svint16) op)
+TYPE(svint32) test_svreinterpret_s32_s16(TYPE(svint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s32,_s16)(op);
}
@@ -1122,7 +1133,7 @@ TYPE(svint32) test_svreinterpret_s32_s16(TYPE(svint16) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[OP:%.*]]
//
-TYPE(svint32) test_svreinterpret_s32_s32(TYPE(svint32) op)
+TYPE(svint32) test_svreinterpret_s32_s32(TYPE(svint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s32,_s32)(op);
}
@@ -1167,7 +1178,7 @@ TYPE(svint32) test_svreinterpret_s32_s32(TYPE(svint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svint32) test_svreinterpret_s32_s64(TYPE(svint64) op)
+TYPE(svint32) test_svreinterpret_s32_s64(TYPE(svint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s32,_s64)(op);
}
@@ -1212,7 +1223,7 @@ TYPE(svint32) test_svreinterpret_s32_s64(TYPE(svint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svint32) test_svreinterpret_s32_u8(TYPE(svuint8) op)
+TYPE(svint32) test_svreinterpret_s32_u8(TYPE(svuint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s32,_u8)(op);
}
@@ -1257,7 +1268,7 @@ TYPE(svint32) test_svreinterpret_s32_u8(TYPE(svuint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svint32) test_svreinterpret_s32_u16(TYPE(svuint16) op)
+TYPE(svint32) test_svreinterpret_s32_u16(TYPE(svuint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s32,_u16)(op);
}
@@ -1294,7 +1305,7 @@ TYPE(svint32) test_svreinterpret_s32_u16(TYPE(svuint16) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[OP:%.*]]
//
-TYPE(svint32) test_svreinterpret_s32_u32(TYPE(svuint32) op)
+TYPE(svint32) test_svreinterpret_s32_u32(TYPE(svuint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s32,_u32)(op);
}
@@ -1339,7 +1350,7 @@ TYPE(svint32) test_svreinterpret_s32_u32(TYPE(svuint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svint32) test_svreinterpret_s32_u64(TYPE(svuint64) op)
+TYPE(svint32) test_svreinterpret_s32_u64(TYPE(svuint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s32,_u64)(op);
}
@@ -1384,7 +1395,7 @@ TYPE(svint32) test_svreinterpret_s32_u64(TYPE(svuint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svint32) test_svreinterpret_s32_f16(TYPE(svfloat16) op)
+TYPE(svint32) test_svreinterpret_s32_f16(TYPE(svfloat16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s32,_f16)(op);
}
@@ -1429,7 +1440,7 @@ TYPE(svint32) test_svreinterpret_s32_f16(TYPE(svfloat16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svint32) test_svreinterpret_s32_f32(TYPE(svfloat32) op)
+TYPE(svint32) test_svreinterpret_s32_f32(TYPE(svfloat32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s32,_f32)(op);
}
@@ -1475,7 +1486,7 @@ TYPE(svint32) test_svreinterpret_s32_f32(TYPE(svfloat32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svint32) test_svreinterpret_s32_f64(TYPE(svfloat64) op)
+TYPE(svint32) test_svreinterpret_s32_f64(TYPE(svfloat64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s32,_f64)(op);
}
@@ -1520,7 +1531,7 @@ TYPE(svint32) test_svreinterpret_s32_f64(TYPE(svfloat64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svint64) test_svreinterpret_s64_s8(TYPE(svint8) op)
+TYPE(svint64) test_svreinterpret_s64_s8(TYPE(svint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s64,_s8)(op);
}
@@ -1565,7 +1576,7 @@ TYPE(svint64) test_svreinterpret_s64_s8(TYPE(svint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svint64) test_svreinterpret_s64_s16(TYPE(svint16) op)
+TYPE(svint64) test_svreinterpret_s64_s16(TYPE(svint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s64,_s16)(op);
}
@@ -1610,7 +1621,7 @@ TYPE(svint64) test_svreinterpret_s64_s16(TYPE(svint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svint64) test_svreinterpret_s64_s32(TYPE(svint32) op)
+TYPE(svint64) test_svreinterpret_s64_s32(TYPE(svint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s64,_s32)(op);
}
@@ -1647,7 +1658,7 @@ TYPE(svint64) test_svreinterpret_s64_s32(TYPE(svint32) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[OP:%.*]]
//
-TYPE(svint64) test_svreinterpret_s64_s64(TYPE(svint64) op)
+TYPE(svint64) test_svreinterpret_s64_s64(TYPE(svint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s64,_s64)(op);
}
@@ -1692,7 +1703,7 @@ TYPE(svint64) test_svreinterpret_s64_s64(TYPE(svint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svint64) test_svreinterpret_s64_u8(TYPE(svuint8) op)
+TYPE(svint64) test_svreinterpret_s64_u8(TYPE(svuint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s64,_u8)(op);
}
@@ -1737,7 +1748,7 @@ TYPE(svint64) test_svreinterpret_s64_u8(TYPE(svuint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svint64) test_svreinterpret_s64_u16(TYPE(svuint16) op)
+TYPE(svint64) test_svreinterpret_s64_u16(TYPE(svuint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s64,_u16)(op);
}
@@ -1782,7 +1793,7 @@ TYPE(svint64) test_svreinterpret_s64_u16(TYPE(svuint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svint64) test_svreinterpret_s64_u32(TYPE(svuint32) op)
+TYPE(svint64) test_svreinterpret_s64_u32(TYPE(svuint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s64,_u32)(op);
}
@@ -1819,7 +1830,7 @@ TYPE(svint64) test_svreinterpret_s64_u32(TYPE(svuint32) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[OP:%.*]]
//
-TYPE(svint64) test_svreinterpret_s64_u64(TYPE(svuint64) op)
+TYPE(svint64) test_svreinterpret_s64_u64(TYPE(svuint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s64,_u64)(op);
}
@@ -1864,7 +1875,7 @@ TYPE(svint64) test_svreinterpret_s64_u64(TYPE(svuint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svint64) test_svreinterpret_s64_f16(TYPE(svfloat16) op)
+TYPE(svint64) test_svreinterpret_s64_f16(TYPE(svfloat16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s64,_f16)(op);
}
@@ -1909,7 +1920,7 @@ TYPE(svint64) test_svreinterpret_s64_f16(TYPE(svfloat16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svint64) test_svreinterpret_s64_f32(TYPE(svfloat32) op)
+TYPE(svint64) test_svreinterpret_s64_f32(TYPE(svfloat32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s64,_f32)(op);
}
@@ -1954,7 +1965,7 @@ TYPE(svint64) test_svreinterpret_s64_f32(TYPE(svfloat32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svint64) test_svreinterpret_s64_f64(TYPE(svfloat64) op)
+TYPE(svint64) test_svreinterpret_s64_f64(TYPE(svfloat64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s64,_f64)(op);
}
@@ -1991,7 +2002,7 @@ TYPE(svint64) test_svreinterpret_s64_f64(TYPE(svfloat64) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[OP:%.*]]
//
-TYPE(svuint8) test_svreinterpret_u8_s8(TYPE(svint8) op)
+TYPE(svuint8) test_svreinterpret_u8_s8(TYPE(svint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u8,_s8)(op);
}
@@ -2036,7 +2047,7 @@ TYPE(svuint8) test_svreinterpret_u8_s8(TYPE(svint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svuint8) test_svreinterpret_u8_s16(TYPE(svint16) op)
+TYPE(svuint8) test_svreinterpret_u8_s16(TYPE(svint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u8,_s16)(op);
}
@@ -2081,7 +2092,7 @@ TYPE(svuint8) test_svreinterpret_u8_s16(TYPE(svint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svuint8) test_svreinterpret_u8_s32(TYPE(svint32) op)
+TYPE(svuint8) test_svreinterpret_u8_s32(TYPE(svint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u8,_s32)(op);
}
@@ -2126,7 +2137,7 @@ TYPE(svuint8) test_svreinterpret_u8_s32(TYPE(svint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svuint8) test_svreinterpret_u8_s64(TYPE(svint64) op)
+TYPE(svuint8) test_svreinterpret_u8_s64(TYPE(svint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u8,_s64)(op);
}
@@ -2163,7 +2174,7 @@ TYPE(svuint8) test_svreinterpret_u8_s64(TYPE(svint64) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[OP:%.*]]
//
-TYPE(svuint8) test_svreinterpret_u8_u8(TYPE(svuint8) op)
+TYPE(svuint8) test_svreinterpret_u8_u8(TYPE(svuint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u8,_u8)(op);
}
@@ -2208,7 +2219,7 @@ TYPE(svuint8) test_svreinterpret_u8_u8(TYPE(svuint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svuint8) test_svreinterpret_u8_u16(TYPE(svuint16) op)
+TYPE(svuint8) test_svreinterpret_u8_u16(TYPE(svuint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u8,_u16)(op);
}
@@ -2253,7 +2264,7 @@ TYPE(svuint8) test_svreinterpret_u8_u16(TYPE(svuint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svuint8) test_svreinterpret_u8_u32(TYPE(svuint32) op)
+TYPE(svuint8) test_svreinterpret_u8_u32(TYPE(svuint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u8,_u32)(op);
}
@@ -2298,7 +2309,7 @@ TYPE(svuint8) test_svreinterpret_u8_u32(TYPE(svuint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svuint8) test_svreinterpret_u8_u64(TYPE(svuint64) op)
+TYPE(svuint8) test_svreinterpret_u8_u64(TYPE(svuint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u8,_u64)(op);
}
@@ -2343,7 +2354,7 @@ TYPE(svuint8) test_svreinterpret_u8_u64(TYPE(svuint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svuint8) test_svreinterpret_u8_f16(TYPE(svfloat16) op)
+TYPE(svuint8) test_svreinterpret_u8_f16(TYPE(svfloat16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u8,_f16)(op);
}
@@ -2388,7 +2399,7 @@ TYPE(svuint8) test_svreinterpret_u8_f16(TYPE(svfloat16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svuint8) test_svreinterpret_u8_f32(TYPE(svfloat32) op)
+TYPE(svuint8) test_svreinterpret_u8_f32(TYPE(svfloat32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u8,_f32)(op);
}
@@ -2433,7 +2444,7 @@ TYPE(svuint8) test_svreinterpret_u8_f32(TYPE(svfloat32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svuint8) test_svreinterpret_u8_f64(TYPE(svfloat64) op)
+TYPE(svuint8) test_svreinterpret_u8_f64(TYPE(svfloat64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u8,_f64)(op);
}
@@ -2478,7 +2489,7 @@ TYPE(svuint8) test_svreinterpret_u8_f64(TYPE(svfloat64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svuint16) test_svreinterpret_u16_s8(TYPE(svint8) op)
+TYPE(svuint16) test_svreinterpret_u16_s8(TYPE(svint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u16,_s8)(op);
}
@@ -2515,7 +2526,7 @@ TYPE(svuint16) test_svreinterpret_u16_s8(TYPE(svint8) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[OP:%.*]]
//
-TYPE(svuint16) test_svreinterpret_u16_s16(TYPE(svint16) op)
+TYPE(svuint16) test_svreinterpret_u16_s16(TYPE(svint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u16,_s16)(op);
}
@@ -2560,7 +2571,7 @@ TYPE(svuint16) test_svreinterpret_u16_s16(TYPE(svint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svuint16) test_svreinterpret_u16_s32(TYPE(svint32) op)
+TYPE(svuint16) test_svreinterpret_u16_s32(TYPE(svint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u16,_s32)(op);
}
@@ -2605,7 +2616,7 @@ TYPE(svuint16) test_svreinterpret_u16_s32(TYPE(svint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svuint16) test_svreinterpret_u16_s64(TYPE(svint64) op)
+TYPE(svuint16) test_svreinterpret_u16_s64(TYPE(svint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u16,_s64)(op);
}
@@ -2650,7 +2661,7 @@ TYPE(svuint16) test_svreinterpret_u16_s64(TYPE(svint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svuint16) test_svreinterpret_u16_u8(TYPE(svuint8) op)
+TYPE(svuint16) test_svreinterpret_u16_u8(TYPE(svuint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u16,_u8)(op);
}
@@ -2687,7 +2698,7 @@ TYPE(svuint16) test_svreinterpret_u16_u8(TYPE(svuint8) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[OP:%.*]]
//
-TYPE(svuint16) test_svreinterpret_u16_u16(TYPE(svuint16) op)
+TYPE(svuint16) test_svreinterpret_u16_u16(TYPE(svuint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u16,_u16)(op);
}
@@ -2732,7 +2743,7 @@ TYPE(svuint16) test_svreinterpret_u16_u16(TYPE(svuint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svuint16) test_svreinterpret_u16_u32(TYPE(svuint32) op)
+TYPE(svuint16) test_svreinterpret_u16_u32(TYPE(svuint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u16,_u32)(op);
}
@@ -2777,7 +2788,7 @@ TYPE(svuint16) test_svreinterpret_u16_u32(TYPE(svuint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svuint16) test_svreinterpret_u16_u64(TYPE(svuint64) op)
+TYPE(svuint16) test_svreinterpret_u16_u64(TYPE(svuint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u16,_u64)(op);
}
@@ -2822,7 +2833,7 @@ TYPE(svuint16) test_svreinterpret_u16_u64(TYPE(svuint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svuint16) test_svreinterpret_u16_f16(TYPE(svfloat16) op)
+TYPE(svuint16) test_svreinterpret_u16_f16(TYPE(svfloat16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u16,_f16)(op);
}
@@ -2867,7 +2878,7 @@ TYPE(svuint16) test_svreinterpret_u16_f16(TYPE(svfloat16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svuint16) test_svreinterpret_u16_f32(TYPE(svfloat32) op)
+TYPE(svuint16) test_svreinterpret_u16_f32(TYPE(svfloat32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u16,_f32)(op);
}
@@ -2912,7 +2923,7 @@ TYPE(svuint16) test_svreinterpret_u16_f32(TYPE(svfloat32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svuint16) test_svreinterpret_u16_f64(TYPE(svfloat64) op)
+TYPE(svuint16) test_svreinterpret_u16_f64(TYPE(svfloat64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u16,_f64)(op);
}
@@ -2957,7 +2968,7 @@ TYPE(svuint16) test_svreinterpret_u16_f64(TYPE(svfloat64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svuint32) test_svreinterpret_u32_s8(TYPE(svint8) op)
+TYPE(svuint32) test_svreinterpret_u32_s8(TYPE(svint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u32,_s8)(op);
}
@@ -3002,7 +3013,7 @@ TYPE(svuint32) test_svreinterpret_u32_s8(TYPE(svint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svuint32) test_svreinterpret_u32_s16(TYPE(svint16) op)
+TYPE(svuint32) test_svreinterpret_u32_s16(TYPE(svint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u32,_s16)(op);
}
@@ -3039,7 +3050,7 @@ TYPE(svuint32) test_svreinterpret_u32_s16(TYPE(svint16) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[OP:%.*]]
//
-TYPE(svuint32) test_svreinterpret_u32_s32(TYPE(svint32) op)
+TYPE(svuint32) test_svreinterpret_u32_s32(TYPE(svint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u32,_s32)(op);
}
@@ -3084,7 +3095,7 @@ TYPE(svuint32) test_svreinterpret_u32_s32(TYPE(svint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svuint32) test_svreinterpret_u32_s64(TYPE(svint64) op)
+TYPE(svuint32) test_svreinterpret_u32_s64(TYPE(svint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u32,_s64)(op);
}
@@ -3129,7 +3140,7 @@ TYPE(svuint32) test_svreinterpret_u32_s64(TYPE(svint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svuint32) test_svreinterpret_u32_u8(TYPE(svuint8) op)
+TYPE(svuint32) test_svreinterpret_u32_u8(TYPE(svuint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u32,_u8)(op);
}
@@ -3174,7 +3185,7 @@ TYPE(svuint32) test_svreinterpret_u32_u8(TYPE(svuint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svuint32) test_svreinterpret_u32_u16(TYPE(svuint16) op)
+TYPE(svuint32) test_svreinterpret_u32_u16(TYPE(svuint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u32,_u16)(op);
}
@@ -3211,7 +3222,7 @@ TYPE(svuint32) test_svreinterpret_u32_u16(TYPE(svuint16) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[OP:%.*]]
//
-TYPE(svuint32) test_svreinterpret_u32_u32(TYPE(svuint32) op)
+TYPE(svuint32) test_svreinterpret_u32_u32(TYPE(svuint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u32,_u32)(op);
}
@@ -3256,7 +3267,7 @@ TYPE(svuint32) test_svreinterpret_u32_u32(TYPE(svuint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svuint32) test_svreinterpret_u32_u64(TYPE(svuint64) op)
+TYPE(svuint32) test_svreinterpret_u32_u64(TYPE(svuint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u32,_u64)(op);
}
@@ -3301,7 +3312,7 @@ TYPE(svuint32) test_svreinterpret_u32_u64(TYPE(svuint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svuint32) test_svreinterpret_u32_f16(TYPE(svfloat16) op)
+TYPE(svuint32) test_svreinterpret_u32_f16(TYPE(svfloat16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u32,_f16)(op);
}
@@ -3346,7 +3357,7 @@ TYPE(svuint32) test_svreinterpret_u32_f16(TYPE(svfloat16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svuint32) test_svreinterpret_u32_f32(TYPE(svfloat32) op)
+TYPE(svuint32) test_svreinterpret_u32_f32(TYPE(svfloat32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u32,_f32)(op);
}
@@ -3391,7 +3402,7 @@ TYPE(svuint32) test_svreinterpret_u32_f32(TYPE(svfloat32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svuint32) test_svreinterpret_u32_f64(TYPE(svfloat64) op)
+TYPE(svuint32) test_svreinterpret_u32_f64(TYPE(svfloat64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u32,_f64)(op);
}
@@ -3436,7 +3447,7 @@ TYPE(svuint32) test_svreinterpret_u32_f64(TYPE(svfloat64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svuint64) test_svreinterpret_u64_s8(TYPE(svint8) op)
+TYPE(svuint64) test_svreinterpret_u64_s8(TYPE(svint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u64,_s8)(op);
}
@@ -3481,7 +3492,7 @@ TYPE(svuint64) test_svreinterpret_u64_s8(TYPE(svint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svuint64) test_svreinterpret_u64_s16(TYPE(svint16) op)
+TYPE(svuint64) test_svreinterpret_u64_s16(TYPE(svint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u64,_s16)(op);
}
@@ -3526,7 +3537,7 @@ TYPE(svuint64) test_svreinterpret_u64_s16(TYPE(svint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svuint64) test_svreinterpret_u64_s32(TYPE(svint32) op)
+TYPE(svuint64) test_svreinterpret_u64_s32(TYPE(svint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u64,_s32)(op);
}
@@ -3563,7 +3574,7 @@ TYPE(svuint64) test_svreinterpret_u64_s32(TYPE(svint32) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[OP:%.*]]
//
-TYPE(svuint64) test_svreinterpret_u64_s64(TYPE(svint64) op)
+TYPE(svuint64) test_svreinterpret_u64_s64(TYPE(svint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u64,_s64)(op);
}
@@ -3608,7 +3619,7 @@ TYPE(svuint64) test_svreinterpret_u64_s64(TYPE(svint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svuint64) test_svreinterpret_u64_u8(TYPE(svuint8) op)
+TYPE(svuint64) test_svreinterpret_u64_u8(TYPE(svuint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u64,_u8)(op);
}
@@ -3653,7 +3664,7 @@ TYPE(svuint64) test_svreinterpret_u64_u8(TYPE(svuint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svuint64) test_svreinterpret_u64_u16(TYPE(svuint16) op)
+TYPE(svuint64) test_svreinterpret_u64_u16(TYPE(svuint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u64,_u16)(op);
}
@@ -3698,7 +3709,7 @@ TYPE(svuint64) test_svreinterpret_u64_u16(TYPE(svuint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svuint64) test_svreinterpret_u64_u32(TYPE(svuint32) op)
+TYPE(svuint64) test_svreinterpret_u64_u32(TYPE(svuint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u64,_u32)(op);
}
@@ -3735,7 +3746,7 @@ TYPE(svuint64) test_svreinterpret_u64_u32(TYPE(svuint32) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[OP:%.*]]
//
-TYPE(svuint64) test_svreinterpret_u64_u64(TYPE(svuint64) op)
+TYPE(svuint64) test_svreinterpret_u64_u64(TYPE(svuint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u64,_u64)(op);
}
@@ -3780,7 +3791,7 @@ TYPE(svuint64) test_svreinterpret_u64_u64(TYPE(svuint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svuint64) test_svreinterpret_u64_f16(TYPE(svfloat16) op)
+TYPE(svuint64) test_svreinterpret_u64_f16(TYPE(svfloat16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u64,_f16)(op);
}
@@ -3825,7 +3836,7 @@ TYPE(svuint64) test_svreinterpret_u64_f16(TYPE(svfloat16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svuint64) test_svreinterpret_u64_f32(TYPE(svfloat32) op)
+TYPE(svuint64) test_svreinterpret_u64_f32(TYPE(svfloat32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u64,_f32)(op);
}
@@ -3870,7 +3881,7 @@ TYPE(svuint64) test_svreinterpret_u64_f32(TYPE(svfloat32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svuint64) test_svreinterpret_u64_f64(TYPE(svfloat64) op)
+TYPE(svuint64) test_svreinterpret_u64_f64(TYPE(svfloat64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u64,_f64)(op);
}
@@ -3915,7 +3926,7 @@ TYPE(svuint64) test_svreinterpret_u64_f64(TYPE(svfloat64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 32 x half>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x half> [[TMP0]]
//
-TYPE(svfloat16) test_svreinterpret_f16_s8(TYPE(svint8) op)
+TYPE(svfloat16) test_svreinterpret_f16_s8(TYPE(svint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f16,_s8)(op);
}
@@ -3960,7 +3971,7 @@ TYPE(svfloat16) test_svreinterpret_f16_s8(TYPE(svint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 32 x half>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x half> [[TMP0]]
//
-TYPE(svfloat16) test_svreinterpret_f16_s16(TYPE(svint16) op)
+TYPE(svfloat16) test_svreinterpret_f16_s16(TYPE(svint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f16,_s16)(op);
}
@@ -4005,7 +4016,7 @@ TYPE(svfloat16) test_svreinterpret_f16_s16(TYPE(svint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 32 x half>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x half> [[TMP0]]
//
-TYPE(svfloat16) test_svreinterpret_f16_s32(TYPE(svint32) op)
+TYPE(svfloat16) test_svreinterpret_f16_s32(TYPE(svint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f16,_s32)(op);
}
@@ -4050,7 +4061,7 @@ TYPE(svfloat16) test_svreinterpret_f16_s32(TYPE(svint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 32 x half>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x half> [[TMP0]]
//
-TYPE(svfloat16) test_svreinterpret_f16_s64(TYPE(svint64) op)
+TYPE(svfloat16) test_svreinterpret_f16_s64(TYPE(svint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f16,_s64)(op);
}
@@ -4095,7 +4106,7 @@ TYPE(svfloat16) test_svreinterpret_f16_s64(TYPE(svint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 32 x half>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x half> [[TMP0]]
//
-TYPE(svfloat16) test_svreinterpret_f16_u8(TYPE(svuint8) op)
+TYPE(svfloat16) test_svreinterpret_f16_u8(TYPE(svuint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f16,_u8)(op);
}
@@ -4140,7 +4151,7 @@ TYPE(svfloat16) test_svreinterpret_f16_u8(TYPE(svuint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 32 x half>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x half> [[TMP0]]
//
-TYPE(svfloat16) test_svreinterpret_f16_u16(TYPE(svuint16) op)
+TYPE(svfloat16) test_svreinterpret_f16_u16(TYPE(svuint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f16,_u16)(op);
}
@@ -4185,7 +4196,7 @@ TYPE(svfloat16) test_svreinterpret_f16_u16(TYPE(svuint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 32 x half>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x half> [[TMP0]]
//
-TYPE(svfloat16) test_svreinterpret_f16_u32(TYPE(svuint32) op)
+TYPE(svfloat16) test_svreinterpret_f16_u32(TYPE(svuint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f16,_u32)(op);
}
@@ -4230,7 +4241,7 @@ TYPE(svfloat16) test_svreinterpret_f16_u32(TYPE(svuint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 32 x half>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x half> [[TMP0]]
//
-TYPE(svfloat16) test_svreinterpret_f16_u64(TYPE(svuint64) op)
+TYPE(svfloat16) test_svreinterpret_f16_u64(TYPE(svuint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f16,_u64)(op);
}
@@ -4267,7 +4278,7 @@ TYPE(svfloat16) test_svreinterpret_f16_u64(TYPE(svuint64) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 32 x half> [[OP:%.*]]
//
-TYPE(svfloat16) test_svreinterpret_f16_f16(TYPE(svfloat16) op)
+TYPE(svfloat16) test_svreinterpret_f16_f16(TYPE(svfloat16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f16,_f16)(op);
}
@@ -4312,7 +4323,7 @@ TYPE(svfloat16) test_svreinterpret_f16_f16(TYPE(svfloat16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 32 x half>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x half> [[TMP0]]
//
-TYPE(svfloat16) test_svreinterpret_f16_f32(TYPE(svfloat32) op)
+TYPE(svfloat16) test_svreinterpret_f16_f32(TYPE(svfloat32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f16,_f32)(op);
}
@@ -4357,7 +4368,7 @@ TYPE(svfloat16) test_svreinterpret_f16_f32(TYPE(svfloat32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 32 x half>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x half> [[TMP0]]
//
-TYPE(svfloat16) test_svreinterpret_f16_f64(TYPE(svfloat64) op)
+TYPE(svfloat16) test_svreinterpret_f16_f64(TYPE(svfloat64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f16,_f64)(op);
}
@@ -4402,7 +4413,7 @@ TYPE(svfloat16) test_svreinterpret_f16_f64(TYPE(svfloat64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 16 x float>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x float> [[TMP0]]
//
-TYPE(svfloat32) test_svreinterpret_f32_s8(TYPE(svint8) op)
+TYPE(svfloat32) test_svreinterpret_f32_s8(TYPE(svint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f32,_s8)(op);
}
@@ -4447,7 +4458,7 @@ TYPE(svfloat32) test_svreinterpret_f32_s8(TYPE(svint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 16 x float>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x float> [[TMP0]]
//
-TYPE(svfloat32) test_svreinterpret_f32_s16(TYPE(svint16) op)
+TYPE(svfloat32) test_svreinterpret_f32_s16(TYPE(svint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f32,_s16)(op);
}
@@ -4492,7 +4503,7 @@ TYPE(svfloat32) test_svreinterpret_f32_s16(TYPE(svint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 16 x float>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x float> [[TMP0]]
//
-TYPE(svfloat32) test_svreinterpret_f32_s32(TYPE(svint32) op)
+TYPE(svfloat32) test_svreinterpret_f32_s32(TYPE(svint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f32,_s32)(op);
}
@@ -4537,7 +4548,7 @@ TYPE(svfloat32) test_svreinterpret_f32_s32(TYPE(svint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 16 x float>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x float> [[TMP0]]
//
-TYPE(svfloat32) test_svreinterpret_f32_s64(TYPE(svint64) op)
+TYPE(svfloat32) test_svreinterpret_f32_s64(TYPE(svint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f32,_s64)(op);
}
@@ -4582,7 +4593,7 @@ TYPE(svfloat32) test_svreinterpret_f32_s64(TYPE(svint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 16 x float>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x float> [[TMP0]]
//
-TYPE(svfloat32) test_svreinterpret_f32_u8(TYPE(svuint8) op)
+TYPE(svfloat32) test_svreinterpret_f32_u8(TYPE(svuint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f32,_u8)(op);
}
@@ -4627,7 +4638,7 @@ TYPE(svfloat32) test_svreinterpret_f32_u8(TYPE(svuint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 16 x float>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x float> [[TMP0]]
//
-TYPE(svfloat32) test_svreinterpret_f32_u16(TYPE(svuint16) op)
+TYPE(svfloat32) test_svreinterpret_f32_u16(TYPE(svuint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f32,_u16)(op);
}
@@ -4672,7 +4683,7 @@ TYPE(svfloat32) test_svreinterpret_f32_u16(TYPE(svuint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 16 x float>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x float> [[TMP0]]
//
-TYPE(svfloat32) test_svreinterpret_f32_u32(TYPE(svuint32) op)
+TYPE(svfloat32) test_svreinterpret_f32_u32(TYPE(svuint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f32,_u32)(op);
}
@@ -4717,7 +4728,7 @@ TYPE(svfloat32) test_svreinterpret_f32_u32(TYPE(svuint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 16 x float>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x float> [[TMP0]]
//
-TYPE(svfloat32) test_svreinterpret_f32_u64(TYPE(svuint64) op)
+TYPE(svfloat32) test_svreinterpret_f32_u64(TYPE(svuint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f32,_u64)(op);
}
@@ -4762,7 +4773,7 @@ TYPE(svfloat32) test_svreinterpret_f32_u64(TYPE(svuint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 16 x float>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x float> [[TMP0]]
//
-TYPE(svfloat32) test_svreinterpret_f32_f16(TYPE(svfloat16) op)
+TYPE(svfloat32) test_svreinterpret_f32_f16(TYPE(svfloat16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f32,_f16)(op);
}
@@ -4799,7 +4810,7 @@ TYPE(svfloat32) test_svreinterpret_f32_f16(TYPE(svfloat16) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 16 x float> [[OP:%.*]]
//
-TYPE(svfloat32) test_svreinterpret_f32_f32(TYPE(svfloat32) op)
+TYPE(svfloat32) test_svreinterpret_f32_f32(TYPE(svfloat32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f32,_f32)(op);
}
@@ -4844,7 +4855,7 @@ TYPE(svfloat32) test_svreinterpret_f32_f32(TYPE(svfloat32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 16 x float>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x float> [[TMP0]]
//
-TYPE(svfloat32) test_svreinterpret_f32_f64(TYPE(svfloat64) op)
+TYPE(svfloat32) test_svreinterpret_f32_f64(TYPE(svfloat64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f32,_f64)(op);
}
@@ -4889,7 +4900,7 @@ TYPE(svfloat32) test_svreinterpret_f32_f64(TYPE(svfloat64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 8 x double>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x double> [[TMP0]]
//
-TYPE(svfloat64) test_svreinterpret_f64_s8(TYPE(svint8) op)
+TYPE(svfloat64) test_svreinterpret_f64_s8(TYPE(svint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f64,_s8)(op);
}
@@ -4934,7 +4945,7 @@ TYPE(svfloat64) test_svreinterpret_f64_s8(TYPE(svint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 8 x double>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x double> [[TMP0]]
//
-TYPE(svfloat64) test_svreinterpret_f64_s16(TYPE(svint16) op)
+TYPE(svfloat64) test_svreinterpret_f64_s16(TYPE(svint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f64,_s16)(op);
}
@@ -4979,7 +4990,7 @@ TYPE(svfloat64) test_svreinterpret_f64_s16(TYPE(svint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 8 x double>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x double> [[TMP0]]
//
-TYPE(svfloat64) test_svreinterpret_f64_s32(TYPE(svint32) op)
+TYPE(svfloat64) test_svreinterpret_f64_s32(TYPE(svint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f64,_s32)(op);
}
@@ -5024,7 +5035,7 @@ TYPE(svfloat64) test_svreinterpret_f64_s32(TYPE(svint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 8 x double>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x double> [[TMP0]]
//
-TYPE(svfloat64) test_svreinterpret_f64_s64(TYPE(svint64) op)
+TYPE(svfloat64) test_svreinterpret_f64_s64(TYPE(svint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f64,_s64)(op);
}
@@ -5069,7 +5080,7 @@ TYPE(svfloat64) test_svreinterpret_f64_s64(TYPE(svint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 8 x double>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x double> [[TMP0]]
//
-TYPE(svfloat64) test_svreinterpret_f64_u8(TYPE(svuint8) op)
+TYPE(svfloat64) test_svreinterpret_f64_u8(TYPE(svuint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f64,_u8)(op);
}
@@ -5114,7 +5125,7 @@ TYPE(svfloat64) test_svreinterpret_f64_u8(TYPE(svuint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 8 x double>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x double> [[TMP0]]
//
-TYPE(svfloat64) test_svreinterpret_f64_u16(TYPE(svuint16) op)
+TYPE(svfloat64) test_svreinterpret_f64_u16(TYPE(svuint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f64,_u16)(op);
}
@@ -5159,7 +5170,7 @@ TYPE(svfloat64) test_svreinterpret_f64_u16(TYPE(svuint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 8 x double>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x double> [[TMP0]]
//
-TYPE(svfloat64) test_svreinterpret_f64_u32(TYPE(svuint32) op)
+TYPE(svfloat64) test_svreinterpret_f64_u32(TYPE(svuint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f64,_u32)(op);
}
@@ -5204,7 +5215,7 @@ TYPE(svfloat64) test_svreinterpret_f64_u32(TYPE(svuint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 8 x double>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x double> [[TMP0]]
//
-TYPE(svfloat64) test_svreinterpret_f64_u64(TYPE(svuint64) op)
+TYPE(svfloat64) test_svreinterpret_f64_u64(TYPE(svuint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f64,_u64)(op);
}
@@ -5249,7 +5260,7 @@ TYPE(svfloat64) test_svreinterpret_f64_u64(TYPE(svuint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 8 x double>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x double> [[TMP0]]
//
-TYPE(svfloat64) test_svreinterpret_f64_f16(TYPE(svfloat16) op)
+TYPE(svfloat64) test_svreinterpret_f64_f16(TYPE(svfloat16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f64,_f16)(op);
}
@@ -5294,7 +5305,7 @@ TYPE(svfloat64) test_svreinterpret_f64_f16(TYPE(svfloat16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 8 x double>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x double> [[TMP0]]
//
-TYPE(svfloat64) test_svreinterpret_f64_f32(TYPE(svfloat32) op)
+TYPE(svfloat64) test_svreinterpret_f64_f32(TYPE(svfloat32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f64,_f32)(op);
}
@@ -5331,7 +5342,7 @@ TYPE(svfloat64) test_svreinterpret_f64_f32(TYPE(svfloat32) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 8 x double> [[OP:%.*]]
//
-TYPE(svfloat64) test_svreinterpret_f64_f64(TYPE(svfloat64) op)
+TYPE(svfloat64) test_svreinterpret_f64_f64(TYPE(svfloat64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f64,_f64)(op);
}
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret_from_streaming_mode.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret_from_streaming_mode.c
deleted file mode 100644
index f278758..0000000
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret_from_streaming_mode.c
+++ /dev/null
@@ -1,35 +0,0 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -O1 -Werror -Wall -o /dev/null %s
-
-// Note: We need to run this test with '-O1' because oddly enough the svreinterpret is always inlined at -O0.
-
-#include <arm_sve.h>
-
-#ifdef SVE_OVERLOADED_FORMS
-// A simple used,unused... macro, long enough to represent any SVE builtin.
-#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
-#else
-#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
-#endif
-
-// Test that svreinterpret is inlined (because it should be streaming-compatible)
-__attribute__((target("sme")))
-// CHECK-LABEL: @test_svreinterpret_s16_s8_from_streaming_mode(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 8 x i16>
-// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
-//
-// CPP-CHECK-LABEL: @_Z45test_svreinterpret_s16_s8_from_streaming_modeu10__SVInt8_t(
-// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 8 x i16>
-// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
-//
-svint16_t test_svreinterpret_s16_s8_from_streaming_mode(svint8_t op) __arm_streaming {
- return SVE_ACLE_FUNC(svreinterpret_s16,_s8,,)(op);
-}
-
diff --git a/clang/test/CodeGen/aarch64-sve-vector-subscript-ops.c b/clang/test/CodeGen/aarch64-sve-vector-subscript-ops.c
index fb60c6d..52a05d0 100644
--- a/clang/test/CodeGen/aarch64-sve-vector-subscript-ops.c
+++ b/clang/test/CodeGen/aarch64-sve-vector-subscript-ops.c
@@ -88,3 +88,25 @@ float subscript_float32(svfloat32_t a, size_t b) {
double subscript_float64(svfloat64_t a, size_t b) {
return a[b];
}
+
+// CHECK-LABEL: @subscript_write_float32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VECINS:%.*]] = insertelement <vscale x 4 x float> [[A:%.*]], float 1.000000e+00, i64 [[B:%.*]]
+// CHECK-NEXT: ret <vscale x 4 x float> [[VECINS]]
+//
+svfloat32_t subscript_write_float32(svfloat32_t a, size_t b) {
+ a[b] = 1.0f;
+ return a;
+}
+
+// CHECK-LABEL: @subscript_read_write_float32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VECEXT:%.*]] = extractelement <vscale x 4 x float> [[A:%.*]], i64 [[B:%.*]]
+// CHECK-NEXT: [[ADD:%.*]] = fadd float [[VECEXT]], 1.000000e+00
+// CHECK-NEXT: [[VECINS:%.*]] = insertelement <vscale x 4 x float> [[A]], float [[ADD]], i64 [[B]]
+// CHECK-NEXT: ret <vscale x 4 x float> [[VECINS]]
+//
+svfloat32_t subscript_read_write_float32(svfloat32_t a, size_t b) {
+ a[b] += 1.0f;
+ return a;
+}
diff --git a/clang/test/CodeGen/assume_attr.c b/clang/test/CodeGen/assume_attr.c
deleted file mode 100644
index 338a625..0000000
--- a/clang/test/CodeGen/assume_attr.c
+++ /dev/null
@@ -1,58 +0,0 @@
-// RUN: %clang_cc1 -emit-llvm -triple i386-linux-gnu %s -o - | FileCheck %s
-// RUN: %clang_cc1 -x c -emit-pch -o %t %s
-// RUN: %clang_cc1 -include-pch %t %s -emit-llvm -o - | FileCheck %s
-
-// TODO: for "foo" and "bar", "after" is not added as it appears "after" the first use or definition respectively. There might be a way to allow that.
-
-// CHECK: define{{.*}} void @bar() #0
-// CHECK: define{{.*}} void @baz() #1
-// CHECK: declare{{.*}} void @foo() #2
-// CHECK: attributes #0
-// CHECK-SAME: "llvm.assume"="bar:before1,bar:before2,bar:before3,bar:def1,bar:def2"
-// CHECK: attributes #1
-// CHECK-SAME: "llvm.assume"="baz:before1,baz:before2,baz:before3,baz:def1,baz:def2,baz:after"
-// CHECK: attributes #2
-// CHECK-SAME: "llvm.assume"="foo:before1,foo:before2,foo:before3"
-
-#ifndef HEADER
-#define HEADER
-
-/// foo: declarations only
-
-__attribute__((assume("foo:before1"))) void foo(void);
-
-__attribute__((assume("foo:before2")))
-__attribute__((assume("foo:before3"))) void
-foo(void);
-
-/// baz: static function declarations and a definition
-
-__attribute__((assume("baz:before1"))) static void baz(void);
-
-__attribute__((assume("baz:before2")))
-__attribute__((assume("baz:before3"))) static void
-baz(void);
-
-// Definition
-__attribute__((assume("baz:def1,baz:def2"))) static void baz(void) { foo(); }
-
-__attribute__((assume("baz:after"))) static void baz(void);
-
-/// bar: external function declarations and a definition
-
-__attribute__((assume("bar:before1"))) void bar(void);
-
-__attribute__((assume("bar:before2")))
-__attribute__((assume("bar:before3"))) void
-bar(void);
-
-// Definition
-__attribute__((assume("bar:def1,bar:def2"))) void bar(void) { baz(); }
-
-__attribute__((assume("bar:after"))) void bar(void);
-
-/// back to foo
-
-__attribute__((assume("foo:after"))) void foo(void);
-
-#endif
diff --git a/clang/test/CodeGen/attr-cpuspecific.c b/clang/test/CodeGen/attr-cpuspecific.c
index 2c3e693..628892d 100644
--- a/clang/test/CodeGen/attr-cpuspecific.c
+++ b/clang/test/CodeGen/attr-cpuspecific.c
@@ -75,8 +75,8 @@ void TwoVersions(void);
// LINUX: define weak_odr ptr @TwoVersions.resolver()
// LINUX: call void @__cpu_indicator_init
// LINUX: %[[FEAT_INIT:.+]] = load i32, ptr getelementptr inbounds ({ i32, i32, i32, [1 x i32] }, ptr @__cpu_model, i32 0, i32 3, i32 0), align 4
-// LINUX: %[[FEAT_JOIN:.+]] = and i32 %[[FEAT_INIT]], 59754495
-// LINUX: %[[FEAT_CHECK:.+]] = icmp eq i32 %[[FEAT_JOIN]], 59754495
+// LINUX: %[[FEAT_JOIN:.+]] = and i32 %[[FEAT_INIT]], 9422847
+// LINUX: %[[FEAT_CHECK:.+]] = icmp eq i32 %[[FEAT_JOIN]], 9422847
// LINUX: ret ptr @TwoVersions.Z
// LINUX: ret ptr @TwoVersions.S
// LINUX: call void @llvm.trap
@@ -85,8 +85,8 @@ void TwoVersions(void);
// WINDOWS: define weak_odr dso_local void @TwoVersions() comdat
// WINDOWS: call void @__cpu_indicator_init()
// WINDOWS: %[[FEAT_INIT:.+]] = load i32, ptr getelementptr inbounds ({ i32, i32, i32, [1 x i32] }, ptr @__cpu_model, i32 0, i32 3, i32 0), align 4
-// WINDOWS: %[[FEAT_JOIN:.+]] = and i32 %[[FEAT_INIT]], 59754495
-// WINDOWS: %[[FEAT_CHECK:.+]] = icmp eq i32 %[[FEAT_JOIN]], 59754495
+// WINDOWS: %[[FEAT_JOIN:.+]] = and i32 %[[FEAT_INIT]], 9422847
+// WINDOWS: %[[FEAT_CHECK:.+]] = icmp eq i32 %[[FEAT_JOIN]], 9422847
// WINDOWS: call void @TwoVersions.Z()
// WINDOWS-NEXT: ret void
// WINDOWS: call void @TwoVersions.S()
@@ -354,7 +354,7 @@ void OrderDispatchUsageSpecific(void) {}
// CHECK: attributes #[[S]] = {{.*}}"target-features"="+avx,+cmov,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt"
// CHECK-SAME: "tune-cpu"="ivybridge"
-// CHECK: attributes #[[K]] = {{.*}}"target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512er,+avx512f,+avx512pf,+bmi,+bmi2,+cmov,+crc32,+cx16,+cx8,+evex512,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prefetchwt1,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt"
+// CHECK: attributes #[[K]] = {{.*}}"target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512f,+bmi,+bmi2,+cmov,+crc32,+cx16,+cx8,+evex512,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt"
// CHECK-SAME: "tune-cpu"="knl"
// CHECK: attributes #[[O]] = {{.*}}"target-features"="+cmov,+cx16,+cx8,+fxsr,+mmx,+movbe,+sahf,+sse,+sse2,+sse3,+ssse3,+x87"
// CHECK-SAME: "tune-cpu"="atom"
diff --git a/clang/test/CodeGen/attr-target-x86.c b/clang/test/CodeGen/attr-target-x86.c
index 3043986..3c2b511 100644
--- a/clang/test/CodeGen/attr-target-x86.c
+++ b/clang/test/CodeGen/attr-target-x86.c
@@ -59,9 +59,9 @@ void __attribute__((target("avx10.1-512"))) avx10_1_512(void) {}
// CHECK: #0 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87" "tune-cpu"="i686"
// CHECK: #1 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cmov,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt"
// CHECK-NOT: tune-cpu
-// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-aes,-avx,-avx10.1-256,-avx10.1-512,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sha512,-sm3,-sm4,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686"
+// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-aes,-avx,-avx10.1-256,-avx10.1-512,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512fp16,-avx512ifma,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sha512,-sm3,-sm4,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686"
// CHECK: #3 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+crc32,+cx8,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "tune-cpu"="i686"
-// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-avx,-avx10.1-256,-avx10.1-512,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-sha512,-sm3,-sm4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686"
+// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-avx,-avx10.1-256,-avx10.1-512,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512fp16,-avx512ifma,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-sha512,-sm3,-sm4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686"
// CHECK: #5 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cmov,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-aes,-avx10.1-256,-avx10.1-512,-vaes"
// CHECK-NOT: tune-cpu
// CHECK: #6 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-3dnow,-3dnowa,-mmx"
diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c
index bcb1596..93a6ab0 100644
--- a/clang/test/CodeGen/builtins-wasm.c
+++ b/clang/test/CodeGen/builtins-wasm.c
@@ -11,6 +11,7 @@ typedef unsigned char u8x16 __attribute((vector_size(16)));
typedef unsigned short u16x8 __attribute((vector_size(16)));
typedef unsigned int u32x4 __attribute((vector_size(16)));
typedef unsigned long long u64x2 __attribute((vector_size(16)));
+typedef __fp16 f16x8 __attribute((vector_size(16)));
typedef float f32x4 __attribute((vector_size(16)));
typedef double f64x2 __attribute((vector_size(16)));
@@ -813,6 +814,17 @@ void store_f16_f32(float val, __fp16 *addr) {
// WEBASSEMBLY-NEXT: ret
}
+f16x8 splat_f16x8(float a) {
+ // WEBASSEMBLY: %0 = tail call <8 x half> @llvm.wasm.splat.f16x8(float %a)
+ // WEBASSEMBLY-NEXT: ret <8 x half> %0
+ return __builtin_wasm_splat_f16x8(a);
+}
+
+float extract_lane_f16x8(f16x8 a, int i) {
+ // WEBASSEMBLY: %0 = tail call float @llvm.wasm.extract.lane.f16x8(<8 x half> %a, i32 %i)
+ // WEBASSEMBLY-NEXT: ret float %0
+ return __builtin_wasm_extract_lane_f16x8(a, i);
+}
__externref_t externref_null() {
return __builtin_wasm_ref_null_extern();
// WEBASSEMBLY: tail call ptr addrspace(10) @llvm.wasm.ref.null.extern()
diff --git a/clang/test/CodeGen/darwin-target-variant.c b/clang/test/CodeGen/darwin-target-variant.c
index 36caaae..9f4b36a 100644
--- a/clang/test/CodeGen/darwin-target-variant.c
+++ b/clang/test/CodeGen/darwin-target-variant.c
@@ -2,5 +2,5 @@
// CHECK: !llvm.module.flags = !{!0, !1, !2
// CHECK: !0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 1]}
-// CHECK: !1 = !{i32 4, !"darwin.target_variant.triple", !"x86_64-apple-ios14-macabi"}
+// CHECK: !1 = !{i32 2, !"darwin.target_variant.triple", !"x86_64-apple-ios14-macabi"}
// CHECK: !2 = !{i32 2, !"darwin.target_variant.SDK Version", [2 x i32] [i32 14, i32 1]}
diff --git a/clang/test/CodeGen/fat-lto-objects.c b/clang/test/CodeGen/fat-lto-objects.c
index b50567c..36a7368 100644
--- a/clang/test/CodeGen/fat-lto-objects.c
+++ b/clang/test/CodeGen/fat-lto-objects.c
@@ -62,7 +62,7 @@
// ELF: .llvm.lto
-// ASM: .section .llvm.lto,"e",@progbits
+// ASM: .section .llvm.lto,"e",@llvm_lto
// ASM-NEXT: .Lllvm.embedded.object:
// ASM-NEXT: .asciz "BC
// ASM-NEXT: .size .Lllvm.embedded.object
diff --git a/clang/test/CodeGen/function-target-features.c b/clang/test/CodeGen/function-target-features.c
index 0d8bfc7..d6a73ff 100644
--- a/clang/test/CodeGen/function-target-features.c
+++ b/clang/test/CodeGen/function-target-features.c
@@ -4,7 +4,7 @@
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -target-feature +avx | FileCheck %s -check-prefix=AVX-FEATURE
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -target-feature +avx | FileCheck %s -check-prefix=AVX-NO-CPU
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -target-feature +avx512f -target-feature +avx512er | FileCheck %s -check-prefix=TWO-AVX
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -target-feature +avx512f -target-feature +avx512bw | FileCheck %s -check-prefix=TWO-AVX
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -target-cpu corei7 | FileCheck %s -check-prefix=CORE-CPU
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -target-cpu corei7 -target-feature +avx | FileCheck %s -check-prefix=CORE-CPU-AND-FEATURES
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -target-cpu x86-64 | FileCheck %s -check-prefix=X86-64-CPU
@@ -17,7 +17,7 @@ void foo(void) {}
// AVX-FEATURE: "target-features"{{.*}}+avx
// AVX-NO-CPU-NOT: target-cpu
-// TWO-AVX: "target-features"={{.*}}+avx512er{{.*}}+avx512f
+// TWO-AVX: "target-features"={{.*}}+avx512bw{{.*}}+avx512f
// CORE-CPU: "target-cpu"="corei7"
// CORE-CPU-AND-FEATURES: "target-cpu"="corei7" "target-features"={{.*}}+avx
// X86-64-CPU: "target-cpu"="x86-64"
diff --git a/clang/test/CodeGen/functions.c b/clang/test/CodeGen/functions.c
index 1bbaa80..0cc999a 100644
--- a/clang/test/CodeGen/functions.c
+++ b/clang/test/CodeGen/functions.c
@@ -61,3 +61,15 @@ static void test9_helper(void) {}
void test9(void) {
(void) test9_helper;
}
+
+// PR88917: don't crash
+int b();
+
+int main() {
+ return b(b);
+ // CHECK: call i32 @b(ptr noundef @b)
+}
+int b(int (*f)()){
+ return 0;
+}
+// CHECK-LABEL: define{{.*}} i32 @b(ptr noundef %f)
diff --git a/clang/test/CodeGen/target-builtin-noerror.c b/clang/test/CodeGen/target-builtin-noerror.c
index b438e50..2e16fd8 100644
--- a/clang/test/CodeGen/target-builtin-noerror.c
+++ b/clang/test/CodeGen/target-builtin-noerror.c
@@ -68,8 +68,6 @@ void verifyfeaturestrings(void) {
(void)__builtin_cpu_supports("avx512bw");
(void)__builtin_cpu_supports("avx512dq");
(void)__builtin_cpu_supports("avx512cd");
- (void)__builtin_cpu_supports("avx512er");
- (void)__builtin_cpu_supports("avx512pf");
(void)__builtin_cpu_supports("avx512vbmi");
(void)__builtin_cpu_supports("avx512ifma");
(void)__builtin_cpu_supports("avx5124vnniw");
diff --git a/clang/test/CodeGenCXX/assume_attr.cpp b/clang/test/CodeGenCXX/assume_attr.cpp
index dbe76501..962dcc4 100644
--- a/clang/test/CodeGenCXX/assume_attr.cpp
+++ b/clang/test/CodeGenCXX/assume_attr.cpp
@@ -8,77 +8,77 @@
/// foo: declarations only
-__attribute__((assume("foo:before1"))) void foo();
+[[omp::assume("foo:before1")]] void foo();
-__attribute__((assume("foo:before2")))
-__attribute__((assume("foo:before3"))) void
+[[omp::assume("foo:before2")]]
+[[omp::assume("foo:before3")]] void
foo();
/// baz: static function declarations and a definition
-__attribute__((assume("baz:before1"))) static void baz();
+[[omp::assume("baz:before1")]] static void baz();
-__attribute__((assume("baz:before2")))
-__attribute__((assume("baz:before3"))) static void
+[[omp::assume("baz:before2")]]
+[[omp::assume("baz:before3")]] static void
baz();
// Definition
-__attribute__((assume("baz:def1,baz:def2"))) static void baz() { foo(); }
+[[omp::assume("baz:def1,baz:def2")]] static void baz() { foo(); }
-__attribute__((assume("baz:after"))) static void baz();
+[[omp::assume("baz:after")]] static void baz();
/// bar: external function declarations and a definition
-__attribute__((assume("bar:before1"))) void bar();
+[[omp::assume("bar:before1")]] void bar();
-__attribute__((assume("bar:before2")))
-__attribute__((assume("bar:before3"))) void
+[[omp::assume("bar:before2")]]
+[[omp::assume("bar:before3")]] void
bar();
// Definition
-__attribute__((assume("bar:def1,bar:def2"))) void bar() { baz(); }
+[[omp::assume("bar:def1,bar:def2")]] void bar() { baz(); }
-__attribute__((assume("bar:after"))) void bar();
+[[omp::assume("bar:after")]] void bar();
/// back to foo
-__attribute__((assume("foo:after"))) void foo();
+[[omp::assume("foo:after")]] void foo();
/// class tests
class C {
- __attribute__((assume("C:private_method"))) void private_method();
- __attribute__((assume("C:private_static"))) static void private_static();
+ [[omp::assume("C:private_method")]] void private_method();
+ [[omp::assume("C:private_static")]] static void private_static();
public:
- __attribute__((assume("C:public_method1"))) void public_method();
- __attribute__((assume("C:public_static1"))) static void public_static();
+ [[omp::assume("C:public_method1")]] void public_method();
+ [[omp::assume("C:public_static1")]] static void public_static();
};
-__attribute__((assume("C:public_method2"))) void C::public_method() {
+[[omp::assume("C:public_method2")]] void C::public_method() {
private_method();
}
-__attribute__((assume("C:public_static2"))) void C::public_static() {
+[[omp::assume("C:public_static2")]] void C::public_static() {
private_static();
}
/// template tests
template <typename T>
-__attribute__((assume("template_func<T>"))) void template_func() {}
+[[omp::assume("template_func<T>")]] void template_func() {}
template <>
-__attribute__((assume("template_func<float>"))) void template_func<float>() {}
+[[omp::assume("template_func<float>")]] void template_func<float>() {}
template <>
void template_func<int>() {}
template <typename T>
struct S {
- __attribute__((assume("S<T>::method"))) void method();
+ [[omp::assume("S<T>::method")]] void method();
};
template <>
-__attribute__((assume("S<float>::method"))) void S<float>::method() {}
+[[omp::assume("S<float>::method")]] void S<float>::method() {}
template <>
void S<int>::method() {}
diff --git a/clang/test/CodeGenCXX/cxx1y-sized-deallocation.cpp b/clang/test/CodeGenCXX/cxx1y-sized-deallocation.cpp
index 4e15657..55913aff 100644
--- a/clang/test/CodeGenCXX/cxx1y-sized-deallocation.cpp
+++ b/clang/test/CodeGenCXX/cxx1y-sized-deallocation.cpp
@@ -1,12 +1,12 @@
// Check that delete exprs call the sized deallocation function if
-// -fsized-deallocation is passed in both C++11 and C++14.
+// -fsized-deallocation is passed in C++11 or std >= C++14.
// RUN: %clang_cc1 -std=c++11 -fsized-deallocation %s -emit-llvm -triple x86_64-linux-gnu -o - | FileCheck %s
-// RUN: %clang_cc1 -std=c++14 -fsized-deallocation %s -emit-llvm -triple x86_64-linux-gnu -o - | FileCheck %s
+// RUN: %clang_cc1 -std=c++14 %s -emit-llvm -triple x86_64-linux-gnu -o - | FileCheck %s
-// Check that we don't used sized deallocation without -fsized-deallocation and
-// C++14.
+// Check that we don't use sized deallocation with -fno-sized-deallocation or without C++14.
// RUN: %clang_cc1 -std=c++11 %s -emit-llvm -triple x86_64-linux-gnu -o - | FileCheck %s --check-prefix=CHECK-UNSIZED
-// RUN: %clang_cc1 -std=c++14 %s -emit-llvm -triple x86_64-linux-gnu -o - | FileCheck %s --check-prefix=CHECK-UNSIZED
+// RUN: %clang_cc1 -std=c++14 %s -emit-llvm -triple x86_64-linux-gnu -fno-sized-deallocation -o - \
+// RUN: | FileCheck %s --check-prefix=CHECK-UNSIZED
// CHECK-UNSIZED-NOT: _ZdlPvm
// CHECK-UNSIZED-NOT: _ZdaPvm
diff --git a/clang/test/CodeGenCXX/cxx1z-aligned-allocation.cpp b/clang/test/CodeGenCXX/cxx1z-aligned-allocation.cpp
index ab2e4b3..8823bc6 100644
--- a/clang/test/CodeGenCXX/cxx1z-aligned-allocation.cpp
+++ b/clang/test/CodeGenCXX/cxx1z-aligned-allocation.cpp
@@ -1,10 +1,10 @@
// Check that delete exprs call aligned (de)allocation functions if
// -faligned-allocation is passed in both C++11 and C++14.
// RUN: %clang_cc1 -std=c++11 -fexceptions -fsized-deallocation -faligned-allocation %s -emit-llvm -triple x86_64-linux-gnu -o - | FileCheck %s
-// RUN: %clang_cc1 -std=c++14 -fexceptions -fsized-deallocation -faligned-allocation %s -emit-llvm -triple x86_64-linux-gnu -o - | FileCheck %s
-// RUN: %clang_cc1 -std=c++1z -fexceptions -fsized-deallocation %s -emit-llvm -triple x86_64-linux-gnu -o - | FileCheck %s
+// RUN: %clang_cc1 -std=c++14 -fexceptions -faligned-allocation %s -emit-llvm -triple x86_64-linux-gnu -o - | FileCheck %s
+// RUN: %clang_cc1 -std=c++1z -fexceptions %s -emit-llvm -triple x86_64-linux-gnu -o - | FileCheck %s
-// RUN: %clang_cc1 -std=c++1z -fexceptions -fsized-deallocation %s -emit-llvm -triple x86_64-windows-msvc -o - | FileCheck %s --check-prefix=CHECK-MS
+// RUN: %clang_cc1 -std=c++1z -fexceptions %s -emit-llvm -triple x86_64-windows-msvc -o - | FileCheck %s --check-prefix=CHECK-MS
// Check that we don't use aligned (de)allocation without -faligned-allocation or C++1z.
// RUN: %clang_cc1 -std=c++14 -DUNALIGNED -fexceptions %s -emit-llvm -triple x86_64-linux-gnu -o - | FileCheck %s --check-prefix=CHECK-UNALIGNED
diff --git a/clang/test/CodeGenCXX/cxx2a-destroying-delete.cpp b/clang/test/CodeGenCXX/cxx2a-destroying-delete.cpp
index 20264b6..f6f4a2f 100644
--- a/clang/test/CodeGenCXX/cxx2a-destroying-delete.cpp
+++ b/clang/test/CodeGenCXX/cxx2a-destroying-delete.cpp
@@ -108,10 +108,10 @@ struct J {
// CHECK-MSABI-LABEL: define {{.*}}@"?j@@
J *j() {
// CHECK-ITANIUM: invoke {{.*}}@_ZN1JC1Ev(
- // CHECK-ITANIUM: call {{.*}}@_ZdlPv(
+ // CHECK-ITANIUM: call {{.*}}@_ZdlPvm(
// CHECK-NOT: }
// CHECK-MSABI: invoke {{.*}}@"??0J@@Q{{AE|EAA}}@XZ"(
- // CHECK-MSABI: call {{.*}}@"??3@YAXP{{E?}}AX@Z"(
+ // CHECK-MSABI: call {{.*}}@"??3@YAXP{{E?}}AX{{I|_K}}@Z"(
return new J;
// CHECK: }
}
diff --git a/clang/test/CodeGenCXX/cxx2b-deducing-this.cpp b/clang/test/CodeGenCXX/cxx2b-deducing-this.cpp
index b755e80..649fe2a 100644
--- a/clang/test/CodeGenCXX/cxx2b-deducing-this.cpp
+++ b/clang/test/CodeGenCXX/cxx2b-deducing-this.cpp
@@ -182,3 +182,66 @@ auto dothing(int num)
fun();
}
}
+
+namespace GH87210 {
+template <typename... Ts>
+struct Overloaded : Ts... {
+ using Ts::operator()...;
+};
+
+template <typename... Ts>
+Overloaded(Ts...) -> Overloaded<Ts...>;
+
+// CHECK-LABEL: define dso_local void @_ZN7GH872101fEv()
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[X:%.*]] = alloca i32
+// CHECK-NEXT: [[Over:%.*]] = alloca %"{{.*}}Overloaded"
+// CHECK: call noundef ptr @"_ZZN7GH872101fEvENH3$_0clINS_10OverloadedIJS0_EEEEEDaRT_"(ptr {{.*}} [[Over]])
+void f() {
+ int x;
+ Overloaded o {
+ // CHECK: define internal noundef ptr @"_ZZN7GH872101fEvENH3$_0clINS_10OverloadedIJS0_EEEEEDaRT_"(ptr {{.*}} [[Self:%.*]])
+ // CHECK-NEXT: entry:
+ // CHECK-NEXT: [[SelfAddr:%.*]] = alloca ptr
+ // CHECK-NEXT: store ptr [[Self]], ptr [[SelfAddr]]
+ // CHECK-NEXT: [[SelfPtr:%.*]] = load ptr, ptr [[SelfAddr]]
+ // CHECK-NEXT: [[XRef:%.*]] = getelementptr inbounds %{{.*}}, ptr [[SelfPtr]], i32 0, i32 0
+ // CHECK-NEXT: [[X:%.*]] = load ptr, ptr [[XRef]]
+ // CHECK-NEXT: ret ptr [[X]]
+ [&](this auto& self) {
+ return &x;
+ }
+ };
+ o();
+}
+
+void g() {
+ int x;
+ Overloaded o {
+ [=](this auto& self) {
+ return x;
+ }
+ };
+ o();
+}
+}
+
+namespace GH89541 {
+// Same as above; just check that this doesn't crash.
+int one = 1;
+auto factory(int& x = one) {
+ return [&](this auto self) {
+ x;
+ };
+};
+
+using Base = decltype(factory());
+struct Derived : Base {
+ Derived() : Base(factory()) {}
+};
+
+void f() {
+ Derived d;
+ d();
+}
+}
diff --git a/clang/test/CodeGenCXX/delete-two-arg.cpp b/clang/test/CodeGenCXX/delete-two-arg.cpp
index 552634f..a0dcd03 100644
--- a/clang/test/CodeGenCXX/delete-two-arg.cpp
+++ b/clang/test/CodeGenCXX/delete-two-arg.cpp
@@ -43,7 +43,9 @@ namespace test2 {
// CHECK-NEXT: br i1 [[T1]],
// CHECK: [[T3:%.*]] = getelementptr inbounds i8, ptr [[T0]], i32 -4
// CHECK-NEXT: [[T5:%.*]] = load i32, ptr [[T3]]
- // CHECK-NEXT: call void @_ZdaPv(ptr noundef [[T3]])
+ // CHECK-NEXT: [[T6:%.*]] = mul i32 4, [[T5]]
+ // CHECK-NEXT: [[T7:%.*]] = add i32 [[T6]], 4
+ // CHECK-NEXT: call void @_ZdaPvj(ptr noundef [[T3]], i32 noundef [[T7]])
// CHECK-NEXT: br label
::delete[] p;
}
diff --git a/clang/test/CodeGenCXX/delete.cpp b/clang/test/CodeGenCXX/delete.cpp
index 1a418f4..d5b0dc6 100644
--- a/clang/test/CodeGenCXX/delete.cpp
+++ b/clang/test/CodeGenCXX/delete.cpp
@@ -16,7 +16,7 @@ void t3(S *s) {
// CHECK: icmp {{.*}} null
// CHECK: br i1
- // CHECK: call void @_ZdlPv
+ // CHECK: call void @_ZdlPvm
// Check the delete is inside the 'if !null' check unless we're optimizing
// for size. FIXME: We could omit the branch entirely in this case.
@@ -35,7 +35,7 @@ struct T {
void t4(T *t) {
// CHECK: call void @_ZN1TD1Ev
// CHECK-SIZE-NEXT: br
- // CHECK: call void @_ZdlPv
+ // CHECK: call void @_ZdlPvm
delete t;
}
@@ -93,14 +93,16 @@ namespace test1 {
// CHECK-NEXT: call void @_ZN5test11AD1Ev(ptr {{[^,]*}} [[CUR]])
// CHECK-NEXT: [[ISDONE:%.*]] = icmp eq ptr [[CUR]], [[BEGIN]]
// CHECK-NEXT: br i1 [[ISDONE]]
- // CHECK: call void @_ZdaPv(ptr noundef [[ALLOC]])
+ // CHECK: [[MUL:%.*]] = mul i64 4, [[COUNT]]
+ // CHECK-NEXT: [[SIZE:%.*]] = add i64 [[MUL]], 8
+ // CHECK-NEXT: call void @_ZdaPvm(ptr noundef [[ALLOC]], i64 noundef [[SIZE]])
}
}
namespace test2 {
// CHECK-LABEL: define{{.*}} void @_ZN5test21fEPb
void f(bool *b) {
- // CHECK: call void @_ZdlPv(ptr
+ // CHECK: call void @_ZdlPvm(ptr{{.*}}i64
delete b;
// CHECK: call void @_ZdaPv(ptr
delete [] b;
@@ -137,7 +139,7 @@ namespace test4 {
// CHECK-NEXT: [[DTOR:%.*]] = load ptr, ptr [[T0]]
// CHECK-NEXT: call void [[DTOR]](ptr {{[^,]*}} [[OBJ:%.*]])
// Call the global operator delete.
- // CHECK-NEXT: call void @_ZdlPv(ptr noundef [[ALLOCATED]]) [[NUW:#[0-9]+]]
+ // CHECK-NEXT: call void @_ZdlPvm(ptr noundef [[ALLOCATED]], i64 noundef 8) [[NUW:#[0-9]+]]
::delete xp;
}
}
diff --git a/clang/test/CodeGenCXX/dllimport.cpp b/clang/test/CodeGenCXX/dllimport.cpp
index 6fec2f2..484866b 100644
--- a/clang/test/CodeGenCXX/dllimport.cpp
+++ b/clang/test/CodeGenCXX/dllimport.cpp
@@ -205,7 +205,7 @@ USEVAR(VarTmpl<ExplicitSpec_Imported>)
// Functions
//===----------------------------------------------------------------------===//
-// GNU-DAG: declare dso_local void @_ZdlPv(ptr)
+// GNU-DAG: declare dso_local void @_ZdlPv{{j|y}}(ptr, i{{32|64}})
// Import function declaration.
// MSC-DAG: declare dllimport void @"?decl@@YAXXZ"()
@@ -358,7 +358,7 @@ __declspec(dllimport) void operator delete(void*);
__declspec(dllimport) inline int *ReferencingImportedNew() { return new int[2]; }
// MO1-DAG: define available_externally dllimport ptr @"?ReferencingImportedNew@@YAPAHXZ"
__declspec(dllimport) inline int *ReferencingImportedDelete() { delete (int*)nullptr; }
-// MO1-DAG: define available_externally dllimport ptr @"?ReferencingImportedDelete@@YAPAHXZ"
+// MO1-DAG: declare dllimport ptr @"?ReferencingImportedDelete@@YAPAHXZ"
USE(ReferencingImportedNew)
USE(ReferencingImportedDelete)
struct ClassWithDtor { ~ClassWithDtor() {} };
diff --git a/clang/test/CodeGenCXX/fmv-namespace.cpp b/clang/test/CodeGenCXX/fmv-namespace.cpp
new file mode 100644
index 0000000..5bcd0da
--- /dev/null
+++ b/clang/test/CodeGenCXX/fmv-namespace.cpp
@@ -0,0 +1,93 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --version 5
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -emit-llvm %s -o - | FileCheck %s
+
+namespace Name {
+int __attribute((target_version("default"))) foo() { return 0; }
+}
+
+namespace Name {
+int __attribute((target_version("sve"))) foo() { return 1; }
+}
+
+int bar() { return Name::foo(); }
+
+namespace OtherName {
+int __attribute((target_version("sve"))) foo() { return 2; }
+}
+
+int baz() { return OtherName::foo(); }
+
+//.
+// CHECK: @__aarch64_cpu_features = external dso_local global { i64 }
+// CHECK: @_ZN4Name3fooEv.ifunc = weak_odr alias i32 (), ptr @_ZN4Name3fooEv
+// CHECK: @_ZN9OtherName3fooEv.ifunc = weak_odr alias i32 (), ptr @_ZN9OtherName3fooEv
+// CHECK: @_ZN4Name3fooEv = weak_odr ifunc i32 (), ptr @_ZN4Name3fooEv.resolver
+// CHECK: @_ZN9OtherName3fooEv = weak_odr ifunc i32 (), ptr @_ZN9OtherName3fooEv.resolver
+//.
+// CHECK-LABEL: define dso_local noundef i32 @_ZN4Name3fooEv.default(
+// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret i32 0
+//
+//
+// CHECK-LABEL: define dso_local noundef i32 @_ZN4Name3fooEv._Msve(
+// CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret i32 1
+//
+//
+// CHECK-LABEL: define dso_local noundef i32 @_Z3barv(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @_ZN4Name3fooEv()
+// CHECK-NEXT: ret i32 [[CALL]]
+//
+//
+// CHECK-LABEL: define weak_odr ptr @_ZN4Name3fooEv.resolver() comdat {
+// CHECK-NEXT: [[RESOLVER_ENTRY:.*:]]
+// CHECK-NEXT: call void @__init_cpu_features_resolver()
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1073741824
+// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1073741824
+// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT: br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]]
+// CHECK: [[RESOLVER_RETURN]]:
+// CHECK-NEXT: ret ptr @_ZN4Name3fooEv._Msve
+// CHECK: [[RESOLVER_ELSE]]:
+// CHECK-NEXT: ret ptr @_ZN4Name3fooEv.default
+//
+//
+// CHECK-LABEL: define dso_local noundef i32 @_ZN9OtherName3fooEv._Msve(
+// CHECK-SAME: ) #[[ATTR1]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret i32 2
+//
+//
+// CHECK-LABEL: define dso_local noundef i32 @_Z3bazv(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @_ZN9OtherName3fooEv()
+// CHECK-NEXT: ret i32 [[CALL]]
+//
+//
+// CHECK-LABEL: define weak_odr ptr @_ZN9OtherName3fooEv.resolver() comdat {
+// CHECK-NEXT: [[RESOLVER_ENTRY:.*:]]
+// CHECK-NEXT: call void @__init_cpu_features_resolver()
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1073741824
+// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1073741824
+// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT: br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]]
+// CHECK: [[RESOLVER_RETURN]]:
+// CHECK-NEXT: ret ptr @_ZN9OtherName3fooEv._Msve
+// CHECK: [[RESOLVER_ELSE]]:
+// CHECK-NEXT: ret ptr @_ZN9OtherName3fooEv.default
+//
+//.
+// CHECK: attributes #[[ATTR0]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// CHECK: attributes #[[ATTR1]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve" }
+// CHECK: attributes #[[ATTR2:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+//.
+// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
diff --git a/clang/test/CodeGenCXX/new.cpp b/clang/test/CodeGenCXX/new.cpp
index e278d9a..af22552 100644
--- a/clang/test/CodeGenCXX/new.cpp
+++ b/clang/test/CodeGenCXX/new.cpp
@@ -15,7 +15,7 @@ void t1() {
}
// CHECK: declare noundef nonnull ptr @_Znwm(i64 noundef) [[ATTR_NOBUILTIN:#[^ ]*]]
-// CHECK: declare void @_ZdlPv(ptr noundef) [[ATTR_NOBUILTIN_NOUNWIND:#[^ ]*]]
+// CHECK: declare void @_ZdlPvm(ptr noundef, i64 noundef) [[ATTR_NOBUILTIN_NOUNWIND:#[^ ]*]]
// CHECK: declare noundef nonnull ptr @_Znam(i64 noundef) [[ATTR_NOBUILTIN]]
// CHECK: declare void @_ZdaPv(ptr noundef) [[ATTR_NOBUILTIN_NOUNWIND]]
@@ -192,7 +192,7 @@ void f() {
// CHECK: store i64 200
delete[] new (nothrow) Alloc[10][20];
// CHECK: call noalias noundef nonnull ptr @_Znwm
- // CHECK: call void @_ZdlPv(ptr
+ // CHECK: call void @_ZdlPvm(ptr noundef {{%.*}}, i64 noundef 1)
delete new bool;
// CHECK: ret void
}
@@ -317,7 +317,7 @@ namespace N3664 {
void f() {
// CHECK: call noalias noundef nonnull ptr @_Znwm(i64 noundef 4) [[ATTR_BUILTIN_NEW:#[^ ]*]]
int *p = new int; // expected-note {{allocated with 'new' here}}
- // CHECK: call void @_ZdlPv({{.*}}) [[ATTR_BUILTIN_DELETE:#[^ ]*]]
+ // CHECK: call void @_ZdlPvm({{.*}}) [[ATTR_BUILTIN_DELETE:#[^ ]*]]
delete p;
// CHECK: call noalias noundef nonnull ptr @_Znam(i64 noundef 12) [[ATTR_BUILTIN_NEW]]
diff --git a/clang/test/CodeGenCXX/ps-dllstorage-vtable-rtti.cpp b/clang/test/CodeGenCXX/ps-dllstorage-vtable-rtti.cpp
new file mode 100644
index 0000000..377e579
--- /dev/null
+++ b/clang/test/CodeGenCXX/ps-dllstorage-vtable-rtti.cpp
@@ -0,0 +1,114 @@
+/// For a class that has a vtable and typeinfo symbol for RTTI, if a user marks
+/// either:
+///
+/// (a) The entire class as dllexport (dllimport)
+/// (b) Any non-inline method of the class as dllexport (dllimport)
+///
+/// then Clang must export the vtable and typeinfo symbol from the TU where they
+/// are defined (the TU containing the definition of the Itanium C++ ABI "key
+/// function") and must import them in other modules where they are referenced.
+
+// RUN: %clang_cc1 -I%S -fdeclspec -triple x86_64-unknown-windows-itanium -emit-llvm -o - %s -fhalf-no-semantic-interposition \
+// RUN: | FileCheck %s -check-prefix=WI
+// RUN: %clang_cc1 -I%S -fdeclspec -triple x86_64-scei-windows-itanium -emit-llvm -o - %s -fhalf-no-semantic-interposition \
+// RUN: | FileCheck %s --check-prefixes=PS
+// RUN: %clang_cc1 -I%S -fdeclspec -triple x86_64-scei-ps4 -emit-llvm -o - %s -fhalf-no-semantic-interposition \
+// RUN: | FileCheck %s --check-prefixes=PS
+// RUN: %clang_cc1 -I%S -fdeclspec -triple x86_64-sie-ps5 -emit-llvm -o - %s -fhalf-no-semantic-interposition \
+// RUN: | FileCheck %s --check-prefixes=PS
+
+#include <typeinfo>
+
+/// Case (a) -- Import Aspect
+/// The entire class is imported. The typeinfo symbol must also be imported, but
+/// the vtable will not be referenced, and so does not need to be imported.
+
+// PS-DAG: @_ZTI10FullImport = {{.*}}dllimport
+// WI-DAG: @_ZTI10FullImport = external dllimport constant ptr
+struct __declspec(dllimport) FullImport {
+ virtual void inlineFunc() const {}
+ virtual void key();
+ virtual void func();
+};
+
+/// 'FullImport::key()' is the key function, so the vtable and typeinfo symbol
+/// of 'FullImport' will be defined in the TU that contains the definition of
+/// 'key()' (and they must be exported from there).
+void FullImportTest() { typeid(FullImport).name(); }
+
+/// Case (a) -- Export Aspect
+/// The entire class is exported. The vtable and typeinfo symbols must also be
+/// exported.
+
+// PS-DAG: @_ZTV10FullExport = {{.*}}dllexport
+// WI-DAG: @_ZTV10FullExport = {{.*}}dllexport
+// PS-DAG: @_ZTI10FullExport = {{.*}}dllexport
+// WI-DAG: @_ZTI10FullExport = dso_local dllexport constant {
+struct __declspec(dllexport) FullExport {
+ virtual void inlineFunc() const {}
+ virtual void key();
+ virtual void func();
+};
+
+/// This is the key function of the class 'FullExport', so the vtable and
+/// typeinfo symbols of 'FullExport' will be defined in this TU, and so they
+/// must be exported from this TU.
+void FullExport::key() { typeid(FullExport).name(); }
+
+/// Case (b) -- Import Aspect
+/// The class as a whole is not imported, but a non-inline method of the class
+/// is, so the vtable and typeinfo symbol must be imported.
+
+// PS-DAG: @_ZTV10PartImport = {{.*}}dllimport
+// WI-DAG: @_ZTV10PartImport = external dso_local unnamed_addr constant {
+// PS-DAG: @_ZTI10PartImport = {{.*}}dllimport
+// WI-DAG: @_ZTI10PartImport = external dso_local constant ptr
+struct PartImport {
+ virtual void inlineFunc() const {}
+ virtual void key();
+ __declspec(dllimport) virtual void func();
+};
+
+/// 'PartImport::key()' is the key function, so the vtable and typeinfo symbol
+/// of 'PartImport' will be defined in the TU that contains the definition of
+/// 'key()' (and they must be exported from there). Here, we will reference the
+/// vtable and typeinfo symbol, so we must also import them.
+void PartImportTest() {
+ PartImport f;
+ typeid(PartImport).name();
+}
+
+/// Case (b) -- Export Aspect
+/// The class as a whole is not exported, but a non-inline method of the class
+/// is, so the vtable and typeinfo symbol must be exported.
+
+// PS-DAG: @_ZTV10PartExport = {{.*}}dllexport
+// WI-DAG: @_ZTV10PartExport = dso_local unnamed_addr constant {
+// PS-DAG: @_ZTI10PartExport = {{.*}}dllexport
+// WI-DAG: @_ZTI10PartExport = dso_local constant {
+struct PartExport {
+ virtual void inlineFunc() const {}
+ virtual void key();
+ __declspec(dllexport) virtual void func();
+};
+
+/// This is the key function of the class 'PartExport', so the vtable and
+/// typeinfo symbol of 'PartExport' will be defined in this TU, and so they must
+/// be exported from this TU.
+void PartExport::key() { typeid(PartExport).name(); }
+
+/// Case (b) -- Export Aspect
+/// The class as a whole is not exported, but the constructor of the class
+/// is, so the vtable and typeinfo symbol must be exported.
+
+// PS-DAG: @_ZTV10ConsExport = {{.*}}dllexport
+// WI-DAG: @_ZTV10ConsExport = dso_local unnamed_addr constant {
+// PS-DAG: @_ZTI10ConsExport = {{.*}}dllexport
+// WI-DAG: @_ZTI10ConsExport = dso_local constant {
+struct ConsExport {
+ __declspec(dllexport) ConsExport();
+ virtual void key();
+};
+
+ConsExport::ConsExport() {}
+void ConsExport::key() { typeid(ConsExport).name(); }
diff --git a/clang/test/CodeGenCXX/ps4-dllstorage-vtable-rtti.cpp b/clang/test/CodeGenCXX/ps4-dllstorage-vtable-rtti.cpp
deleted file mode 100644
index 5724e78..0000000
--- a/clang/test/CodeGenCXX/ps4-dllstorage-vtable-rtti.cpp
+++ /dev/null
@@ -1,211 +0,0 @@
-// For a class that has a vtable (and hence, also has a typeinfo symbol for
-// RTTI), if a user marks either:
-//
-// (a) the entire class as dllexport (dllimport), or
-// (b) all non-inline virtual methods of the class as dllexport (dllimport)
-//
-// then Clang must export the vtable and typeinfo symbol from the TU where they
-// are defined (the TU containing the definition of the Itanium C++ ABI "key
-// function"), and must import them in other modules where they are referenced.
-//
-// Conversely to point (b), if some (but not all) of the non-inline virtual
-// methods of a class are marked as dllexport (dllimport), then the vtable and
-// typeinfo symbols must not be exported (imported). This will result in a
-// link-time failure when linking the importing module. This link-time failure
-// is the desired behavior, because the Microsoft toolchain also gets a
-// link-time failure in these cases (and since __declspec(dllexport)
-// (__declspec(dllimport)) is a Microsoft extension, our intention is to mimic
-// that Microsoft behavior).
-//
-// Side note: It is within the bodies of constructors (and in some cases,
-// destructors) that the vtable is explicitly referenced. In case (a) above,
-// where the entire class is exported (imported), then all constructors (among
-// other things) are exported (imported). So for that situation, an importing
-// module for a well-formed program will not actually reference the vtable,
-// since constructor calls will all be to functions external to that module
-// (and imported into it, from the exporting module). I.e., all vtable
-// references will be in that module where the constructor and destructor
-// bodies are, therefore, there will not be a need to import the vtable in
-// that case.
-//
-// This test contains 6 test classes:
-// 2 for point (a),
-// 2 for point (b),
-// and 2 negative tests for the converse of point (b).
-//
-// The two tests for each of these points are one for importing, and one for
-// exporting.
-
-// RUN: %clang_cc1 -I%S -fdeclspec -triple x86_64-unknown-windows-itanium -emit-llvm -o - %s -fhalf-no-semantic-interposition | FileCheck %s -check-prefix=WI
-// RUN: %clang_cc1 -I%S -fdeclspec -triple x86_64-scei-windows-itanium -emit-llvm -o - %s -fhalf-no-semantic-interposition | FileCheck %s --check-prefixes=PS4,SCEI_WI
-// RUN: %clang_cc1 -I%S -fdeclspec -triple x86_64-scei-ps4 -emit-llvm -o - %s -fhalf-no-semantic-interposition | FileCheck %s --check-prefixes=PS4,SCEI_PS4
-// RUN: %clang_cc1 -I%S -fdeclspec -triple x86_64-sie-ps5 -emit-llvm -o - %s -fhalf-no-semantic-interposition | FileCheck %s --check-prefixes=PS4,SCEI_PS4
-
-#include <typeinfo>
-
-// Case (a) -- Import Aspect
-// The entire class is imported. The typeinfo symbol must also be imported,
-// but the vtable will not be referenced, and so does not need to be imported
-// (as described in the "Side note", above).
-//
-// PS4-DAG: @_ZTI10FullImport = {{.*}}dllimport
-// WI-DAG: @_ZTI10FullImport = external dllimport constant ptr
-struct __declspec(dllimport) FullImport
-{
- virtual void getId() {}
- virtual void Bump();
- virtual void Decrement();
-};
-
-// 'FullImport::Bump()' is the key function, so the vtable and typeinfo symbol
-// of 'FullImport' will be defined in the TU that contains the definition of
-// 'Bump()' (and they must be exported from there).
-void FullImportTest()
-{
- typeid(FullImport).name();
-}
-
-///////////////////////////////////////////////////////////////////
-
-// Case (a) -- Export Aspect
-// The entire class is exported. The vtable and typeinfo symbols must also be
-// exported,
-//
-// PS4-DAG: @_ZTV10FullExport ={{.*}}dllexport
-// WI-DAG: @_ZTV10FullExport ={{.*}}dllexport
-// PS4-DAG: @_ZTI10FullExport ={{.*}}dllexport
-// WI-DAG: @_ZTI10FullExport = dso_local dllexport constant {
-struct __declspec(dllexport) FullExport // Easy case: Entire class is exported.
-{
- virtual void getId() {}
- virtual void Bump();
- virtual void Decrement();
-};
-
-// This is the key function of the class 'FullExport', so the vtable and
-// typeinfo symbols of 'FullExport' will be defined in this TU, and so they
-// must be exported from this TU.
-void FullExport::Bump()
-{
- typeid(FullExport).name();
-}
-
-///////////////////////////////////////////////////////////////////
-
-// Case (b) -- Import Aspect
-// The class as a whole is not imported, but all non-inline virtual methods of
-// the class are, so the vtable and typeinfo symbol must be imported.
-//
-// PS4-DAG: @_ZTV9FooImport ={{.*}}dllimport
-// WI-DAG: @_ZTV9FooImport = linkonce_odr dso_local unnamed_addr constant {
-// PS4-DAG: @_ZTI9FooImport ={{.*}}dllimport
-// WI-DAG: @_ZTI9FooImport = linkonce_odr dso_local constant {
-
-
-struct FooImport
-{
- virtual void getId() const {}
- __declspec(dllimport) virtual void Bump();
- __declspec(dllimport) virtual void Decrement();
-};
-
-// 'FooImport::Bump()' is the key function, so the vtable and typeinfo symbol
-// of 'FooImport' will be defined in the TU that contains the definition of
-// 'Bump()' (and they must be exported from there). Here, we will reference
-// the vtable and typeinfo symbol, so we must also import them.
-void importTest()
-{
- typeid(FooImport).name();
-}
-
-///////////////////////////////////////////////////////////////////
-
-// Case (b) -- Export Aspect
-// The class as a whole is not exported, but all non-inline virtual methods of
-// the class are, so the vtable and typeinfo symbol must be exported.
-//
-// PS4-DAG: @_ZTV9FooExport ={{.*}}dllexport
-// WI-DAG: @_ZTV9FooExport = dso_local unnamed_addr constant {
-// PS4-DAG: @_ZTI9FooExport ={{.*}}dllexport
-// WI-DAG: @_ZTI9FooExport = dso_local constant {
-struct FooExport
-{
- virtual void getId() const {}
- __declspec(dllexport) virtual void Bump();
- __declspec(dllexport) virtual void Decrement();
-};
-
-// This is the key function of the class 'FooExport', so the vtable and
-// typeinfo symbol of 'FooExport' will be defined in this TU, and so they must
-// be exported from this TU.
-void FooExport::Bump()
-{
- FooImport f;
- typeid(FooExport).name();
-}
-
-///////////////////////////////////////////////////////////////////
-
-// The tests below verify that the associated vtable and typeinfo symbols are
-// not imported/exported. These are the converse of case (b).
-//
-// Note that ultimately, if the module doing the importing calls a constructor
-// of the class with the vtable, or makes a reference to the typeinfo symbol of
-// the class, then this will result in an unresolved reference (to the vtable
-// or typeinfo symbol) when linking the importing module, and thus a link-time
-// failure.
-//
-// Note that with the Microsoft toolchain there will also be a link-time
-// failure when linking the module doing the importing. With the Microsoft
-// toolchain, it will be an unresolved reference to the method 'Decrement()'
-// of the approriate class, rather than to the vtable or typeinfo symbol of
-// the class, because Microsoft defines the vtable and typeinfo symbol (weakly)
-// everywhere they are used.
-
-// Converse of case (b) -- Import Aspect
-// The class as a whole is not imported, and not all non-inline virtual methods
-// are imported, so the vtable and typeinfo symbol are not to be imported.
-//
-// CHECK-PS4: @_ZTV11FooNoImport = external dso_local unnamed_addr constant {
-// CHECK-WI: @_ZTV11FooNoImport = linkonce_odr dso_local unnamed_addr constant {
-// CHECK-PS4: @_ZTI11FooNoImport = external dso_local constant ptr{{$}}
-// CHECK-WI: @_ZTI11FooNoImport = linkonce_odr dso_local constant {
-struct FooNoImport
-{
- virtual void getId() const {}
- __declspec(dllimport) virtual void Bump();
- virtual void Decrement(); // Not imported.
- int mCounter;
-};
-
-void importNegativeTest()
-{
- FooNoImport f;
- typeid(FooNoImport).name();
-}
-
-///////////////////////////////////////////////////////////////////
-
-// Converse of case (b) -- Export Aspect
-// The class as a whole is not exported, and not all non-inline virtual methods
-// are exported, so the vtable and typeinfo symbol are not to be exported.
-//
-// SCEI_PS4-DAG: @_ZTV11FooNoImport = external unnamed_addr constant {
-// SCEI_WI-DAG: @_ZTV11FooNoExport = dso_local unnamed_addr constant {
-
-// WI-DAG: @_ZTV11FooNoExport = dso_local unnamed_addr constant {
-// SCEI_PS4-DAG: @_ZTI11FooNoExport = constant {
-// SCEI_WI-DAG: @_ZTI11FooNoExport = dso_local constant {
-// WI-DAG: @_ZTI11FooNoExport = dso_local constant {
-struct FooNoExport
-{
- virtual void getId() const {}
- __declspec(dllexport) virtual void Bump();
- virtual void Decrement(); // Not exported.
- int mCounter;
-};
-
-void FooNoExport::Bump()
-{
- typeid(FooNoExport).name();
-}
diff --git a/clang/test/CodeGenCXX/weak-external.cpp b/clang/test/CodeGenCXX/weak-external.cpp
index 5eb262cd..e30d4de 100644
--- a/clang/test/CodeGenCXX/weak-external.cpp
+++ b/clang/test/CodeGenCXX/weak-external.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -triple %itanium_abi_triple %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -triple x86_64-unknown-linux-gnu %s -emit-llvm -o - | FileCheck %s
// PR4262
// CHECK-NOT: _ZNSs12_S_constructIPKcEEPcT_S3_RKSaIcESt20forward_iterator_tag
diff --git a/clang/test/CodeGenCoroutines/coro-aligned-alloc-2.cpp b/clang/test/CodeGenCoroutines/coro-aligned-alloc-2.cpp
index 21c2e45..bfa124b 100644
--- a/clang/test/CodeGenCoroutines/coro-aligned-alloc-2.cpp
+++ b/clang/test/CodeGenCoroutines/coro-aligned-alloc-2.cpp
@@ -1,9 +1,7 @@
// Tests that the combination of -fcoro-aligned-allocation and -fsized-deallocation works well.
// Test the compiler will choose sized deallocation correctly.
-// This is only enabled with `-fsized-deallocation` which is off by default.
// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 \
// RUN: -fcoro-aligned-allocation -emit-llvm %s -o - -disable-llvm-passes \
-// RUN: -fsized-deallocation \
// RUN: | FileCheck %s
#include "Inputs/coroutine.h"
diff --git a/clang/test/CodeGenCoroutines/coro-aligned-alloc.cpp b/clang/test/CodeGenCoroutines/coro-aligned-alloc.cpp
index 8019926..156fa64 100644
--- a/clang/test/CodeGenCoroutines/coro-aligned-alloc.cpp
+++ b/clang/test/CodeGenCoroutines/coro-aligned-alloc.cpp
@@ -26,8 +26,9 @@ struct task {
// CHECK: %[[aligned_new:.+]] = call{{.*}}@_ZnwmSt11align_val_t({{.*}}%[[coro_size]],{{.*}}%[[coro_align]])
// CHECK: coro.free:
+// CHECK: %[[coro_size_for_free:.+]] = call{{.*}}@llvm.coro.size
// CHECK: %[[coro_align_for_free:.+]] = call{{.*}}@llvm.coro.align
-// CHECK: call void @_ZdlPvSt11align_val_t({{.*}}[[coro_align_for_free]]
+// CHECK: call void @_ZdlPvmSt11align_val_t({{.*}}%[[coro_size_for_free]],{{.*}}%[[coro_align_for_free]])
task f() {
co_return 43;
@@ -58,8 +59,9 @@ void *operator new(std::size_t, std::align_val_t, std::nothrow_t) noexcept;
// CHECK: %[[aligned_new:.+]] = call{{.*}}@_ZnwmSt11align_val_tSt9nothrow_t({{.*}}%[[coro_size]],{{.*}}%[[coro_align]])
// CHECK: coro.free:
+// CHECK: %[[coro_size_for_free:.+]] = call{{.*}}@llvm.coro.size
// CHECK: %[[coro_align_for_free:.+]] = call{{.*}}@llvm.coro.align
-// CHECK: call void @_ZdlPvSt11align_val_t({{.*}}[[coro_align_for_free]]
+// CHECK: call void @_ZdlPvmSt11align_val_t({{.*}}%[[coro_size_for_free]],{{.*}}%[[coro_align_for_free]])
task2 f2() {
co_return 43;
diff --git a/clang/test/CodeGenCoroutines/coro-alloc.cpp b/clang/test/CodeGenCoroutines/coro-alloc.cpp
index d026a0d..7b3be7e 100644
--- a/clang/test/CodeGenCoroutines/coro-alloc.cpp
+++ b/clang/test/CodeGenCoroutines/coro-alloc.cpp
@@ -70,7 +70,8 @@ extern "C" void f0(global_new_delete_tag) {
// CHECK: br i1 %[[NeedDealloc]], label %[[FreeBB:.+]], label %[[Afterwards:.+]]
// CHECK: [[FreeBB]]:
- // CHECK: call void @_ZdlPv(ptr noundef %[[MEM]])
+ // CHECK: %[[SIZE:.+]] = call i64 @llvm.coro.size.i64()
+ // CHECK: call void @_ZdlPvm(ptr noundef %[[MEM]], i64 noundef %[[SIZE]])
// CHECK: br label %[[Afterwards]]
// CHECK: [[Afterwards]]:
@@ -99,7 +100,8 @@ extern "C" void f1(promise_new_tag ) {
// CHECK: %[[FRAME:.+]] = call ptr @llvm.coro.begin(
// CHECK: %[[MEM:.+]] = call ptr @llvm.coro.free(token %[[ID]], ptr %[[FRAME]])
- // CHECK: call void @_ZdlPv(ptr noundef %[[MEM]])
+ // CHECK: %[[SIZE:.+]] = call i64 @llvm.coro.size.i64()
+ // CHECK: call void @_ZdlPvm(ptr noundef %[[MEM]], i64 noundef %[[SIZE]])
co_return;
}
diff --git a/clang/test/CodeGenCoroutines/coro-cleanup.cpp b/clang/test/CodeGenCoroutines/coro-cleanup.cpp
index 98f1507..4e77ac2 100644
--- a/clang/test/CodeGenCoroutines/coro-cleanup.cpp
+++ b/clang/test/CodeGenCoroutines/coro-cleanup.cpp
@@ -84,11 +84,13 @@ void f() {
// CHECK: [[Cleanup]]:
// CHECK: call void @_ZNSt16coroutine_traitsIJvEE12promise_typeD1Ev(
// CHECK: %[[Mem0:.+]] = call ptr @llvm.coro.free(
- // CHECK: call void @_ZdlPv(ptr noundef %[[Mem0]]
+ // CHECK: %[[SIZE:.+]] = call i64 @llvm.coro.size.i64()
+ // CHECK: call void @_ZdlPvm(ptr noundef %[[Mem0]], i64 noundef %[[SIZE]])
// CHECK: [[Dealloc]]:
// THROWEND: %[[Mem:.+]] = call ptr @llvm.coro.free(
- // THROWEND: call void @_ZdlPv(ptr noundef %[[Mem]])
+ // THROWEND: %[[SIZE:.+]] = call i64 @llvm.coro.size.i64()
+ // THROWEND: call void @_ZdlPvm(ptr noundef %[[Mem]], i64 noundef %[[SIZE]])
co_return;
}
diff --git a/clang/test/CodeGenCoroutines/coro-dealloc.cpp b/clang/test/CodeGenCoroutines/coro-dealloc.cpp
index 3cdba6c..5a699ac 100644
--- a/clang/test/CodeGenCoroutines/coro-dealloc.cpp
+++ b/clang/test/CodeGenCoroutines/coro-dealloc.cpp
@@ -1,6 +1,5 @@
// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 \
// RUN: -emit-llvm %s -o - -disable-llvm-passes \
-// RUN: -fsized-deallocation \
// RUN: | FileCheck %s
#include "Inputs/coroutine.h"
@@ -21,7 +20,6 @@ struct task {
};
// Test that the compiler will choose sized deallocation correctly.
-// This is only enabled with `-fsized-deallocation` which is off by default.
void operator delete(void *ptr, std::size_t size) noexcept;
// CHECK: define{{.*}}@_Z1fv
diff --git a/clang/test/CodeGenCoroutines/coro-gro.cpp b/clang/test/CodeGenCoroutines/coro-gro.cpp
index d4c3ff5..b621343 100644
--- a/clang/test/CodeGenCoroutines/coro-gro.cpp
+++ b/clang/test/CodeGenCoroutines/coro-gro.cpp
@@ -51,7 +51,8 @@ int f() {
// CHECK: call void @_ZNSt16coroutine_traitsIiJEE12promise_typeD1Ev(
// CHECK: %[[Mem:.+]] = call ptr @llvm.coro.free(
- // CHECK: call void @_ZdlPv(ptr noundef %[[Mem]])
+ // CHECK: %[[SIZE:.+]] = call i64 @llvm.coro.size.i64()
+ // CHECK: call void @_ZdlPvm(ptr noundef %[[Mem]], i64 noundef %[[SIZE]])
// Initialize retval from Gro and destroy Gro
// Note this also tests delaying initialization when Gro and function return
diff --git a/clang/test/CodeGenCoroutines/pr56919.cpp b/clang/test/CodeGenCoroutines/pr56919.cpp
index c7de08e..baa8c27 100644
--- a/clang/test/CodeGenCoroutines/pr56919.cpp
+++ b/clang/test/CodeGenCoroutines/pr56919.cpp
@@ -111,12 +111,15 @@ Task<void> Bar() { co_await Baz(); }
// CHECK: _Z3Quxv.destroy:{{.*}}
// CHECK-NEXT: #
-// CHECK-NEXT: jmp _ZdlPv
+// CHECK-NEXT: movl $40, %esi
+// CHECK-NEXT: jmp _ZdlPvm@PLT
// CHECK: _Z3Bazv.destroy:{{.*}}
// CHECK-NEXT: #
-// CHECK-NEXT: jmp _ZdlPv
+// CHECK-NEXT: movl $80, %esi
+// CHECK-NEXT: jmp _ZdlPvm
// CHECK: _Z3Barv.destroy:{{.*}}
// CHECK-NEXT: #
-// CHECK-NEXT: jmp _ZdlPv
+// CHECK-NEXT: movl $120, %esi
+// CHECK-NEXT: jmp _ZdlPvm
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl
new file mode 100644
index 0000000..fc5649d
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl
@@ -0,0 +1,52 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx940 -emit-llvm -o - %s | FileCheck %s
+// REQUIRES: amdgpu-registered-target
+
+typedef unsigned int u32;
+typedef unsigned short u16;
+typedef unsigned char u8;
+
+// CHECK-LABEL: @test_global_load_lds_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 4, i32 0, i32 0)
+// CHECK-NEXT: ret void
+//
+void test_global_load_lds_u32(global u32* src, local u32 *dst) {
+ __builtin_amdgcn_global_load_lds(src, dst, /*size=*/4, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_global_load_lds_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 2, i32 0, i32 0)
+// CHECK-NEXT: ret void
+//
+void test_global_load_lds_u16(global u16* src, local u16 *dst) {
+ __builtin_amdgcn_global_load_lds(src, dst, /*size=*/2, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_global_load_lds_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 1, i32 0, i32 0)
+// CHECK-NEXT: ret void
+//
+void test_global_load_lds_u8(global u8* src, local u8 *dst) {
+ __builtin_amdgcn_global_load_lds(src, dst, /*size=*/1, /*offset=*/0, /*aux=*/0);
+}
diff --git a/clang/test/CoverageMapping/builtinmacro.c b/clang/test/CoverageMapping/builtinmacro.c
index abcdc19..5d5a176 100644
--- a/clang/test/CoverageMapping/builtinmacro.c
+++ b/clang/test/CoverageMapping/builtinmacro.c
@@ -4,7 +4,7 @@
// CHECK: filename
const char *filename (const char *name) { // CHECK-NEXT: File 0, [[@LINE]]:41 -> [[@LINE+3]]:2 = #0
- static const char this_file[] = __FILE__;
+ static const char this_file[] = __FILE__; // CHECK-NEXT: File 0, [[@LINE]]:35 -> [[@LINE]]:35 = #0
return this_file;
}
diff --git a/clang/test/CoverageMapping/macros.c b/clang/test/CoverageMapping/macros.c
index 6bd3be4..fcf2117 100644
--- a/clang/test/CoverageMapping/macros.c
+++ b/clang/test/CoverageMapping/macros.c
@@ -80,12 +80,14 @@ void func7(void) { // CHECK-NEXT: File 0, [[@LINE]]:18 -> [[@LINE+6]]:2 = #0
int kk,ll; // CHECK-NEXT: File 0, [[@LINE+1]]:7 -> [[@LINE+1]]:8 = #0
if (k) // CHECK-NEXT: Branch,File 0, [[@LINE]]:7 -> [[@LINE]]:8 = #1
m(k); // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:9 -> [[@LINE]]:5 = #1
- else // CHECK-NEXT: Expansion,File 0, [[@LINE-1]]:5 -> [[@LINE-1]]:6 = #0
+ else // CHECK-NEXT: Expansion,File 0, [[@LINE-1]]:5 -> [[@LINE-1]]:6 = #1
l = m(l); // CHECK-NEXT: Gap,File 0, [[@LINE-2]]:7 -> [[@LINE]]:5 = (#0 - #1)
} // CHECK-NEXT: File 0, [[@LINE-1]]:5 -> [[@LINE-1]]:10 = (#0 - #1)
// CHECK-NEXT: Expansion,File 0, [[@LINE-2]]:9 -> [[@LINE-2]]:10 = (#0 - #1)
- // CHECK-NEXT: File 1, [[@LINE-9]]:14 -> [[@LINE-9]]:18 = #0
- // CHECK-NEXT: File 2, [[@LINE-10]]:14 -> [[@LINE-10]]:15 = (#0 - #1)
+ // CHECK-NEXT: File 1, [[@LINE-9]]:14 -> [[@LINE-9]]:17 = #1
+ // CHECK-NEXT: File 1, [[@LINE-10]]:14 -> [[@LINE-10]]:18 = #0
+ // CHECK-NEXT: File 2, [[@LINE-11]]:14 -> [[@LINE-11]]:17 = (#0 - #1)
+ // CHECK-NEXT: File 2, [[@LINE-12]]:14 -> [[@LINE-12]]:15 = (#0 - #1)
int main(int argc, const char *argv[]) {
func();
diff --git a/clang/test/CoverageMapping/mcdc-scratch-space.c b/clang/test/CoverageMapping/mcdc-scratch-space.c
new file mode 100644
index 0000000..2b5b12d
--- /dev/null
+++ b/clang/test/CoverageMapping/mcdc-scratch-space.c
@@ -0,0 +1,65 @@
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c99 -fcoverage-mcdc -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s | FileCheck %s
+
+// CHECK: builtin_macro0:
+int builtin_macro0(int a) {
+ // CHECK: Decision,File 0, [[@LINE+1]]:11 -> [[@LINE+2]]:15 = M:0, C:2
+ return (__LINE__ // CHECK: Branch,File 0, [[@LINE]]:11 -> [[@LINE]]:11 = 0, 0 [1,2,0]
+ && a); // CHECK: Branch,File 0, [[@LINE]]:14 -> [[@LINE]]:15 = #2, (#1 - #2) [2,0,0]
+}
+
+// CHECK: builtin_macro1:
+int builtin_macro1(int a) {
+ // CHECK: Decision,File 0, [[@LINE+1]]:11 -> [[@LINE+2]]:22 = M:0, C:2
+ return (a // CHECK: Branch,File 0, [[@LINE]]:11 -> [[@LINE]]:12 = (#0 - #1), #1 [1,0,2]
+ || __LINE__); // CHECK: Branch,File 0, [[@LINE]]:14 -> [[@LINE]]:14 = 0, 0 [2,0,0]
+}
+
+#define PRE(x) pre_##x
+
+// CHECK: pre0:
+int pre0(int pre_a, int b_post) {
+ // CHECK: Decision,File 0, [[@LINE+2]]:11 -> [[@LINE+3]]:20 = M:0, C:2
+ // CHECK: Expansion,File 0, [[@LINE+1]]:11 -> [[@LINE+1]]:14 = #0 (Expanded file = 1)
+ return (PRE(a)
+ && b_post);
+ // CHECK: Branch,File 0, [[@LINE-1]]:14 -> [[@LINE-1]]:20 = #2, (#1 - #2) [2,0,0]
+ // CHECK: Branch,File 1, [[@LINE-9]]:16 -> [[@LINE-9]]:22 = #1, (#0 - #1) [1,2,0]
+}
+
+#define pre_foo pre_a
+
+// CHECK: pre1:
+int pre1(int pre_a, int b_post) {
+ // CHECK: Decision,File 0, [[@LINE+3]]:11 -> [[@LINE+4]]:20 = M:0, C:2
+ // CHECK: Expansion,File 0, [[@LINE+2]]:11 -> [[@LINE+2]]:14 = #0 (Expanded file = 1)
+ // CHECK: Branch,File 0, [[@LINE+2]]:14 -> [[@LINE+2]]:20 = #2, (#1 - #2) [2,0,0]
+ return (PRE(foo)
+ && b_post);
+ // CHECK: Expansion,File 1, 17:16 -> 17:20 = #0 (Expanded file = 2)
+ // CHECK: Branch,File 2, 29:17 -> 29:22 = #1, (#0 - #1) [1,2,0]
+}
+
+#define POST(x) x##_post
+
+// CHECK: post0:
+int post0(int pre_a, int b_post) {
+ // CHECK: Decision,File 0, [[@LINE+2]]:11 -> [[@LINE+3]]:18 = M:0, C:2
+ // CHECK: Branch,File 0, [[@LINE+1]]:11 -> [[@LINE+1]]:16 = (#0 - #1), #1 [1,0,2]
+ return (pre_a
+ || POST(b));
+ // CHECK: Expansion,File 0, [[@LINE-1]]:14 -> [[@LINE-1]]:18 = #1 (Expanded file = 1)
+ // CHECK: Branch,File 1, [[@LINE-9]]:17 -> [[@LINE-9]]:20 = (#1 - #2), #2 [2,0,0]
+}
+
+#define bar_post b_post
+
+// CHECK: post1:
+int post1(int pre_a, int b_post) {
+ // CHECK: Decision,File 0, [[@LINE+3]]:11 -> [[@LINE+4]]:18 = M:0, C:2
+ // CHECK: Branch,File 0, [[@LINE+2]]:11 -> [[@LINE+2]]:16 = (#0 - #1), #1 [1,0,2]
+ // CHECK: Expansion,File 0, [[@LINE+2]]:14 -> [[@LINE+2]]:18 = 0 (Expanded file = 1)
+ return (pre_a
+ || POST(bar));
+ // CHECK: Expansion,File 1, 42:17 -> 42:18 = #1 (Expanded file = 2)
+ // CHECK: Branch,File 2, 54:18 -> 54:24 = (#1 - #2), #2 [2,0,0]
+}
diff --git a/clang/test/CoverageMapping/templates.cpp b/clang/test/CoverageMapping/templates.cpp
index 143e566..7e7f220 100644
--- a/clang/test/CoverageMapping/templates.cpp
+++ b/clang/test/CoverageMapping/templates.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -mllvm -emptyline-comment-coverage=false -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name templates.cpp %s | FileCheck %s
+// RUN: %clang_cc1 -std=c++20 -mllvm -emptyline-comment-coverage=false -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name templates.cpp %s | FileCheck %s
template<typename T>
void unused(T x) {
@@ -30,5 +30,6 @@ namespace structural_value_crash {
void test() {
tpl_fn<arr>();
+ tpl_fn<&arr[1]>();
}
}
diff --git a/clang/test/Driver/Ofast.c b/clang/test/Driver/Ofast.c
index 1f9fc78..8b7f221 100644
--- a/clang/test/Driver/Ofast.c
+++ b/clang/test/Driver/Ofast.c
@@ -3,7 +3,9 @@
// RUN: %clang -fno-fast-math -Ofast -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST %s
// RUN: %clang -fno-strict-aliasing -Ofast -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST %s
// RUN: %clang -fno-vectorize -Ofast -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST %s
-// RUN: %clang -Ofast -O2 -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST-O2 %s
+// RUN: %clang -Ofast -O2 -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST-O2 \
+// RUN: %if target={{.*-windows-msvc.*}} %{ --check-prefix=CHECK-OFAST-O2-ALIASING-MSVC %} \
+// RUN: %else %{ --check-prefix=CHECK-OFAST-O2-ALIASING %} %s
// RUN: %clang -Ofast -fno-fast-math -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST-NO-FAST-MATH %s
// RUN: %clang -Ofast -fno-strict-aliasing -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST-NO-STRICT-ALIASING %s
// RUN: %clang -Ofast -fno-vectorize -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST-NO-VECTORIZE %s
@@ -15,7 +17,8 @@
// CHECK-OFAST: -vectorize-loops
// CHECK-OFAST-O2: -cc1
-// CHECK-OFAST-O2-NOT: -relaxed-aliasing
+// CHECK-OFAST-O2-ALIASING-NOT: -relaxed-aliasing
+// CHECK-OFAST-O2-ALIASING-MSVC: -relaxed-aliasing
// CHECK-OFAST-O2-NOT: -ffast-math
// CHECK-OFAST-O2-NOT: -Ofast
// CHECK-OFAST-O2: -vectorize-loops
diff --git a/clang/test/Driver/aarch64-v95a.c b/clang/test/Driver/aarch64-v95a.c
index 1037da6..62878f2 100644
--- a/clang/test/Driver/aarch64-v95a.c
+++ b/clang/test/Driver/aarch64-v95a.c
@@ -6,7 +6,7 @@
// RUN: %clang -target aarch64 -mlittle-endian -march=armv9.5-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A %s
// RUN: %clang -target aarch64_be -mlittle-endian -march=armv9.5a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A %s
// RUN: %clang -target aarch64_be -mlittle-endian -march=armv9.5-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A %s
-// GENERICV95A: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.5a"
+// GENERICV95A: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.5a"{{.*}} "-target-feature" "+cpa"{{.*}} "-target-feature" "+faminmax"{{.*}} "-target-feature" "+lut"
// RUN: %clang -target aarch64_be -march=armv9.5a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A-BE %s
// RUN: %clang -target aarch64_be -march=armv9.5-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A-BE %s
@@ -14,14 +14,10 @@
// RUN: %clang -target aarch64 -mbig-endian -march=armv9.5-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A-BE %s
// RUN: %clang -target aarch64_be -mbig-endian -march=armv9.5a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A-BE %s
// RUN: %clang -target aarch64_be -mbig-endian -march=armv9.5-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A-BE %s
-// GENERICV95A-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.5a"
+// GENERICV95A-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.5a"{{.*}} "-target-feature" "+cpa"{{.*}} "-target-feature" "+faminmax"{{.*}} "-target-feature" "+lut"
// ===== Features supported on aarch64 =====
-// RUN: %clang -target aarch64 -march=armv9.5a+cpa -### -c %s 2>&1 | FileCheck -check-prefix=V95A-CPA %s
-// RUN: %clang -target aarch64 -march=armv9.5-a+cpa -### -c %s 2>&1 | FileCheck -check-prefix=V95A-CPA %s
-// V95A-CPA: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.5a"{{.*}} "-target-feature" "+cpa"
-
// RUN: %clang -target aarch64 -march=armv9.5a+pauth-lr -### -c %s 2>&1 | FileCheck -check-prefix=V95A-PAUTHLR %s
// RUN: %clang -target aarch64 -march=armv9.5-a+pauth-lr -### -c %s 2>&1 | FileCheck -check-prefix=V95A-PAUTHLR %s
// V95A-PAUTHLR: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.5a"{{.*}} "-target-feature" "+pauth-lr"
diff --git a/clang/test/Driver/android-unversioned-fallback-warning.cpp b/clang/test/Driver/android-unversioned-fallback-warning.cpp
index 62a951d..da666cc 100644
--- a/clang/test/Driver/android-unversioned-fallback-warning.cpp
+++ b/clang/test/Driver/android-unversioned-fallback-warning.cpp
@@ -14,14 +14,14 @@
// RUN: %clang --target=aarch64-none-linux-android -ccc-install-dir %t/bin \
// RUN: -resource-dir %t/resource -### -c %s 2>&1 | \
// RUN: FileCheck --check-prefix=NO-WARNING %s
-// NO-WARNING-NOT: Using unversioned Android target directory
+// NO-WARNING-NOT: using unversioned Android target directory
// RUN: %clang --target=aarch64-none-linux-android21 -ccc-install-dir %t/bin \
// RUN: -resource-dir %t/resource -### -c %s 2>&1 | \
// RUN: FileCheck --check-prefix=ANDROID21 -DDIR=%t -DSEP=%{fs-sep} %s
-// ANDROID21-DAG: Using unversioned Android target directory [[DIR]]/bin[[SEP]]..[[SEP]]include[[SEP]]aarch64-none-linux-android
-// ANDROID21-DAG: Using unversioned Android target directory [[DIR]]/bin[[SEP]]..[[SEP]]lib[[SEP]]aarch64-none-linux-android
-// ANDROID21-DAG: Using unversioned Android target directory [[DIR]]/resource[[SEP]]lib[[SEP]]aarch64-none-linux-android
+// ANDROID21-DAG: using unversioned Android target directory [[DIR]]/bin[[SEP]]..[[SEP]]include[[SEP]]aarch64-none-linux-android
+// ANDROID21-DAG: using unversioned Android target directory [[DIR]]/bin[[SEP]]..[[SEP]]lib[[SEP]]aarch64-none-linux-android
+// ANDROID21-DAG: using unversioned Android target directory [[DIR]]/resource[[SEP]]lib[[SEP]]aarch64-none-linux-android
// 23 or newer should use the versioned directory
// RUN: %clang --target=aarch64-none-linux-android23 -ccc-install-dir %t/bin \
diff --git a/clang/test/Driver/cl-options.c b/clang/test/Driver/cl-options.c
index 75f49de..733f243 100644
--- a/clang/test/Driver/cl-options.c
+++ b/clang/test/Driver/cl-options.c
@@ -740,9 +740,10 @@
// NOCLANG-SAME: "-vectorize-slp"
// NOCLANG-NOT: "--dependent-lib=msvcrt"
-// RUN: %clang_cl -O2 -MD /clang:-fno-slp-vectorize /clang:-MD /clang:-MF /clang:my_dependency_file.dep -### -- %s 2>&1 | FileCheck -check-prefix=CLANG %s
+// RUN: %clang_cl -O2 -MD /clang:-fno-slp-vectorize /clang:-MD /clang:-MF /clang:my_dependency_file.dep /c /Fo%/t/cl-options.obj -### -- %s 2>&1 | FileCheck -DPREFIX=%/t -check-prefix=CLANG %s
// CLANG: "--dependent-lib=msvcrt"
// CLANG-SAME: "-dependency-file" "my_dependency_file.dep"
+// CLANG-SAME: "-MT" "[[PREFIX]]/cl-options.obj"
// CLANG-NOT: "--dependent-lib=libcmt"
// CLANG-NOT: "-vectorize-slp"
diff --git a/clang/test/Driver/cl-x86-flags.c b/clang/test/Driver/cl-x86-flags.c
index 716b02f..51b16f0 100644
--- a/clang/test/Driver/cl-x86-flags.c
+++ b/clang/test/Driver/cl-x86-flags.c
@@ -69,10 +69,7 @@
// RUN: %clang_cl -m32 -arch:avx2 --target=i386-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=avx2 %s
// avx2: invalid /arch: argument
-// RUN: %clang_cl -m32 -arch:AVX512F --target=i386-pc-windows /c /Fo%t.obj -Xclang -verify=KNL1 -DTEST_32_ARCH_AVX512F -- %s
-// KNL1-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}}
-// KNL1-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}}
-// KNL1-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}}
+// RUN: %clang_cl -m32 -arch:AVX512F --target=i386-pc-windows /c /Fo%t.obj -Xclang -verify -DTEST_32_ARCH_AVX512F -- %s
#if defined(TEST_32_ARCH_AVX512F)
#if _M_IX86_FP != 2 || !__AVX__ || !__AVX2__ || !__AVX512F__ || __AVX512BW__
#error fail
@@ -112,10 +109,7 @@
// RUN: %clang_cl -m64 -arch:avx2 --target=x86_64-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=avx264 %s
// avx264: invalid /arch: argument
-// RUN: %clang_cl -m64 -arch:AVX512F --target=i386-pc-windows /c /Fo%t.obj -Xclang -verify=KNL2 -DTEST_64_ARCH_AVX512F -- %s
-// KNL2-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}}
-// KNL2-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}}
-// KNL2-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}}
+// RUN: %clang_cl -m64 -arch:AVX512F --target=i386-pc-windows /c /Fo%t.obj -Xclang -verify -DTEST_64_ARCH_AVX512F -- %s
#if defined(TEST_64_ARCH_AVX512F)
#if _M_IX86_FP || !__AVX__ || !__AVX2__ || !__AVX512F__ || __AVX512BW__
#error fail
diff --git a/clang/test/Driver/clang_f_opts.c b/clang/test/Driver/clang_f_opts.c
index 472d072..d69cd19 100644
--- a/clang/test/Driver/clang_f_opts.c
+++ b/clang/test/Driver/clang_f_opts.c
@@ -623,3 +623,9 @@
// RUN: %clang -### --target=aarch64-windows-msvc -fno-ms-volatile %s 2>&1 | FileCheck -check-prefix=CHECK-NO-MS-VOLATILE %s
// CHECK-MS-VOLATILE: -fms-volatile
// CHECK-NO-MS-VOLATILE-NOT: -fms-volatile
+
+// RUN: %clang -### --target=x86_64-pc-windows-msvc %s 2>&1 | FileCheck -check-prefix=CHECK-NO-STRICT-ALIASING %s
+// RUN: %clang -### --target=x86_64-pc-windows-msvc -fstrict-aliasing %s 2>&1 | FileCheck -check-prefix=CHECK-STRICT-ALIASING %s
+// RUN: %clang -### --target=x86_64-pc-windows-msvc -fno-strict-aliasing %s 2>&1 | FileCheck -check-prefix=CHECK-NO-STRICT-ALIASING %s
+// CHECK-STRICT-ALIASING-NOT: -relaxed-aliasing
+// CHECK-NO-STRICT-ALIASING: -relaxed-aliasing
diff --git a/clang/test/Driver/cuda-cross-compiling.c b/clang/test/Driver/cuda-cross-compiling.c
index a1719a6..203bc06 100644
--- a/clang/test/Driver/cuda-cross-compiling.c
+++ b/clang/test/Driver/cuda-cross-compiling.c
@@ -83,8 +83,8 @@
// RUN: not %clang -target nvptx64-nvidia-cuda -march=generic %s -### 2>&1 \
// RUN: | FileCheck -check-prefix=MISSING %s
-// MISSING: error: Must pass in an explicit nvptx64 gpu architecture to 'ptxas'
-// MISSING: error: Must pass in an explicit nvptx64 gpu architecture to 'nvlink'
+// MISSING: error: must pass in an explicit nvptx64 gpu architecture to 'ptxas'
+// MISSING: error: must pass in an explicit nvptx64 gpu architecture to 'nvlink'
// RUN: %clang -target nvptx64-nvidia-cuda -flto -c %s -### 2>&1 \
// RUN: | FileCheck -check-prefix=GENERIC %s
diff --git a/clang/test/Driver/dxc_dxv_path.hlsl b/clang/test/Driver/dxc_dxv_path.hlsl
index 4845de1..db2c870 100644
--- a/clang/test/Driver/dxc_dxv_path.hlsl
+++ b/clang/test/Driver/dxc_dxv_path.hlsl
@@ -1,7 +1,7 @@
// RUN: %clang_dxc -I test -Tlib_6_3 -### %s 2>&1 | FileCheck %s
// Make sure a warning is reported.
-// CHECK:dxv not found.
+// CHECK:dxv not found
// RUN: echo "dxv" > %T/dxv && chmod 754 %T/dxv && %clang_dxc --dxv-path=%T %s -Tlib_6_3 -### 2>&1 | FileCheck %s --check-prefix=DXV_PATH
// DXV_PATH:dxv{{(.exe)?}}" "-" "-o" "-"
diff --git a/clang/test/Driver/fast-math.c b/clang/test/Driver/fast-math.c
index 274f1f2..ffd0819 100644
--- a/clang/test/Driver/fast-math.c
+++ b/clang/test/Driver/fast-math.c
@@ -67,31 +67,31 @@
// RUN: | FileCheck --check-prefixes=CHECK,NO-ERRNO %s
//
// Target defaults for -fmath-errno (reusing the above checks).
-// RUN: %clang -### -target i686-unknown-linux -c %s 2>&1 \
+// RUN: %clang -### --target=i686-unknown-linux -c %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK,ERRNO %s
// RUN: %clang -### -target i686-apple-darwin -c %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK,NO-ERRNO %s
-// RUN: %clang -### -target x86_64-unknown-freebsd -c %s 2>&1 \
+// RUN: %clang -### --target=x86_64-unknown-freebsd -c %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK,NO-ERRNO %s
-// RUN: %clang -### -target x86_64-unknown-netbsd -c %s 2>&1 \
+// RUN: %clang -### --target=x86_64-unknown-netbsd -c %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK,NO-ERRNO %s
-// RUN: %clang -### -target x86_64-unknown-openbsd -c %s 2>&1 \
+// RUN: %clang -### --target=x86_64-unknown-openbsd -c %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK,NO-ERRNO %s
// RUN: %clang -### --target=x86_64-unknown-haiku -c %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK,NO-ERRNO %s
-// RUN: %clang -### -target x86_64-unknown-dragonfly -c %s 2>&1 \
+// RUN: %clang -### --target=x86_64-unknown-dragonfly -c %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK,NO-ERRNO %s
-// RUN: %clang -### -target x86_64-fuchsia -c %s 2>&1 \
+// RUN: %clang -### --target=x86_64-fuchsia -c %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK,NO-ERRNO %s
-// RUN: %clang -### -target x86_64-linux-android -c %s 2>&1 \
+// RUN: %clang -### --target=x86_64-linux-android -c %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK,NO-ERRNO %s
-// RUN: %clang -### -target x86_64-linux-musl -c %s 2>&1 \
+// RUN: %clang -### --target=x86_64-linux-musl -c %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK,NO-ERRNO %s
// RUN: %clang -### --target=amdgcn-amd-amdhsa -nogpuinc -nogpulib -c %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK,NO-ERRNO %s
-// RUN: %clang -### -target amdgcn-amd-amdpal -c %s 2>&1 \
+// RUN: %clang -### --target=amdgcn-amd-amdpal -c %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK,NO-ERRNO %s
-// RUN: %clang -### -target amdgcn-mesa-mesa3d -c %s 2>&1 \
+// RUN: %clang -### --target=amdgcn-mesa-mesa3d -c %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK,NO-ERRNO %s
//
// Check that -ffast-math disables -fmath-errno, and -fno-fast-math merely
@@ -103,9 +103,9 @@
// RUN: | FileCheck --check-prefixes=CHECK,NO-ERRNO %s
// RUN: %clang -### -ffast-math -fmath-errno -c %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK,ERRNO %s
-// RUN: %clang -### -target i686-unknown-linux -fno-fast-math -c %s 2>&1 \
+// RUN: %clang -### --target=i686-unknown-linux -fno-fast-math -c %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK,ERRNO %s
-// RUN: %clang -### -target i686-unknown-linux -fno-math-errno -fno-fast-math -c %s 2>&1 \
+// RUN: %clang -### --target=i686-unknown-linux -fno-math-errno -fno-fast-math -c %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK,ERRNO %s
// RUN: %clang -### -target i686-apple-darwin -fno-fast-math -c %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK,NO-ERRNO %s
diff --git a/clang/test/Driver/fat-archive-unbundle-ext.c b/clang/test/Driver/fat-archive-unbundle-ext.c
index e98b872..e797acc 100644
--- a/clang/test/Driver/fat-archive-unbundle-ext.c
+++ b/clang/test/Driver/fat-archive-unbundle-ext.c
@@ -2,7 +2,7 @@
// UNSUPPORTED: target={{.*-windows.*}}, target={{.*}}-macosx{{.*}}, target={{.*-darwin.*}}, target={{.*}}-aix{{.*}}
// Generate dummy fat object
-// RUN: %clang -O0 -target %itanium_abi_triple %s -c -o %t.host.o
+// RUN: %clang -O0 --target=%itanium_abi_triple %s -c -o %t.host.o
// RUN: echo 'Content of device file' > %t.tgt.o
// RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-%itanium_abi_triple -input=%t.host.o -input=%t.tgt.o -output=%t.fat.obj
diff --git a/clang/test/Driver/fatal-warnings.c b/clang/test/Driver/fatal-warnings.c
index 6239b25..12c239c 100644
--- a/clang/test/Driver/fatal-warnings.c
+++ b/clang/test/Driver/fatal-warnings.c
@@ -1,5 +1,5 @@
-// RUN: %clang -### %s -c -o tmp.o -target i686-pc-linux-gnu -integrated-as -Wa,--fatal-warnings 2>&1 | FileCheck %s
-// RUN: not %clang %s -c -o %t.o -target i686-pc-linux-gnu -integrated-as -Wa,--fatal-warnings 2>&1 %t.log
+// RUN: %clang -### %s -c -o tmp.o --target=i686-pc-linux-gnu -integrated-as -Wa,--fatal-warnings 2>&1 | FileCheck %s
+// RUN: not %clang %s -c -o %t.o --target=i686-pc-linux-gnu -integrated-as -Wa,--fatal-warnings 2>&1 %t.log
// FileCheck --check-prefix=CHECK-AS %s -input-file %t.log
// CHECK: "-cc1" {{.*}} "-massembler-fatal-warnings"
diff --git a/clang/test/Driver/fbinutils-version.c b/clang/test/Driver/fbinutils-version.c
index 56a49ed..14b44b4 100644
--- a/clang/test/Driver/fbinutils-version.c
+++ b/clang/test/Driver/fbinutils-version.c
@@ -1,29 +1,29 @@
-// RUN: %clang -### -c -target x86_64-linux %s -fbinutils-version=none 2>&1 | FileCheck %s --check-prefix=NONE
+// RUN: %clang -### -c --target=x86_64-linux %s -fbinutils-version=none 2>&1 | FileCheck %s --check-prefix=NONE
// NONE: "-fbinutils-version=none"
-// RUN: %clang -### -c -target aarch64-linux %s -fbinutils-version=2 2>&1 | FileCheck %s --check-prefix=CHECK2
+// RUN: %clang -### -c --target=aarch64-linux %s -fbinutils-version=2 2>&1 | FileCheck %s --check-prefix=CHECK2
// CHECK2: "-fbinutils-version=2"
-// RUN: %clang -### -c -target aarch64-linux %s -fbinutils-version=2.35 2>&1 | FileCheck %s --check-prefix=CHECK2_35
+// RUN: %clang -### -c --target=aarch64-linux %s -fbinutils-version=2.35 2>&1 | FileCheck %s --check-prefix=CHECK2_35
// CHECK2_35: "-fbinutils-version=2.35"
/// Disallow -fbinutils-version=0 because we use $major==0 to indicate the MC
/// default in the backend.
-// RUN: not %clang -c -target x86_64-linux %s -fbinutils-version=0 2>&1 | FileCheck %s --check-prefix=ERR0
+// RUN: not %clang -c --target=x86_64-linux %s -fbinutils-version=0 2>&1 | FileCheck %s --check-prefix=ERR0
// ERR0: error: invalid argument '0' to -fbinutils-version=
-// RUN: not %clang -c -target x86_64-linux %s -fbinutils-version=nan 2>&1 | FileCheck %s --check-prefix=ERR1
+// RUN: not %clang -c --target=x86_64-linux %s -fbinutils-version=nan 2>&1 | FileCheck %s --check-prefix=ERR1
// ERR1: error: invalid argument 'nan' to -fbinutils-version=
-// RUN: not %clang -c -target x86_64-linux %s -fbinutils-version=2. 2>&1 | FileCheck %s --check-prefix=ERR2
+// RUN: not %clang -c --target=x86_64-linux %s -fbinutils-version=2. 2>&1 | FileCheck %s --check-prefix=ERR2
// ERR2: error: invalid argument '2.' to -fbinutils-version=
-// RUN: not %clang -c -target x86_64-linux %s -fbinutils-version=3.-14 2>&1 | FileCheck %s --check-prefix=ERR3
+// RUN: not %clang -c --target=x86_64-linux %s -fbinutils-version=3.-14 2>&1 | FileCheck %s --check-prefix=ERR3
// ERR3: error: invalid argument '3.-14' to -fbinutils-version=
diff --git a/clang/test/Driver/fdirect-access-external-data.c b/clang/test/Driver/fdirect-access-external-data.c
index a6da776..4dfb700 100644
--- a/clang/test/Driver/fdirect-access-external-data.c
+++ b/clang/test/Driver/fdirect-access-external-data.c
@@ -1,13 +1,13 @@
/// -fno-pic code defaults to -fdirect-access-external-data.
-// RUN: %clang -### -c -target x86_64 %s 2>&1 | FileCheck %s --check-prefix=DEFAULT
-// RUN: %clang -### -c -target x86_64 %s -fdirect-access-external-data 2>&1 | FileCheck %s --check-prefix=DEFAULT
-// RUN: %clang -### -c -target x86_64 %s -fdirect-access-external-data -fno-direct-access-external-data 2>&1 | FileCheck %s --check-prefix=INDIRECT
+// RUN: %clang -### -c --target=x86_64 %s 2>&1 | FileCheck %s --check-prefix=DEFAULT
+// RUN: %clang -### -c --target=x86_64 %s -fdirect-access-external-data 2>&1 | FileCheck %s --check-prefix=DEFAULT
+// RUN: %clang -### -c --target=x86_64 %s -fdirect-access-external-data -fno-direct-access-external-data 2>&1 | FileCheck %s --check-prefix=INDIRECT
/// -fpie/-fpic code defaults to -fdirect-access-external-data.
-// RUN: %clang -### -c -target x86_64 %s -fpie 2>&1 | FileCheck %s --check-prefix=DEFAULT
-// RUN: %clang -### -c -target x86_64 %s -fpie -fno-direct-access-external-data -fdirect-access-external-data 2>&1 | FileCheck %s --check-prefix=DIRECT
-// RUN: %clang -### -c -target aarch64 %s -fpic 2>&1 | FileCheck %s --check-prefix=DEFAULT
-// RUN: %clang -### -c -target aarch64 %s -fpic -fdirect-access-external-data 2>&1 | FileCheck %s --check-prefix=DIRECT
+// RUN: %clang -### -c --target=x86_64 %s -fpie 2>&1 | FileCheck %s --check-prefix=DEFAULT
+// RUN: %clang -### -c --target=x86_64 %s -fpie -fno-direct-access-external-data -fdirect-access-external-data 2>&1 | FileCheck %s --check-prefix=DIRECT
+// RUN: %clang -### -c --target=aarch64 %s -fpic 2>&1 | FileCheck %s --check-prefix=DEFAULT
+// RUN: %clang -### -c --target=aarch64 %s -fpic -fdirect-access-external-data 2>&1 | FileCheck %s --check-prefix=DIRECT
/// loongarch* targets default to -fno-direct-access-external-data even for -fno-pic.
// RUN: %clang -### -c --target=loongarch64 -fno-pic %s 2>&1 | FileCheck %s --check-prefix=INDIRECT
diff --git a/clang/test/Driver/fembed-bitcode.c b/clang/test/Driver/fembed-bitcode.c
index 9705005..9081314 100644
--- a/clang/test/Driver/fembed-bitcode.c
+++ b/clang/test/Driver/fembed-bitcode.c
@@ -1,5 +1,5 @@
// RUN: %clang -target x86_64-apple-macosx -fembed-bitcode=all -c %s -o /dev/null -### 2>&1 \
-// RUN: | FileCheck -check-prefix CHECK-X64 %s
+// RUN: | FileCheck --check-prefix=CHECK-X64 %s
// CHECK-X64: "-cc1"
@@ -7,7 +7,7 @@
// CHECK-X64-NOT: "-fdebug-compilation-dir
// RUN: %clang -target armv7-apple-ios -fembed-bitcode=all -c %s -o /dev/null -### 2>&1 \
-// RUN: | FileCheck -check-prefix CHECK-ARM %s
+// RUN: | FileCheck --check-prefix=CHECK-ARM %s
// CHECK-ARM: "-cc1"
@@ -17,7 +17,7 @@
// CHECK-ARM-NOT: "-fdebug-compilation-dir
// RUN: %clang -target arm64-apple-ios -fembed-bitcode=all -c %s -o /dev/null -### 2>&1 \
-// RUN: | FileCheck -check-prefix CHECK-AARCH64 %s
+// RUN: | FileCheck --check-prefix=CHECK-AARCH64 %s
// CHECK-AARCH64: "-cc1"
@@ -26,12 +26,12 @@
// CHECK-AARCH64: "darwinpcs"
// CHECK-AARCH64-NOT: "-fdebug-compilation-dir
-// RUN: %clang -target hexagon-unknown-elf -ffixed-r19 -fembed-bitcode=all -c %s -### 2>&1 \
+// RUN: %clang --target=hexagon-unknown-elf -ffixed-r19 -fembed-bitcode=all -c %s -### 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-HEXAGON %s
// CHECK-HEXAGON: "-target-feature"
// CHECK-HEXAGON: "+reserved-r19"
//
-// RUN: %clang -target wasm32-unknown-unknown -fembed-bitcode=all -pthread -c %s -o /dev/null -### 2>&1 \
+// RUN: %clang --target=wasm32-unknown-unknown -fembed-bitcode=all -pthread -c %s -o /dev/null -### 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-WASM %s
// CHECK-WASM: "-cc1"
diff --git a/clang/test/Driver/fexcess-precision.c b/clang/test/Driver/fexcess-precision.c
index 68579b6..0aa1022 100644
--- a/clang/test/Driver/fexcess-precision.c
+++ b/clang/test/Driver/fexcess-precision.c
@@ -1,19 +1,19 @@
// Note: %s must be preceded by --, otherwise it may be interpreted as a
// command-line option, e.g. on Mac where %s is commonly under /Users.
-// RUN: %clang -### -target i386 -fexcess-precision=fast -c %s 2>&1 \
+// RUN: %clang -### --target=i386 -fexcess-precision=fast -c %s 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-FAST %s
-// RUN: %clang_cl -### -target i386 -fexcess-precision=fast -c -- %s 2>&1 \
+// RUN: %clang_cl -### --target=i386 -fexcess-precision=fast -c -- %s 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-FAST %s
-// RUN: %clang -### -target i386 -fexcess-precision=standard -c %s 2>&1 \
+// RUN: %clang -### --target=i386 -fexcess-precision=standard -c %s 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-STD %s
-// RUN: %clang_cl -### -target i386 -fexcess-precision=standard -c -- %s 2>&1 \
+// RUN: %clang_cl -### --target=i386 -fexcess-precision=standard -c -- %s 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-STD %s
-// RUN: %clang -### -target i386 -fexcess-precision=16 -c %s 2>&1 \
+// RUN: %clang -### --target=i386 -fexcess-precision=16 -c %s 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-NONE %s
-// RUN: %clang_cl -### -target i386 -fexcess-precision=16 -c -- %s 2>&1 \
+// RUN: %clang_cl -### --target=i386 -fexcess-precision=16 -c -- %s 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-NONE %s
// RUN: not %clang -### --target=i386 -fexcess-precision=none -c %s 2>&1 \
@@ -21,19 +21,19 @@
// RUN: not %clang_cl -### --target=i386 -fexcess-precision=none -c -- %s 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-ERR-NONE %s
-// RUN: %clang -### -target x86_64 -fexcess-precision=fast -c %s 2>&1 \
+// RUN: %clang -### --target=x86_64 -fexcess-precision=fast -c %s 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-FAST %s
-// RUN: %clang_cl -### -target x86_64 -fexcess-precision=fast -c -- %s 2>&1 \
+// RUN: %clang_cl -### --target=x86_64 -fexcess-precision=fast -c -- %s 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-FAST %s
-// RUN: %clang -### -target x86_64 -fexcess-precision=standard -c %s 2>&1 \
+// RUN: %clang -### --target=x86_64 -fexcess-precision=standard -c %s 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-STD %s
-// RUN: %clang_cl -### -target x86_64 -fexcess-precision=standard -c \
+// RUN: %clang_cl -### --target=x86_64 -fexcess-precision=standard -c \
// RUN: -- %s 2>&1 | FileCheck --check-prefix=CHECK-STD %s
-// RUN: %clang -### -target x86_64 -fexcess-precision=16 -c %s 2>&1 \
+// RUN: %clang -### --target=x86_64 -fexcess-precision=16 -c %s 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-NONE %s
-// RUN: %clang_cl -### -target x86_64 -fexcess-precision=16 -c -- %s 2>&1 \
+// RUN: %clang_cl -### --target=x86_64 -fexcess-precision=16 -c -- %s 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-NONE %s
// RUN: not %clang -### --target=x86_64 -fexcess-precision=none -c %s 2>&1 \
@@ -41,14 +41,14 @@
// RUN: not %clang_cl -### --target=x86_64 -fexcess-precision=none -c -- %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK-ERR-NONE %s
-// RUN: %clang -### -target aarch64 -fexcess-precision=fast -c %s 2>&1 \
+// RUN: %clang -### --target=aarch64 -fexcess-precision=fast -c %s 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK %s
-// RUN: %clang_cl -### -target aarch64 -fexcess-precision=fast -c -- %s 2>&1 \
+// RUN: %clang_cl -### --target=aarch64 -fexcess-precision=fast -c -- %s 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK %s
-// RUN: %clang -### -target aarch64 -fexcess-precision=standard -c %s 2>&1 \
+// RUN: %clang -### --target=aarch64 -fexcess-precision=standard -c %s 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK %s
-// RUN: %clang_cl -### -target aarch64 -fexcess-precision=standard -c \
+// RUN: %clang_cl -### --target=aarch64 -fexcess-precision=standard -c \
// RUN: -- %s 2>&1 | FileCheck --check-prefix=CHECK %s
// RUN: not %clang -### --target=aarch64 -fexcess-precision=16 -c %s 2>&1 \
diff --git a/clang/test/Driver/fextend-args.c b/clang/test/Driver/fextend-args.c
index 7f19f8c..0b72120 100644
--- a/clang/test/Driver/fextend-args.c
+++ b/clang/test/Driver/fextend-args.c
@@ -5,7 +5,7 @@
// RUN: | FileCheck -check-prefix=CHECK-64 %s
// Unsupported target
-// RUN: not %clang -target aarch64-unknown-windows-msvc -fextend-arguments=32 %s 2>&1 \
+// RUN: not %clang --target=aarch64-unknown-windows-msvc -fextend-arguments=32 %s 2>&1 \
// RUN: | FileCheck -check-prefix=UNSUPPORTED-TARGET %s
// Invalid option value
diff --git a/clang/test/Driver/fforce-dwarf-frame.c b/clang/test/Driver/fforce-dwarf-frame.c
index fb5442c..c4bc261 100644
--- a/clang/test/Driver/fforce-dwarf-frame.c
+++ b/clang/test/Driver/fforce-dwarf-frame.c
@@ -1,6 +1,6 @@
-// RUN: %clang -target arm -c -### %s -fforce-dwarf-frame 2>&1 | FileCheck --check-prefix=CHECK-ALWAYS %s
-// RUN: %clang -target arm -c -### %s -fno-force-dwarf-frame 2>&1 | FileCheck --check-prefix=CHECK-NO-ALWAYS %s
-// RUN: %clang -target arm -c -### %s 2>&1 | FileCheck --check-prefix=CHECK-NO-ALWAYS %s
+// RUN: %clang --target=arm -c -### %s -fforce-dwarf-frame 2>&1 | FileCheck --check-prefix=CHECK-ALWAYS %s
+// RUN: %clang --target=arm -c -### %s -fno-force-dwarf-frame 2>&1 | FileCheck --check-prefix=CHECK-NO-ALWAYS %s
+// RUN: %clang --target=arm -c -### %s 2>&1 | FileCheck --check-prefix=CHECK-NO-ALWAYS %s
// CHECK-ALWAYS: -fforce-dwarf-frame
// CHECK-NO-ALWAYS-NOT: -fforce-dwarf-frame
diff --git a/clang/test/Driver/fgnuc-version.c b/clang/test/Driver/fgnuc-version.c
index dea82bb..c5c8ca1 100644
--- a/clang/test/Driver/fgnuc-version.c
+++ b/clang/test/Driver/fgnuc-version.c
@@ -2,25 +2,25 @@
// Verify -fgnuc-version parsing
//
-// RUN: %clang -c %s -target i686-linux -### 2>&1 | FileCheck %s -check-prefix GNUC-DEFAULT
+// RUN: %clang -c %s --target=i686-linux -### 2>&1 | FileCheck %s --check-prefix=GNUC-DEFAULT
// GNUC-DEFAULT: "-fgnuc-version=4.2.1"
-// RUN: %clang -c %s -target i686-linux -fgnuc-version=100.99.99 -### 2>&1 | FileCheck %s -check-prefix GNUC-OVERRIDE
+// RUN: %clang -c %s --target=i686-linux -fgnuc-version=100.99.99 -### 2>&1 | FileCheck %s --check-prefix=GNUC-OVERRIDE
// GNUC-OVERRIDE: "-fgnuc-version=100.99.99"
-// RUN: %clang -c %s -target i686-linux -fgnuc-version=0 -### 2>&1 | FileCheck %s -check-prefix GNUC-DISABLE
-// RUN: %clang -c %s -target i686-linux -fgnuc-version= -### 2>&1 | FileCheck %s -check-prefix GNUC-DISABLE
+// RUN: %clang -c %s --target=i686-linux -fgnuc-version=0 -### 2>&1 | FileCheck %s --check-prefix=GNUC-DISABLE
+// RUN: %clang -c %s --target=i686-linux -fgnuc-version= -### 2>&1 | FileCheck %s --check-prefix=GNUC-DISABLE
// GNUC-DISABLE-NOT: "-fgnuc-version=
-// RUN: not %clang -c %s -target i686-linux -fgnuc-version=100.100.10 2>&1 | FileCheck %s -check-prefix GNUC-INVALID
-// RUN: not %clang -c %s -target i686-linux -fgnuc-version=100.10.100 2>&1 | FileCheck %s -check-prefix GNUC-INVALID
-// RUN: not %clang -c %s -target i686-linux -fgnuc-version=-1.0.0 2>&1 | FileCheck %s -check-prefix GNUC-INVALID
+// RUN: not %clang -c %s --target=i686-linux -fgnuc-version=100.100.10 2>&1 | FileCheck %s --check-prefix=GNUC-INVALID
+// RUN: not %clang -c %s --target=i686-linux -fgnuc-version=100.10.100 2>&1 | FileCheck %s --check-prefix=GNUC-INVALID
+// RUN: not %clang -c %s --target=i686-linux -fgnuc-version=-1.0.0 2>&1 | FileCheck %s --check-prefix=GNUC-INVALID
// GNUC-INVALID: error: invalid value {{.*}} in '-fgnuc-version={{.*}}'
-// RUN: %clang -fgnuc-version=100.99.99 %s -dM -E -o - | FileCheck %s -check-prefix GNUC-LARGE
+// RUN: %clang -fgnuc-version=100.99.99 %s -dM -E -o - | FileCheck %s --check-prefix=GNUC-LARGE
// GNUC-LARGE: #define __GNUC_MINOR__ 99
// GNUC-LARGE: #define __GNUC_PATCHLEVEL__ 99
// GNUC-LARGE: #define __GNUC__ 100
-// RUN: %clang -fgnuc-version=100.99.99 -x c++ %s -dM -E -o - | FileCheck %s -check-prefix GXX-LARGE
+// RUN: %clang -fgnuc-version=100.99.99 -x c++ %s -dM -E -o - | FileCheck %s --check-prefix=GXX-LARGE
// GXX-LARGE: #define __GNUG__ 100
diff --git a/clang/test/Driver/flags.c b/clang/test/Driver/flags.c
index da25a5c..16b7606 100644
--- a/clang/test/Driver/flags.c
+++ b/clang/test/Driver/flags.c
@@ -25,11 +25,11 @@
// RUN: %clang -target armv7-apple-darwin10 -### -S -mno-implicit-float -mimplicit-float %s 2>&1 | FileCheck -check-prefix=TEST8 %s
// TEST8-NOT: "-no-implicit-float"
-// RUN: %clang -target x86_64-linux-gnu -### -c -fclang-abi-compat=3.2 %s 2>&1 | FileCheck -check-prefix=TEST9 %s
+// RUN: %clang --target=x86_64-linux-gnu -### -c -fclang-abi-compat=3.2 %s 2>&1 | FileCheck -check-prefix=TEST9 %s
// TEST9: "-fclang-abi-compat=3.2"
//
-// RUN: %clang -target riscv32 -### -S -mno-implicit-float %s 2>&1 | FileCheck -check-prefix=TEST10 %s
+// RUN: %clang --target=riscv32 -### -S -mno-implicit-float %s 2>&1 | FileCheck -check-prefix=TEST10 %s
// TEST10: "-no-implicit-float"
//
-// RUN: %clang -target riscv64 -### -S -mno-implicit-float %s 2>&1 | FileCheck -check-prefix=TEST11 %s
+// RUN: %clang --target=riscv64 -### -S -mno-implicit-float %s 2>&1 | FileCheck -check-prefix=TEST11 %s
// TEST11: "-no-implicit-float"
diff --git a/clang/test/Driver/flang/msvc-link.f90 b/clang/test/Driver/flang/msvc-link.f90
index 536da25..4637495 100644
--- a/clang/test/Driver/flang/msvc-link.f90
+++ b/clang/test/Driver/flang/msvc-link.f90
@@ -1,4 +1,4 @@
-! RUN: %clang --driver-mode=flang -target x86_64-pc-windows-msvc -### %s -Ltest 2>&1 | FileCheck %s
+! RUN: %clang --driver-mode=flang --target=x86_64-pc-windows-msvc -### %s -Ltest 2>&1 | FileCheck %s
!
! Test that user provided paths come before the Flang runtimes
! CHECK: "-libpath:test"
diff --git a/clang/test/Driver/fmemprof.cpp b/clang/test/Driver/fmemprof.cpp
index b00d9f2..5165c44 100644
--- a/clang/test/Driver/fmemprof.cpp
+++ b/clang/test/Driver/fmemprof.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -target x86_64-linux-gnu -fmemory-profile %s -### 2>&1 | FileCheck %s
-// RUN: %clangxx -target x86_64-linux-gnu -fmemory-profile=foo %s -### 2>&1 | FileCheck %s --check-prefix=DIR
-// RUN: %clangxx -target x86_64-linux-gnu -fmemory-profile -fno-memory-profile %s -### 2>&1 | FileCheck %s --check-prefix=OFF
-// RUN: %clangxx -target x86_64-linux-gnu -fmemory-profile=foo -fno-memory-profile %s -### 2>&1 | FileCheck %s --check-prefix=OFF
+// RUN: %clangxx --target=x86_64-linux-gnu -fmemory-profile %s -### 2>&1 | FileCheck %s
+// RUN: %clangxx --target=x86_64-linux-gnu -fmemory-profile=foo %s -### 2>&1 | FileCheck %s --check-prefix=DIR
+// RUN: %clangxx --target=x86_64-linux-gnu -fmemory-profile -fno-memory-profile %s -### 2>&1 | FileCheck %s --check-prefix=OFF
+// RUN: %clangxx --target=x86_64-linux-gnu -fmemory-profile=foo -fno-memory-profile %s -### 2>&1 | FileCheck %s --check-prefix=OFF
// CHECK: "-cc1" {{.*}} "-fmemory-profile"
// CHECK: ld{{.*}}libclang_rt.memprof{{.*}}libclang_rt.memprof_cxx
// DIR: "-cc1" {{.*}} "-fmemory-profile=foo"
@@ -9,7 +9,7 @@
// OFF-NOT: "-fmemory-profile"
// OFF-NOT: libclang_rt.memprof
-// RUN: %clangxx -target x86_64-linux-gnu -fmemory-profile-use=foo %s -### 2>&1 | FileCheck %s --check-prefix=USE
+// RUN: %clangxx --target=x86_64-linux-gnu -fmemory-profile-use=foo %s -### 2>&1 | FileCheck %s --check-prefix=USE
// USE: "-cc1" {{.*}} "-fmemory-profile-use=foo"
// RUN: not %clangxx --target=x86_64-linux-gnu -fmemory-profile -fmemory-profile-use=foo %s -### 2>&1 | FileCheck %s --check-prefix=CONFLICTWITHMEMPROFINSTR
diff --git a/clang/test/Driver/fopenmp.c b/clang/test/Driver/fopenmp.c
index 2919469..7d343ee 100644
--- a/clang/test/Driver/fopenmp.c
+++ b/clang/test/Driver/fopenmp.c
@@ -1,27 +1,27 @@
-// RUN: %clang -target x86_64-linux-gnu -fopenmp=libomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
-// RUN: %clang -target x86_64-linux-gnu -fopenmp=libgomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-NO-OPENMP
-// RUN: %clang -target x86_64-linux-gnu -fopenmp=libiomp5 -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
+// RUN: %clang --target=x86_64-linux-gnu -fopenmp=libomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
+// RUN: %clang --target=x86_64-linux-gnu -fopenmp=libgomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-NO-OPENMP
+// RUN: %clang --target=x86_64-linux-gnu -fopenmp=libiomp5 -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
// RUN: %clang -target x86_64-apple-darwin -fopenmp=libomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
// RUN: %clang -target x86_64-apple-darwin -fopenmp=libgomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-NO-OPENMP
// RUN: %clang -target x86_64-apple-darwin -fopenmp=libiomp5 -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
-// RUN: %clang -target x86_64-freebsd -fopenmp=libomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
-// RUN: %clang -target x86_64-freebsd -fopenmp=libgomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-NO-OPENMP
-// RUN: %clang -target x86_64-freebsd -fopenmp=libiomp5 -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
-// RUN: %clang -target x86_64-netbsd -fopenmp=libomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
-// RUN: %clang -target x86_64-netbsd -fopenmp=libgomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-NO-OPENMP
-// RUN: %clang -target x86_64-netbsd -fopenmp=libiomp5 -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
-// RUN: %clang -target x86_64-openbsd -fopenmp=libomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
-// RUN: %clang -target x86_64-openbsd -fopenmp=libgomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-NO-OPENMP
-// RUN: %clang -target x86_64-openbsd -fopenmp=libiomp5 -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
-// RUN: %clang -target x86_64-dragonfly -fopenmp=libomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
-// RUN: %clang -target x86_64-dragonfly -fopenmp=libgomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-NO-OPENMP
-// RUN: %clang -target x86_64-dragonfly -fopenmp=libiomp5 -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
-// RUN: %clang -target i386-pc-solaris2.11 -fopenmp=libomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
-// RUN: %clang -target i386-pc-solaris2.11 -fopenmp=libgomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-NO-OPENMP
-// RUN: %clang -target i386-pc-solaris2.11 -fopenmp=libiomp5 -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
-// RUN: %clang -target x86_64-windows-gnu -fopenmp=libomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
-// RUN: %clang -target x86_64-windows-gnu -fopenmp=libgomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-NO-OPENMP
-// RUN: %clang -target x86_64-windows-gnu -fopenmp=libiomp5 -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
+// RUN: %clang --target=x86_64-freebsd -fopenmp=libomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
+// RUN: %clang --target=x86_64-freebsd -fopenmp=libgomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-NO-OPENMP
+// RUN: %clang --target=x86_64-freebsd -fopenmp=libiomp5 -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
+// RUN: %clang --target=x86_64-netbsd -fopenmp=libomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
+// RUN: %clang --target=x86_64-netbsd -fopenmp=libgomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-NO-OPENMP
+// RUN: %clang --target=x86_64-netbsd -fopenmp=libiomp5 -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
+// RUN: %clang --target=x86_64-openbsd -fopenmp=libomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
+// RUN: %clang --target=x86_64-openbsd -fopenmp=libgomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-NO-OPENMP
+// RUN: %clang --target=x86_64-openbsd -fopenmp=libiomp5 -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
+// RUN: %clang --target=x86_64-dragonfly -fopenmp=libomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
+// RUN: %clang --target=x86_64-dragonfly -fopenmp=libgomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-NO-OPENMP
+// RUN: %clang --target=x86_64-dragonfly -fopenmp=libiomp5 -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
+// RUN: %clang --target=i386-pc-solaris2.11 -fopenmp=libomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
+// RUN: %clang --target=i386-pc-solaris2.11 -fopenmp=libgomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-NO-OPENMP
+// RUN: %clang --target=i386-pc-solaris2.11 -fopenmp=libiomp5 -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
+// RUN: %clang --target=x86_64-windows-gnu -fopenmp=libomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
+// RUN: %clang --target=x86_64-windows-gnu -fopenmp=libgomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-NO-OPENMP
+// RUN: %clang --target=x86_64-windows-gnu -fopenmp=libiomp5 -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
// RUN: %clang_cl --target=x86_64-windows-msvc /clang:-fopenmp=libomp /openmp -### -- %s 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
// RUN: %clang_cl --target=x86_64-windows-msvc /clang:-fopenmp=libgomp /openmp -### -- %s 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-NO-OPENMP
// RUN: %clang_cl --target=x86_64-windows-msvc /clang:-fopenmp=libiomp5 /openmp -### -- %s 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
@@ -36,99 +36,99 @@
// CHECK-CC1-NO-OPENMP: "-cc1"
// CHECK-CC1-NO-OPENMP-NOT: "-fopenmp"
//
-// RUN: %clang -target x86_64-linux-gnu -fopenmp=libomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-OMP
-// RUN: %clang -target x86_64-linux-gnu -fopenmp=libgomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-GOMP --check-prefix=CHECK-LD-GOMP-RT
-// RUN: %clang -target x86_64-linux-gnu -fopenmp=libiomp5 %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-IOMP5
+// RUN: %clang --target=x86_64-linux-gnu -fopenmp=libomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-OMP
+// RUN: %clang --target=x86_64-linux-gnu -fopenmp=libgomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-GOMP --check-prefix=CHECK-LD-GOMP-RT
+// RUN: %clang --target=x86_64-linux-gnu -fopenmp=libiomp5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-IOMP5
//
-// RUN: %clang -target x86_64-linux-gnu -fopenmp=libomp -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-OMP
-// RUN: %clang -target x86_64-linux-gnu -fopenmp=libgomp -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-GOMP --check-prefix=CHECK-LD-STATIC-GOMP-RT
-// RUN: %clang -target x86_64-linux-gnu -fopenmp=libiomp5 -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5
-// RUN: %clang -target x86_64-linux-gnu -fopenmp=libiomp5 -static -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5-NO-BDYNAMIC
+// RUN: %clang --target=x86_64-linux-gnu -fopenmp=libomp -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-OMP
+// RUN: %clang --target=x86_64-linux-gnu -fopenmp=libgomp -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-GOMP --check-prefix=CHECK-LD-STATIC-GOMP-RT
+// RUN: %clang --target=x86_64-linux-gnu -fopenmp=libiomp5 -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5
+// RUN: %clang --target=x86_64-linux-gnu -fopenmp=libiomp5 -static -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5-NO-BDYNAMIC
//
-// RUN: %clang -nostdlib -target x86_64-linux-gnu -fopenmp=libomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OMP
-// RUN: %clang -nostdlib -target x86_64-linux-gnu -fopenmp=libgomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-GOMP
-// RUN: %clang -nostdlib -target x86_64-linux-gnu -fopenmp=libiomp5 %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IOMP5
+// RUN: %clang -nostdlib --target=x86_64-linux-gnu -fopenmp=libomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OMP
+// RUN: %clang -nostdlib --target=x86_64-linux-gnu -fopenmp=libgomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-GOMP
+// RUN: %clang -nostdlib --target=x86_64-linux-gnu -fopenmp=libiomp5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IOMP5
//
-// RUN: %clang -target x86_64-darwin -fopenmp=libomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-OMP
-// RUN: %clang -target x86_64-darwin -fopenmp=libgomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-GOMP --check-prefix=CHECK-LD-GOMP-NO-RT
-// RUN: %clang -target x86_64-darwin -fopenmp=libiomp5 %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-IOMP5
+// RUN: %clang --target=x86_64-darwin -fopenmp=libomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-OMP
+// RUN: %clang --target=x86_64-darwin -fopenmp=libgomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-GOMP --check-prefix=CHECK-LD-GOMP-NO-RT
+// RUN: %clang --target=x86_64-darwin -fopenmp=libiomp5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-IOMP5
//
-// RUN: %clang -nostdlib -target x86_64-darwin -fopenmp=libomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OMP
-// RUN: %clang -nostdlib -target x86_64-darwin -fopenmp=libgomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-GOMP
-// RUN: %clang -nostdlib -target x86_64-darwin -fopenmp=libiomp5 %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IOMP5
+// RUN: %clang -nostdlib --target=x86_64-darwin -fopenmp=libomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OMP
+// RUN: %clang -nostdlib --target=x86_64-darwin -fopenmp=libgomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-GOMP
+// RUN: %clang -nostdlib --target=x86_64-darwin -fopenmp=libiomp5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IOMP5
//
-// RUN: %clang -target x86_64-freebsd -fopenmp=libomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-OMP
-// RUN: %clang -target x86_64-freebsd -fopenmp=libgomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-GOMP --check-prefix=CHECK-LD-GOMP-NO-RT
-// RUN: %clang -target x86_64-freebsd -fopenmp=libiomp5 %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-IOMP5
+// RUN: %clang --target=x86_64-freebsd -fopenmp=libomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-OMP
+// RUN: %clang --target=x86_64-freebsd -fopenmp=libgomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-GOMP --check-prefix=CHECK-LD-GOMP-NO-RT
+// RUN: %clang --target=x86_64-freebsd -fopenmp=libiomp5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-IOMP5
//
-// RUN: %clang -target x86_64-freebsd -fopenmp=libomp -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-OMP
-// RUN: %clang -target x86_64-freebsd -fopenmp=libgomp -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-GOMP --check-prefix=CHECK-LD-STATIC-GOMP-NO-RT
-// RUN: %clang -target x86_64-freebsd -fopenmp=libiomp5 -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5
-// RUN: %clang -target x86_64-freebsd -fopenmp=libiomp5 -static -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5-NO-BDYNAMIC
+// RUN: %clang --target=x86_64-freebsd -fopenmp=libomp -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-OMP
+// RUN: %clang --target=x86_64-freebsd -fopenmp=libgomp -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-GOMP --check-prefix=CHECK-LD-STATIC-GOMP-NO-RT
+// RUN: %clang --target=x86_64-freebsd -fopenmp=libiomp5 -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5
+// RUN: %clang --target=x86_64-freebsd -fopenmp=libiomp5 -static -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5-NO-BDYNAMIC
//
-// RUN: %clang -nostdlib -target x86_64-freebsd -fopenmp=libomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OMP
-// RUN: %clang -nostdlib -target x86_64-freebsd -fopenmp=libgomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-GOMP
-// RUN: %clang -nostdlib -target x86_64-freebsd -fopenmp=libiomp5 %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IOMP5
+// RUN: %clang -nostdlib --target=x86_64-freebsd -fopenmp=libomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OMP
+// RUN: %clang -nostdlib --target=x86_64-freebsd -fopenmp=libgomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-GOMP
+// RUN: %clang -nostdlib --target=x86_64-freebsd -fopenmp=libiomp5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IOMP5
//
-// RUN: %clang -target x86_64-netbsd -fopenmp=libomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-OMP
-// RUN: %clang -target x86_64-netbsd -fopenmp=libgomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-GOMP --check-prefix=CHECK-LD-GOMP-NO-RT
-// RUN: %clang -target x86_64-netbsd -fopenmp=libiomp5 %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-IOMP5
+// RUN: %clang --target=x86_64-netbsd -fopenmp=libomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-OMP
+// RUN: %clang --target=x86_64-netbsd -fopenmp=libgomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-GOMP --check-prefix=CHECK-LD-GOMP-NO-RT
+// RUN: %clang --target=x86_64-netbsd -fopenmp=libiomp5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-IOMP5
//
-// RUN: %clang -target x86_64-netbsd -fopenmp=libomp -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-OMP
-// RUN: %clang -target x86_64-netbsd -fopenmp=libgomp -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-GOMP --check-prefix=CHECK-LD-STATIC-GOMP-NO-RT
-// RUN: %clang -target x86_64-netbsd -fopenmp=libiomp5 -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5
-// RUN: %clang -target x86_64-netbsd -fopenmp=libiomp5 -static -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5-NO-BDYNAMIC
+// RUN: %clang --target=x86_64-netbsd -fopenmp=libomp -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-OMP
+// RUN: %clang --target=x86_64-netbsd -fopenmp=libgomp -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-GOMP --check-prefix=CHECK-LD-STATIC-GOMP-NO-RT
+// RUN: %clang --target=x86_64-netbsd -fopenmp=libiomp5 -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5
+// RUN: %clang --target=x86_64-netbsd -fopenmp=libiomp5 -static -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5-NO-BDYNAMIC
//
-// RUN: %clang -nostdlib -target x86_64-netbsd -fopenmp=libomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OMP
-// RUN: %clang -nostdlib -target x86_64-netbsd -fopenmp=libgomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-GOMP
-// RUN: %clang -nostdlib -target x86_64-netbsd -fopenmp=libiomp5 %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IOMP5
+// RUN: %clang -nostdlib --target=x86_64-netbsd -fopenmp=libomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OMP
+// RUN: %clang -nostdlib --target=x86_64-netbsd -fopenmp=libgomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-GOMP
+// RUN: %clang -nostdlib --target=x86_64-netbsd -fopenmp=libiomp5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IOMP5
//
-// RUN: %clang -target x86_64-openbsd -fopenmp=libomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-OMP
-// RUN: %clang -target x86_64-openbsd -fopenmp=libgomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-GOMP --check-prefix=CHECK-LD-GOMP-NO-RT
-// RUN: %clang -target x86_64-openbsd -fopenmp=libiomp5 %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-IOMP5
+// RUN: %clang --target=x86_64-openbsd -fopenmp=libomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-OMP
+// RUN: %clang --target=x86_64-openbsd -fopenmp=libgomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-GOMP --check-prefix=CHECK-LD-GOMP-NO-RT
+// RUN: %clang --target=x86_64-openbsd -fopenmp=libiomp5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-IOMP5
//
-// RUN: %clang -target x86_64-openbsd -fopenmp=libomp -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-OMP
-// RUN: %clang -target x86_64-openbsd -fopenmp=libgomp -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-GOMP --check-prefix=CHECK-LD-STATIC-GOMP-NO-RT
-// RUN: %clang -target x86_64-openbsd -fopenmp=libiomp5 -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5
-// RUN: %clang -target x86_64-openbsd -fopenmp=libiomp5 -static -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5-NO-BDYNAMIC
+// RUN: %clang --target=x86_64-openbsd -fopenmp=libomp -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-OMP
+// RUN: %clang --target=x86_64-openbsd -fopenmp=libgomp -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-GOMP --check-prefix=CHECK-LD-STATIC-GOMP-NO-RT
+// RUN: %clang --target=x86_64-openbsd -fopenmp=libiomp5 -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5
+// RUN: %clang --target=x86_64-openbsd -fopenmp=libiomp5 -static -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5-NO-BDYNAMIC
//
-// RUN: %clang -nostdlib -target x86_64-openbsd -fopenmp=libomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OMP
-// RUN: %clang -nostdlib -target x86_64-openbsd -fopenmp=libgomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-GOMP
-// RUN: %clang -nostdlib -target x86_64-openbsd -fopenmp=libiomp5 %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IOMP5
+// RUN: %clang -nostdlib --target=x86_64-openbsd -fopenmp=libomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OMP
+// RUN: %clang -nostdlib --target=x86_64-openbsd -fopenmp=libgomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-GOMP
+// RUN: %clang -nostdlib --target=x86_64-openbsd -fopenmp=libiomp5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IOMP5
//
-// RUN: %clang -target x86_64-dragonfly -fopenmp=libomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-OMP
-// RUN: %clang -target x86_64-dragonfly -fopenmp=libgomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-GOMP --check-prefix=CHECK-LD-GOMP-NO-RT
-// RUN: %clang -target x86_64-dragonfly -fopenmp=libiomp5 %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-IOMP5
+// RUN: %clang --target=x86_64-dragonfly -fopenmp=libomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-OMP
+// RUN: %clang --target=x86_64-dragonfly -fopenmp=libgomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-GOMP --check-prefix=CHECK-LD-GOMP-NO-RT
+// RUN: %clang --target=x86_64-dragonfly -fopenmp=libiomp5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-IOMP5
//
-// RUN: %clang -target x86_64-dragonfly -fopenmp=libomp -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-OMP
-// RUN: %clang -target x86_64-dragonfly -fopenmp=libgomp -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-GOMP --check-prefix=CHECK-LD-STATIC-GOMP-NO-RT
-// RUN: %clang -target x86_64-dragonfly -fopenmp=libiomp5 -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5
-// RUN: %clang -target x86_64-dragonfly -fopenmp=libiomp5 -static -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5-NO-BDYNAMIC
+// RUN: %clang --target=x86_64-dragonfly -fopenmp=libomp -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-OMP
+// RUN: %clang --target=x86_64-dragonfly -fopenmp=libgomp -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-GOMP --check-prefix=CHECK-LD-STATIC-GOMP-NO-RT
+// RUN: %clang --target=x86_64-dragonfly -fopenmp=libiomp5 -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5
+// RUN: %clang --target=x86_64-dragonfly -fopenmp=libiomp5 -static -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5-NO-BDYNAMIC
//
-// RUN: %clang -nostdlib -target x86_64-dragonfly -fopenmp=libomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OMP
-// RUN: %clang -nostdlib -target x86_64-dragonfly -fopenmp=libgomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-GOMP
-// RUN: %clang -nostdlib -target x86_64-dragonfly -fopenmp=libiomp5 %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IOMP5
+// RUN: %clang -nostdlib --target=x86_64-dragonfly -fopenmp=libomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OMP
+// RUN: %clang -nostdlib --target=x86_64-dragonfly -fopenmp=libgomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-GOMP
+// RUN: %clang -nostdlib --target=x86_64-dragonfly -fopenmp=libiomp5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IOMP5
//
-// RUN: %clang -target i386-pc-solaris2.11 -fopenmp=libomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-OMP
-// RUN: %clang -target i386-pc-solaris2.11 -fopenmp=libgomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-GOMP --check-prefix=CHECK-LD-GOMP-NO-RT
-// RUN: %clang -target i386-pc-solaris2.11 -fopenmp=libiomp5 %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-IOMP5
+// RUN: %clang --target=i386-pc-solaris2.11 -fopenmp=libomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-OMP
+// RUN: %clang --target=i386-pc-solaris2.11 -fopenmp=libgomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-GOMP --check-prefix=CHECK-LD-GOMP-NO-RT
+// RUN: %clang --target=i386-pc-solaris2.11 -fopenmp=libiomp5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-IOMP5
//
-// RUN: %clang -target i386-pc-solaris2.11 -fopenmp=libomp -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-OMP
-// RUN: %clang -target i386-pc-solaris2.11 -fopenmp=libgomp -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-GOMP --check-prefix=CHECK-LD-STATIC-GOMP-NO-RT
-// RUN: %clang -target i386-pc-solaris2.11 -fopenmp=libiomp5 -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5
-// RUN: %clang -target i386-pc-solaris2.11 -fopenmp=libiomp5 -static -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5-NO-BDYNAMIC
+// RUN: %clang --target=i386-pc-solaris2.11 -fopenmp=libomp -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-OMP
+// RUN: %clang --target=i386-pc-solaris2.11 -fopenmp=libgomp -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-GOMP --check-prefix=CHECK-LD-STATIC-GOMP-NO-RT
+// RUN: %clang --target=i386-pc-solaris2.11 -fopenmp=libiomp5 -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5
+// RUN: %clang --target=i386-pc-solaris2.11 -fopenmp=libiomp5 -static -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5-NO-BDYNAMIC
//
-// RUN: %clang -nostdlib -target i386-pc-solaris2.11 -fopenmp=libomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OMP
-// RUN: %clang -nostdlib -target i386-pc-solaris2.11 -fopenmp=libgomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-GOMP
-// RUN: %clang -nostdlib -target i386-pc-solaris2.11 -fopenmp=libiomp5 %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IOMP5
+// RUN: %clang -nostdlib --target=i386-pc-solaris2.11 -fopenmp=libomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OMP
+// RUN: %clang -nostdlib --target=i386-pc-solaris2.11 -fopenmp=libgomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-GOMP
+// RUN: %clang -nostdlib --target=i386-pc-solaris2.11 -fopenmp=libiomp5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IOMP5
//
-// RUN: %clang -target x86_64-windows-gnu -fopenmp=libomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-OMP
-// RUN: %clang -target x86_64-windows-gnu -fopenmp=libgomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-GOMP --check-prefix=CHECK-LD-GOMP-NO-RT
-// RUN: %clang -target x86_64-windows-gnu -fopenmp=libiomp5 %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-IOMP5MD
+// RUN: %clang --target=x86_64-windows-gnu -fopenmp=libomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-OMP
+// RUN: %clang --target=x86_64-windows-gnu -fopenmp=libgomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-GOMP --check-prefix=CHECK-LD-GOMP-NO-RT
+// RUN: %clang --target=x86_64-windows-gnu -fopenmp=libiomp5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-IOMP5MD
//
-// RUN: %clang -nostdlib -target x86_64-windows-gnu -fopenmp=libomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OMP
-// RUN: %clang -nostdlib -target x86_64-windows-gnu -fopenmp=libgomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-GOMP
-// RUN: %clang -nostdlib -target x86_64-windows-gnu -fopenmp=libiomp5 %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IOMP5MD
+// RUN: %clang -nostdlib --target=x86_64-windows-gnu -fopenmp=libomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OMP
+// RUN: %clang -nostdlib --target=x86_64-windows-gnu -fopenmp=libgomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-GOMP
+// RUN: %clang -nostdlib --target=x86_64-windows-gnu -fopenmp=libiomp5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IOMP5MD
//
// CHECK-LD-OMP: "{{.*}}ld{{(.exe)?}}"
// CHECK-LD-OMP: "-lomp"
@@ -172,7 +172,7 @@
// CHECK-LD-STATIC-IOMP5-NO-BDYNAMIC: "-{{B?}}static" {{.*}} "-liomp5"
// CHECK-LD-STATIC-IOMP5-NO-BDYNAMIC-NOT: "-Bdynamic"
//
-// RUN: %clang -target x86_64-linux-gnu -fopenmp=libomp -fopenmp-enable-irbuilder -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMPIRBUILDER
+// RUN: %clang --target=x86_64-linux-gnu -fopenmp=libomp -fopenmp-enable-irbuilder -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMPIRBUILDER
//
// CHECK-CC1-OPENMPIRBUILDER: "-cc1"
// CHECK-CC1-OPENMPIRBUILDER-SAME: "-fopenmp"
@@ -184,14 +184,14 @@
// test the CC1 invocation. Instead, just ensure we do eventually link *some*
// OpenMP runtime.
//
-// RUN: %clang -target x86_64-linux-gnu -fopenmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY
-// RUN: %clang -target x86_64-darwin -fopenmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY
-// RUN: %clang -target x86_64-freebsd -fopenmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY
-// RUN: %clang -target x86_64-netbsd -fopenmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY
-// RUN: %clang -target x86_64-openbsd -fopenmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY
-// RUN: %clang -target x86_64-dragonfly -fopenmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY
-// RUN: %clang -target i386-pc-solaris2.11 -fopenmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY
-// RUN: %clang -target x86_64-windows-gnu -fopenmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANYMD
+// RUN: %clang --target=x86_64-linux-gnu -fopenmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY
+// RUN: %clang --target=x86_64-darwin -fopenmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY
+// RUN: %clang --target=x86_64-freebsd -fopenmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY
+// RUN: %clang --target=x86_64-netbsd -fopenmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY
+// RUN: %clang --target=x86_64-openbsd -fopenmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY
+// RUN: %clang --target=x86_64-dragonfly -fopenmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY
+// RUN: %clang --target=i386-pc-solaris2.11 -fopenmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY
+// RUN: %clang --target=x86_64-windows-gnu -fopenmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANYMD
//
// CHECK-LD-ANY: "{{.*}}ld{{(.exe)?}}"
// CHECK-LD-ANY: "-l{{(omp|gomp|iomp5)}}"
diff --git a/clang/test/Driver/fortran.f95 b/clang/test/Driver/fortran.f95
index db3ff2d..275b188 100644
--- a/clang/test/Driver/fortran.f95
+++ b/clang/test/Driver/fortran.f95
@@ -1,21 +1,21 @@
! Check that the clang driver can invoke gcc to compile Fortran when in
! --driver-mode=clang. This is legacy behaviour - see also --driver-mode=flang.
-! RUN: %clang -target x86_64-unknown-linux-gnu -integrated-as -c %s -### 2>&1 \
+! RUN: %clang --target=x86_64-unknown-linux-gnu -integrated-as -c %s -### 2>&1 \
! RUN: | FileCheck --check-prefix=CHECK-OBJECT %s
! CHECK-OBJECT: gcc
! CHECK-OBJECT: "-c"
! CHECK-OBJECT: "-x" "f95"
! CHECK-OBJECT-NOT: "-cc1as"
-! RUN: %clang -target x86_64-unknown-linux-gnu -integrated-as -S %s -### 2>&1 \
+! RUN: %clang --target=x86_64-unknown-linux-gnu -integrated-as -S %s -### 2>&1 \
! RUN: | FileCheck --check-prefix=CHECK-ASM %s
! CHECK-ASM: gcc
! CHECK-ASM: "-S"
! CHECK-ASM: "-x" "f95"
! CHECK-ASM-NOT: "-cc1"
-! RUN: %clang -Wall -target x86_64-unknown-linux-gnu -integrated-as %s -o %t -### 2>&1 | FileCheck --check-prefix=CHECK-WARN %s
+! RUN: %clang -Wall --target=x86_64-unknown-linux-gnu -integrated-as %s -### 2>&1 | FileCheck --check-prefix=CHECK-WARN %s
! CHECK-WARN: gcc
! CHECK-WARN-NOT: "-Wall"
! CHECK-WARN: ld
diff --git a/clang/test/Driver/fpatchable-function-entry.c b/clang/test/Driver/fpatchable-function-entry.c
index 4d0d609..ab04fd3 100644
--- a/clang/test/Driver/fpatchable-function-entry.c
+++ b/clang/test/Driver/fpatchable-function-entry.c
@@ -1,23 +1,23 @@
-// RUN: %clang -target i386 %s -fpatchable-function-entry=1 -c -### 2>&1 | FileCheck %s
-// RUN: %clang -target x86_64 %s -fpatchable-function-entry=1 -c -### 2>&1 | FileCheck %s
-// RUN: %clang -target aarch64 %s -fpatchable-function-entry=1 -c -### 2>&1 | FileCheck %s
-// RUN: %clang -target aarch64 %s -fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s
-// RUN: %clang -target loongarch32 %s -fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s
-// RUN: %clang -target loongarch64 %s -fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s
-// RUN: %clang -target riscv32 %s -fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s
-// RUN: %clang -target riscv64 %s -fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s
+// RUN: %clang --target=i386 %s -fpatchable-function-entry=1 -c -### 2>&1 | FileCheck %s
+// RUN: %clang --target=x86_64 %s -fpatchable-function-entry=1 -c -### 2>&1 | FileCheck %s
+// RUN: %clang --target=aarch64 %s -fpatchable-function-entry=1 -c -### 2>&1 | FileCheck %s
+// RUN: %clang --target=aarch64 %s -fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s
+// RUN: %clang --target=loongarch32 %s -fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s
+// RUN: %clang --target=loongarch64 %s -fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s
+// RUN: %clang --target=riscv32 %s -fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s
+// RUN: %clang --target=riscv64 %s -fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s
// CHECK: "-fpatchable-function-entry=1"
-// RUN: %clang -target aarch64 -fsyntax-only %s -fpatchable-function-entry=1,1 -c -### 2>&1 | FileCheck --check-prefix=11 %s
+// RUN: %clang --target=aarch64 -fsyntax-only %s -fpatchable-function-entry=1,1 -c -### 2>&1 | FileCheck --check-prefix=11 %s
// 11: "-fpatchable-function-entry=1" "-fpatchable-function-entry-offset=1"
-// RUN: %clang -target aarch64 -fsyntax-only %s -fpatchable-function-entry=2,1 -c -### 2>&1 | FileCheck --check-prefix=21 %s
+// RUN: %clang --target=aarch64 -fsyntax-only %s -fpatchable-function-entry=2,1 -c -### 2>&1 | FileCheck --check-prefix=21 %s
// 21: "-fpatchable-function-entry=2" "-fpatchable-function-entry-offset=1"
-// RUN: not %clang -target ppc64 -fsyntax-only %s -fpatchable-function-entry=1 2>&1 | FileCheck --check-prefix=TARGET %s
+// RUN: not %clang --target=ppc64 -fsyntax-only %s -fpatchable-function-entry=1 2>&1 | FileCheck --check-prefix=TARGET %s
// TARGET: error: unsupported option '-fpatchable-function-entry=1' for target 'ppc64'
-// RUN: not %clang -target x86_64 -fsyntax-only %s -fpatchable-function-entry=1,0, 2>&1 | FileCheck --check-prefix=EXCESS %s
+// RUN: not %clang --target=x86_64 -fsyntax-only %s -fpatchable-function-entry=1,0, 2>&1 | FileCheck --check-prefix=EXCESS %s
// EXCESS: error: invalid argument '1,0,' to -fpatchable-function-entry=
-// RUN: not %clang -target aarch64-linux -fsyntax-only %s -fxray-instrument -fpatchable-function-entry=1 2>&1 | FileCheck --check-prefix=XRAY %s
+// RUN: not %clang --target=aarch64-linux -fsyntax-only %s -fxray-instrument -fpatchable-function-entry=1 2>&1 | FileCheck --check-prefix=XRAY %s
// XRAY: error: invalid argument '-fxray-instrument' not allowed with '-fpatchable-function-entry='
diff --git a/clang/test/Driver/frame-pointer-elim.c b/clang/test/Driver/frame-pointer-elim.c
index e1b0a46..cdedcc7 100644
--- a/clang/test/Driver/frame-pointer-elim.c
+++ b/clang/test/Driver/frame-pointer-elim.c
@@ -6,39 +6,39 @@
// KEEP-NONE: "-mframe-pointer=none"
// On Linux x86, omit frame pointer when optimization is enabled.
-// RUN: %clang -### -target i386-linux -S -fomit-frame-pointer %s 2>&1 | \
+// RUN: %clang -### --target=i386-linux -S -fomit-frame-pointer %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NONE %s
-// RUN: %clang -### -target i386-linux -S -O1 %s 2>&1 | \
+// RUN: %clang -### --target=i386-linux -S -O1 %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NONE %s
// -fno-omit-frame-pointer or -pg disables frame pointer omission.
-// RUN: %clang -### -target i386-linux -S %s 2>&1 | \
+// RUN: %clang -### --target=i386-linux -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
-// RUN: %clang -### -target i386-linux -S -O1 -fno-omit-frame-pointer %s 2>&1 | \
+// RUN: %clang -### --target=i386-linux -S -O1 -fno-omit-frame-pointer %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
-// RUN: %clang -### -target i386-linux -S -O1 -pg %s 2>&1 | \
+// RUN: %clang -### --target=i386-linux -S -O1 -pg %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
// -momit-leaf-frame-pointer omits leaf frame pointer.
// -fno-omit-frame-pointer loses out to -momit-leaf-frame-pointer.
-// RUN: %clang -### -target i386 -S -momit-leaf-frame-pointer %s 2>&1 | \
+// RUN: %clang -### --target=i386 -S -momit-leaf-frame-pointer %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s
-// RUN: %clang -### -target i386-linux -S -O1 -fno-omit-frame-pointer -momit-leaf-frame-pointer %s 2>&1 | \
+// RUN: %clang -### --target=i386-linux -S -O1 -fno-omit-frame-pointer -momit-leaf-frame-pointer %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s
-// RUN: %clang -### -target i386-linux -S -O1 -momit-leaf-frame-pointer %s 2>&1 | \
+// RUN: %clang -### --target=i386-linux -S -O1 -momit-leaf-frame-pointer %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NONE %s
// fno-omit-frame-pointer -momit-leaf-frame-pointer can be overwritten by
// fomit-frame-pointer later on the command without warning
-// RUN: %clang -### -target i386-linux -S -O1 -fno-omit-frame-pointer -momit-leaf-frame-pointer -fomit-frame-pointer %s 2>&1 | \
+// RUN: %clang -### --target=i386-linux -S -O1 -fno-omit-frame-pointer -momit-leaf-frame-pointer -fomit-frame-pointer %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NONE %s
-// RUN: %clang -### -target i386-linux -S -O1 -fno-omit-frame-pointer -momit-leaf-frame-pointer %s 2>&1 | \
+// RUN: %clang -### --target=i386-linux -S -O1 -fno-omit-frame-pointer -momit-leaf-frame-pointer %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s
// Explicit or default -fomit-frame-pointer wins over -mno-omit-leaf-frame-pointer.
-// RUN: %clang -### -target i386 -S %s -fomit-frame-pointer -mno-omit-leaf-frame-pointer 2>&1 | \
+// RUN: %clang -### --target=i386 -S %s -fomit-frame-pointer -mno-omit-leaf-frame-pointer 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NONE %s
-// RUN: %clang -### -target i386-linux -S %s -O1 -mno-omit-leaf-frame-pointer 2>&1 | \
+// RUN: %clang -### --target=i386-linux -S %s -O1 -mno-omit-leaf-frame-pointer 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NONE %s
// -pg -fomit-frame-pointer => error.
@@ -48,10 +48,10 @@
// CHECK-MIX-NO-OMIT-FP-PG-NOT: '-fomit-frame-pointer' not allowed with '-pg'
// NetBSD follows the same rules as Linux.
-// RUN: %clang -### -target x86_64-unknown-netbsd -S -O1 %s 2>&1 | \
+// RUN: %clang -### --target=x86_64-unknown-netbsd -S -O1 %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NONE %s
-// RUN: %clang -### -target x86_64-unknown-netbsd -S %s 2>&1 | \
+// RUN: %clang -### --target=x86_64-unknown-netbsd -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
// Darwin disables omitting the leaf frame pointer even under optimization
@@ -62,10 +62,10 @@
// RUN: %clang -### -target i386-apple-darwin -S -O1 %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
-// RUN: %clang -### -target i386-darwin -S -fomit-frame-pointer %s 2>&1 | \
+// RUN: %clang -### --target=i386-darwin -S -fomit-frame-pointer %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NONE %s
-// RUN: %clang -### -target i386-darwin -S -momit-leaf-frame-pointer %s 2>&1 | \
+// RUN: %clang -### --target=i386-darwin -S -momit-leaf-frame-pointer %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s
// RUN: %clang -### -target armv7s-apple-ios -fomit-frame-pointer %s 2>&1 | \
@@ -85,19 +85,19 @@
// On AArch64, PS4, PS5, and VE, default to omitting the frame pointer on leaf
// functions
-// RUN: %clang -### -target aarch64 -S %s 2>&1 | \
+// RUN: %clang -### --target=aarch64 -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s
-// RUN: %clang -### -target x86_64-scei-ps4 -S %s 2>&1 | \
+// RUN: %clang -### --target=x86_64-scei-ps4 -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s
-// RUN: %clang -### -target x86_64-scei-ps4 -S -O2 %s 2>&1 | \
+// RUN: %clang -### --target=x86_64-scei-ps4 -S -O2 %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s
-// RUN: %clang -### -target x86_64-sie-ps5 -S %s 2>&1 | \
+// RUN: %clang -### --target=x86_64-sie-ps5 -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s
-// RUN: %clang -### -target x86_64-sie-ps5 -S -O2 %s 2>&1 | \
+// RUN: %clang -### --target=x86_64-sie-ps5 -S -O2 %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s
// RUN: %clang -### -target aarch64-apple-darwin -arch arm64_32 -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s
-// RUN: %clang -### -target ve-unknown-linux-gnu -S %s 2>&1 | \
+// RUN: %clang -### --target=ve-unknown-linux-gnu -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s
// RUN: %clang -### --target=aarch64-linux-android -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s
@@ -106,57 +106,57 @@
// RUN: %clang -### --target=aarch64-linux-android -S -Os %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s
-// RUN: %clang -### -target powerpc64 -S %s 2>&1 | \
+// RUN: %clang -### --target=powerpc64 -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
-// RUN: %clang -### -target powerpc64 -S -O1 %s 2>&1 | \
+// RUN: %clang -### --target=powerpc64 -S -O1 %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NONE %s
// SPARC targets omit the frame pointer when optimizations are enabled.
-// RUN: %clang -### -target sparc -S %s 2>&1 | \
+// RUN: %clang -### --target=sparc -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
-// RUN: %clang -### -target sparc -S -O1 %s 2>&1 | \
+// RUN: %clang -### --target=sparc -S -O1 %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NONE %s
-// RUN: %clang -### -target sparcel -S %s 2>&1 | \
+// RUN: %clang -### --target=sparcel -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
-// RUN: %clang -### -target sparcel -S -O1 %s 2>&1 | \
+// RUN: %clang -### --target=sparcel -S -O1 %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NONE %s
-// RUN: %clang -### -target sparc64 -S %s 2>&1 | \
+// RUN: %clang -### --target=sparc64 -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
-// RUN: %clang -### -target sparc64 -S -O1 %s 2>&1 | \
+// RUN: %clang -### --target=sparc64 -S -O1 %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NONE %s
// M68k targets omit the frame pointer when optimizations are enabled.
-// RUN: %clang -### -target m68k -S %s 2>&1 | \
+// RUN: %clang -### --target=m68k -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
-// RUN: %clang -### -target m68k -S -O1 %s 2>&1 | \
+// RUN: %clang -### --target=m68k -S -O1 %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NONE %s
// For AAarch32 (A32, T32) linux targets, default omit frame pointer when
// optimizations are enabled.
-// RUN: %clang -### -target arm-linux-gnueabihf- -marm -S %s 2>&1 | \
+// RUN: %clang -### --target=arm-linux-gnueabihf- -marm -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
-// RUN: %clang -### -target arm-linux-gnueabihf- -mthumb -S %s 2>&1 | \
+// RUN: %clang -### --target=arm-linux-gnueabihf- -mthumb -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
-// RUN: %clang -### -target arm-linux-gnueabihf- -marm -mbig-endian -S %s 2>&1 | \
+// RUN: %clang -### --target=arm-linux-gnueabihf- -marm -mbig-endian -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
-// RUN: %clang -### -target arm-linux-gnueabihf- -mthumb -mbig-endian -S %s 2>&1 | \
+// RUN: %clang -### --target=arm-linux-gnueabihf- -mthumb -mbig-endian -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
-// RUN: %clang -### -target arm-linux-gnueabihf- -marm -O1 -S %s 2>&1 | \
+// RUN: %clang -### --target=arm-linux-gnueabihf- -marm -O1 -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NONE %s
-// RUN: %clang -### -target arm-linux-gnueabihf- -mthumb -O1 -S %s 2>&1 | \
+// RUN: %clang -### --target=arm-linux-gnueabihf- -mthumb -O1 -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NONE %s
-// RUN: %clang -### -target arm-linux-gnueabihf- -marm -mbig-endian -O1 -S %s 2>&1 | \
+// RUN: %clang -### --target=arm-linux-gnueabihf- -marm -mbig-endian -O1 -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NONE %s
-// RUN: %clang -### -target arm-linux-gnueabihf- -mthumb -mbig-endian -O1 -S %s 2>&1 | \
+// RUN: %clang -### --target=arm-linux-gnueabihf- -mthumb -mbig-endian -O1 -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NONE %s
// For Android, keep the framepointers always.
-// RUN: %clang -### -target armv7a-linux-androideabi- -marm -O1 -S %s 2>&1 | \
+// RUN: %clang -### --target=armv7a-linux-androideabi- -marm -O1 -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
-// RUN: %clang -### -target armv7a-linux-androideabi- -mthumb -O1 -S %s 2>&1 | \
+// RUN: %clang -### --target=armv7a-linux-androideabi- -mthumb -O1 -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
-// RUN: %clang -### -target armv7a-linux-androideabi- -marm -mbig-endian -O1 -S %s 2>&1 | \
+// RUN: %clang -### --target=armv7a-linux-androideabi- -marm -mbig-endian -O1 -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
-// RUN: %clang -### -target armv7a-linux-androideabi- -mthumb -mbig-endian -O1 -S %s 2>&1 | \
+// RUN: %clang -### --target=armv7a-linux-androideabi- -mthumb -mbig-endian -O1 -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
// RUN: %clang -### --target=riscv64-linux-android -O1 -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s
diff --git a/clang/test/Driver/freebsd-mips-as.c b/clang/test/Driver/freebsd-mips-as.c
index a053c21..428644a 100644
--- a/clang/test/Driver/freebsd-mips-as.c
+++ b/clang/test/Driver/freebsd-mips-as.c
@@ -1,91 +1,91 @@
// Check passing options to the assembler for MIPS targets.
//
-// RUN: %clang -target mips-unknown-freebsd -### \
+// RUN: %clang --target=mips-unknown-freebsd -### \
// RUN: -no-integrated-as -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=MIPS32-EB-AS %s
// MIPS32-EB-AS: as{{(.exe)?}}" "-march" "mips2" "-mabi" "32" "-EB"
// MIPS32-EB-AS-NOT: "-KPIC"
//
-// RUN: %clang -target mips-unknown-freebsd -### \
+// RUN: %clang --target=mips-unknown-freebsd -### \
// RUN: -no-integrated-as -fPIC -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=MIPS32-EB-PIC %s
// MIPS32-EB-PIC: as{{(.exe)?}}" "-march" "mips2" "-mabi" "32" "-EB"
// MIPS32-EB-PIC: "-KPIC"
//
-// RUN: %clang -target mips-unknown-freebsd -### \
+// RUN: %clang --target=mips-unknown-freebsd -### \
// RUN: -no-integrated-as -fpic -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=MIPS32-EB-PIC-SMALL %s
// MIPS32-EB-PIC-SMALL: as{{(.exe)?}}" "-march" "mips2" "-mabi" "32" "-EB"
// MIPS32-EB-PIC-SMALL: "-KPIC"
//
-// RUN: %clang -target mips-unknown-freebsd -### \
+// RUN: %clang --target=mips-unknown-freebsd -### \
// RUN: -no-integrated-as -fPIE -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=MIPS32-EB-PIE %s
// MIPS32-EB-PIE: as{{(.exe)?}}" "-march" "mips2" "-mabi" "32" "-EB"
// MIPS32-EB-PIE: "-KPIC"
//
-// RUN: %clang -target mips-unknown-freebsd -### \
+// RUN: %clang --target=mips-unknown-freebsd -### \
// RUN: -no-integrated-as -fpie -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=MIPS32-EB-PIE-SMALL %s
// MIPS32-EB-PIE-SMALL: as{{(.exe)?}}" "-march" "mips2" "-mabi" "32" "-EB"
// MIPS32-EB-PIE-SMALL: "-KPIC"
//
-// RUN: %clang -target mipsel-unknown-freebsd -### \
+// RUN: %clang --target=mipsel-unknown-freebsd -### \
// RUN: -no-integrated-as -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=MIPS32-DEF-EL-AS %s
// MIPS32-DEF-EL-AS: as{{(.exe)?}}" "-march" "mips2" "-mabi" "32" "-EL"
//
-// RUN: %clang -target mips64-unknown-freebsd -### \
+// RUN: %clang --target=mips64-unknown-freebsd -### \
// RUN: -no-integrated-as -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=MIPS64-EB-AS %s
// MIPS64-EB-AS: as{{(.exe)?}}" "-march" "mips3" "-mabi" "64" "-EB"
//
-// RUN: %clang -target mips64el-unknown-freebsd -### \
+// RUN: %clang --target=mips64el-unknown-freebsd -### \
// RUN: -no-integrated-as -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=MIPS64-DEF-EL-AS %s
// MIPS64-DEF-EL-AS: as{{(.exe)?}}" "-march" "mips3" "-mabi" "64" "-EL"
//
-// RUN: %clang -target mips64-unknown-freebsd -mabi=n32 -### \
+// RUN: %clang --target=mips64-unknown-freebsd -mabi=n32 -### \
// RUN: -no-integrated-as -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=MIPS-N32 %s
// MIPS-N32: as{{(.exe)?}}" "-march" "mips3" "-mabi" "n32" "-EB"
//
-// RUN: %clang -target mipsel-unknown-freebsd -mabi=32 -### \
+// RUN: %clang --target=mipsel-unknown-freebsd -mabi=32 -### \
// RUN: -no-integrated-as -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=MIPS32-EL-AS %s
// MIPS32-EL-AS: as{{(.exe)?}}" "-march" "mips2" "-mabi" "32" "-EL"
//
-// RUN: %clang -target mips64el-unknown-freebsd -mabi=64 -### \
+// RUN: %clang --target=mips64el-unknown-freebsd -mabi=64 -### \
// RUN: -no-integrated-as -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=MIPS64-EL-AS %s
// MIPS64-EL-AS: as{{(.exe)?}}" "-march" "mips3" "-mabi" "64" "-EL"
//
-// RUN: %clang -target mips-linux-freebsd -march=mips32r2 -### \
+// RUN: %clang --target=mips-linux-freebsd -march=mips32r2 -### \
// RUN: -no-integrated-as -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=MIPS-32R2 %s
// MIPS-32R2: as{{(.exe)?}}" "-march" "mips32r2" "-mabi" "32" "-EB"
//
-// RUN: %clang -target mips-unknown-freebsd -mips32 -### \
+// RUN: %clang --target=mips-unknown-freebsd -mips32 -### \
// RUN: -no-integrated-as -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=MIPS-ALIAS-32 %s
// MIPS-ALIAS-32: as{{(.exe)?}}" "-march" "mips32" "-mabi" "32" "-EB"
//
-// RUN: %clang -target mips-unknown-freebsd -mips32r2 -### \
+// RUN: %clang --target=mips-unknown-freebsd -mips32r2 -### \
// RUN: -no-integrated-as -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=MIPS-ALIAS-32R2 %s
// MIPS-ALIAS-32R2: as{{(.exe)?}}" "-march" "mips32r2" "-mabi" "32" "-EB"
//
-// RUN: %clang -target mips64-unknown-freebsd -mips64 -### \
+// RUN: %clang --target=mips64-unknown-freebsd -mips64 -### \
// RUN: -no-integrated-as -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=MIPS-ALIAS-64 %s
// MIPS-ALIAS-64: as{{(.exe)?}}" "-march" "mips64" "-mabi" "64" "-EB"
//
-// RUN: %clang -target mips64-unknown-freebsd -mips64r2 -### \
+// RUN: %clang --target=mips64-unknown-freebsd -mips64r2 -### \
// RUN: -no-integrated-as -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=MIPS-ALIAS-64R2 %s
// MIPS-ALIAS-64R2: as{{(.exe)?}}" "-march" "mips64r2" "-mabi" "64" "-EB"
//
-// RUN: %clang -target mips-unknown-freebsd -### \
+// RUN: %clang --target=mips-unknown-freebsd -### \
// RUN: -no-integrated-as -G0 -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=MIPS32-EB-AS-G0 %s
// MIPS32-EB-AS-G0: as{{(.exe)?}}" "-march" "mips2" "-mabi" "32" "-EB" "-G0"
diff --git a/clang/test/Driver/freebsd.cpp b/clang/test/Driver/freebsd.cpp
index 6ddab91..dc8c98d 100644
--- a/clang/test/Driver/freebsd.cpp
+++ b/clang/test/Driver/freebsd.cpp
@@ -1,15 +1,15 @@
-// RUN: %clangxx %s -### -o %t.o -target amd64-unknown-freebsd -stdlib=platform 2>&1 \
+// RUN: %clangxx %s -### -o %t.o --target=amd64-unknown-freebsd -stdlib=platform 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-DEFAULT %s
-// RUN: %clangxx %s -### -o %t.o -target amd64-unknown-freebsd10.0 -stdlib=platform 2>&1 \
+// RUN: %clangxx %s -### -o %t.o --target=amd64-unknown-freebsd10.0 -stdlib=platform 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-TEN %s
// CHECK-DEFAULT: "-lc++" "-lm"
// CHECK-TEN: "-lc++" "-lm"
-// RUN: %clangxx %s -### -pg -o %t.o -target amd64-unknown-freebsd -stdlib=platform 2>&1 \
+// RUN: %clangxx %s -### -pg -o %t.o --target=amd64-unknown-freebsd -stdlib=platform 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-PG-DEFAULT %s
-// RUN: %clangxx %s -### -pg -o %t.o -target amd64-unknown-freebsd14.0 -stdlib=platform 2>&1 \
+// RUN: %clangxx %s -### -pg -o %t.o --target=amd64-unknown-freebsd14.0 -stdlib=platform 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-PG-FOURTEEN %s
-// RUN: %clangxx %s -### -pg -o %t.o -target amd64-unknown-freebsd10.0 -stdlib=platform 2>&1 \
+// RUN: %clangxx %s -### -pg -o %t.o --target=amd64-unknown-freebsd10.0 -stdlib=platform 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-PG-TEN %s
// CHECK-PG-DEFAULT: "-lc++" "-lm"
// CHECK-PG-FOURTEEN: "-lc++" "-lm"
diff --git a/clang/test/Driver/fsanitize-coverage.c b/clang/test/Driver/fsanitize-coverage.c
index d34ad5f..c2de897 100644
--- a/clang/test/Driver/fsanitize-coverage.c
+++ b/clang/test/Driver/fsanitize-coverage.c
@@ -1,45 +1,45 @@
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=0 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-0
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=edge -fsanitize-coverage=0 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-0
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-0
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=0 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-0
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=edge -fsanitize-coverage=0 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-0
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-0
// CHECK-SANITIZE-COVERAGE-0-NOT: fsanitize-coverage-type
// CHECK-SANITIZE-COVERAGE-0: -fsanitize=address
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=kernel-address -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=hwaddress -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=kernel-hwaddress -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=memory -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=kernel-memory -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=leak -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=undefined -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=bounds -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=bool -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=dataflow -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=thread -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=kcfi -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
-// RUN: %clang -target %itanium_abi_triple -fsanitize=float-divide-by-zero -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=kernel-address -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=hwaddress -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=kernel-hwaddress -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=memory -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=kernel-memory -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=leak -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=bounds -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=bool -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=dataflow -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=thread -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=kcfi -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang --target=%itanium_abi_triple -fsanitize=float-divide-by-zero -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
// CHECK-SANITIZE-COVERAGE-FUNC: fsanitize-coverage-type=1
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=bb %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-BB
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=bb %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-BB
// CHECK-SANITIZE-COVERAGE-BB: fsanitize-coverage-type=2
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=edge %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-EDGE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=edge %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-EDGE
// CHECK-SANITIZE-COVERAGE-EDGE: fsanitize-coverage-type=3
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=edge,indirect-calls %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC_INDIR
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=edge,indirect-calls %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC_INDIR
// CHECK-SANITIZE-COVERAGE-FUNC_INDIR: fsanitize-coverage-type=3
// CHECK-SANITIZE-COVERAGE-FUNC_INDIR: fsanitize-coverage-indirect-calls
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-1
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-1
// CHECK-SANITIZE-COVERAGE-1: warning: argument '-fsanitize-coverage=1' is deprecated, use '-fsanitize-coverage=trace-pc-guard' instead
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=func %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_FUNC_BB_EDGE_DEPRECATED
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=bb %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_FUNC_BB_EDGE_DEPRECATED
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=edge %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_FUNC_BB_EDGE_DEPRECATED
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=func %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_FUNC_BB_EDGE_DEPRECATED
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=bb %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_FUNC_BB_EDGE_DEPRECATED
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=edge %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_FUNC_BB_EDGE_DEPRECATED
// CHECK_FUNC_BB_EDGE_DEPRECATED: warning: argument '-fsanitize-coverage=[func|bb|edge]' is deprecated, use '-fsanitize-coverage=[func|bb|edge],[trace-pc-guard|trace-pc],[control-flow]' instead
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=edge,indirect-calls,trace-pc,trace-cmp,trace-loads,trace-stores,trace-div,trace-gep %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FEATURES
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=edge,indirect-calls,trace-pc,trace-cmp,trace-loads,trace-stores,trace-div,trace-gep %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FEATURES
// CHECK-SANITIZE-COVERAGE-FEATURES: -fsanitize-coverage-type=3
// CHECK-SANITIZE-COVERAGE-FEATURES: -fsanitize-coverage-indirect-calls
// CHECK-SANITIZE-COVERAGE-FEATURES: -fsanitize-coverage-trace-cmp
@@ -49,7 +49,7 @@
// CHECK-SANITIZE-COVERAGE-FEATURES: -fsanitize-coverage-trace-loads
// CHECK-SANITIZE-COVERAGE-FEATURES: -fsanitize-coverage-trace-stores
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=func,edge,indirect-calls,trace-cmp -fno-sanitize-coverage=edge,indirect-calls %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-MASK
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=func,edge,indirect-calls,trace-cmp -fno-sanitize-coverage=edge,indirect-calls %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-MASK
// CHECK-MASK: -fsanitize-coverage-type=1
// CHECK-MASK: -fsanitize-coverage-trace-cmp
// CHECK-MASK-NOT: -fsanitize-coverage-
@@ -60,30 +60,30 @@
// RUN: not %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=func -fsanitize-coverage=edge %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-INCOMPATIBLE
// CHECK-INCOMPATIBLE: error: invalid argument '-fsanitize-coverage=func' not allowed with '-fsanitize-coverage=edge'
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=8bit-counters %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-8BIT
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=8bit-counters %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-8BIT
// CHECK-8BIT: warning: argument '-fsanitize-coverage=8bit-counters' is deprecated, use '-fsanitize-coverage=trace-pc-guard' instead
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=trace-bb %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TRACE-BB
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=trace-bb %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TRACE-BB
// CHECK-TRACE-BB: warning: argument '-fsanitize-coverage=trace-bb' is deprecated, use '-fsanitize-coverage=trace-pc-guard' instead
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TRACE_PC_EDGE
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=edge,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TRACE_PC_EDGE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TRACE_PC_EDGE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=edge,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TRACE_PC_EDGE
// CHECK-TRACE_PC_EDGE: -fsanitize-coverage-type=3
// CHECK-TRACE_PC_EDGE: -fsanitize-coverage-trace-pc
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TRACE_PC_FUNC
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TRACE_PC_FUNC
// CHECK-TRACE_PC_FUNC: -fsanitize-coverage-type=1
// CHECK-TRACE_PC_FUNC: -fsanitize-coverage-trace-pc
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=trace-pc-guard %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TRACE_PC_GUARD_EDGE
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=edge,trace-pc-guard %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TRACE_PC_GUARD_EDGE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=trace-pc-guard %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TRACE_PC_GUARD_EDGE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=edge,trace-pc-guard %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TRACE_PC_GUARD_EDGE
// CHECK-TRACE_PC_GUARD_EDGE: -fsanitize-coverage-type=3
// CHECK-TRACE_PC_GUARD_EDGE: -fsanitize-coverage-trace-pc-guard
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=func,trace-pc-guard %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TRACE_PC_GUARD_FUNC
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=func,trace-pc-guard %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TRACE_PC_GUARD_FUNC
// CHECK-TRACE_PC_GUARD_FUNC: -fsanitize-coverage-type=1
// CHECK-TRACE_PC_GUARD_FUNC: -fsanitize-coverage-trace-pc-guard
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=stack-depth %s \
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=stack-depth %s \
// RUN: -### 2>&1 | FileCheck %s --check-prefix=CHECK-STACK-DEPTH
-// RUN: %clang -target x86_64-linux-gnu \
+// RUN: %clang --target=x86_64-linux-gnu \
// RUN: -fsanitize-coverage=trace-pc-guard,stack-depth %s -### 2>&1 | \
// RUN: FileCheck %s --check-prefix=CHECK-STACK-DEPTH-PC-GUARD
// CHECK-STACK-DEPTH: -fsanitize-coverage-type=1
@@ -92,35 +92,35 @@
// CHECK-STACK-DEPTH-PC-GUARD: -fsanitize-coverage-trace-pc-guard
// CHECK-STACK-DEPTH-PC-GUARD: -fsanitize-coverage-stack-depth
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=trace-cmp,indirect-calls %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-TYPE-NECESSARY
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=trace-cmp,indirect-calls %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-TYPE-NECESSARY
// CHECK-NO-TYPE-NECESSARY-NOT: error:
// CHECK-NO-TYPE-NECESSARY: -fsanitize-coverage-indirect-calls
// CHECK-NO-TYPE-NECESSARY: -fsanitize-coverage-trace-cmp
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=func -fsanitize-coverage=trace-cmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-EXTEND-LEGACY
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=func -fsanitize-coverage=trace-cmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-EXTEND-LEGACY
// CHECK-EXTEND-LEGACY: -fsanitize-coverage-type=1
// CHECK-EXTEND-LEGACY: -fsanitize-coverage-trace-cmp
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=no-prune,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_NOPRUNE
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=no-prune,func,trace-pc-guard %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_NOPRUNE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=no-prune,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_NOPRUNE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=no-prune,func,trace-pc-guard %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_NOPRUNE
// CHECK_NOPRUNE: -fsanitize-coverage-no-prune
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=inline-8bit-counters %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_INLINE8BIT
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=bb,inline-8bit-counters %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_INLINE8BIT
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=inline-8bit-counters %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_INLINE8BIT
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=bb,inline-8bit-counters %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_INLINE8BIT
// CHECK_INLINE8BIT-NOT: warning:
// CHECK_INLINE8BIT: -fsanitize-coverage-inline-8bit-counters
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=inline-8bit-counters,pc-table %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_PC_TABLE_FOR_INLINE8BIT
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=trace-pc-guard,pc-table %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_PC_TABLE_FOR_INLINE8BIT
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=inline-8bit-counters,pc-table %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_PC_TABLE_FOR_INLINE8BIT
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=trace-pc-guard,pc-table %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_PC_TABLE_FOR_INLINE8BIT
// CHECK_PC_TABLE_FOR_INLINE8BIT: -fsanitize-coverage-pc-table
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=inline-bool-flag %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_INLINE_BOOL_FLAG
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=bb,inline-bool-flag %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_INLINE_BOOL_FLAG
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=inline-bool-flag %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_INLINE_BOOL_FLAG
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=bb,inline-bool-flag %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_INLINE_BOOL_FLAG
// CHECK_INLINE_BOOL_FLAG-NOT: warning:
// CHECK_INLINE_BOOL_FLAG: -fsanitize-coverage-inline-bool-flag
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=inline-bool-flag,pc-table %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_PC_TABLE_FOR_INLINEBOOL
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=trace-pc-guard,pc-table %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_PC_TABLE_FOR_INLINEBOOL
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=inline-bool-flag,pc-table %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_PC_TABLE_FOR_INLINEBOOL
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=trace-pc-guard,pc-table %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_PC_TABLE_FOR_INLINEBOOL
// CHECK_PC_TABLE_FOR_INLINEBOOL: -fsanitize-coverage-pc-table
// RUN: %clang_cl --target=i386-pc-win32 -fsanitize=address -fsanitize-coverage=func,trace-pc-guard -c -### -- %s 2>&1 | FileCheck %s -check-prefix=CLANG-CL-COVERAGE
@@ -131,11 +131,11 @@
// CLANG-CL-COVERAGE: -fsanitize-coverage-type=1
// CLANG-CL-COVERAGE: -fsanitize=address
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=safe-stack -fsanitize-coverage=trace-pc-guard %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-VS-SAFESTACK
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=safe-stack -fsanitize-coverage=trace-pc-guard %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-VS-SAFESTACK
// CHECK-VS-SAFESTACK: -fsanitize-coverage-trace-pc-guard
// CHECK-VS-SAFESTACK: -fsanitize=safe-stack
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=safe-stack -fsanitize-coverage=trace-pc-guard -fno-sanitize=safe-stack %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-SAFESTACK
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=safe-stack -fsanitize-coverage=trace-pc-guard -fno-sanitize=safe-stack %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-SAFESTACK
// CHECK-NO-SAFESTACK-NOT: error:
// CHECK-NO-SAFESTACK-NOT: warning:
// CHECK-NO-SAFESTACK-NOT: argument unused
@@ -143,11 +143,11 @@
// CHECK-NO-SAFESTACK-NOT: -fsanitize=safe-stack
// CHECK-NO-SAFESTACK: -fsanitize-coverage-trace-pc-guard
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=shadow-call-stack -fsanitize-coverage=trace-pc-guard %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-VS-SHADOWCALLSTACK
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=shadow-call-stack -fsanitize-coverage=trace-pc-guard %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-VS-SHADOWCALLSTACK
// CHECK-VS-SHADOWCALLSTACK: -fsanitize-coverage-trace-pc-guard
// CHECK-VS-SHADOWCALLSTACK: -fsanitize=shadow-call-stack
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=shadow-call-stack -fsanitize-coverage=trace-pc-guard -fno-sanitize=shadow-call-stack %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-SAFESTACK
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=shadow-call-stack -fsanitize-coverage=trace-pc-guard -fno-sanitize=shadow-call-stack %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-SAFESTACK
// CHECK-NO-SHADOWCALLSTACK-NOT: error:
// CHECK-NO-SHADOWCALLSTACK-NOT: warning:
// CHECK-NO-SHADOWCALLSTACK-NOT: argument unused
diff --git a/clang/test/Driver/fsanitize-ignorelist.c b/clang/test/Driver/fsanitize-ignorelist.c
index c4669e5..7dd666a 100644
--- a/clang/test/Driver/fsanitize-ignorelist.c
+++ b/clang/test/Driver/fsanitize-ignorelist.c
@@ -11,37 +11,37 @@
// RUN: echo "fun:bar" > %t.second
// RUN: echo "badline" > %t.bad
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-ignorelist=%t.good -fsanitize-ignorelist=%t.second %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-IGNORELIST
-// RUN: %clang -target aarch64-linux-gnu -fsanitize=hwaddress -fsanitize-ignorelist=%t.good -fsanitize-ignorelist=%t.second %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-IGNORELIST
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-ignorelist=%t.good -fsanitize-ignorelist=%t.second %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-IGNORELIST
+// RUN: %clang --target=aarch64-linux-gnu -fsanitize=hwaddress -fsanitize-ignorelist=%t.good -fsanitize-ignorelist=%t.second %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-IGNORELIST
// CHECK-IGNORELIST: -fsanitize-ignorelist={{.*}}.good" "-fsanitize-ignorelist={{.*}}.second
// Check that the default ignorelist is not added as an extra dependency.
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -resource-dir=%S/Inputs/resource_dir %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT-IGNORELIST-ASAN --implicit-check-not=fdepfile-entry --implicit-check-not=-fsanitize-ignorelist=
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -resource-dir=%S/Inputs/resource_dir %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT-IGNORELIST-ASAN --implicit-check-not=fdepfile-entry --implicit-check-not=-fsanitize-ignorelist=
// CHECK-DEFAULT-IGNORELIST-ASAN: -fsanitize-system-ignorelist={{.*[^w]}}asan_ignorelist.txt
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=hwaddress -resource-dir=%S/Inputs/resource_dir %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT-IGNORELIST-HWASAN --implicit-check-not=fdepfile-entry --implicit-check-not=-fsanitize-ignorelist=
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=hwaddress -resource-dir=%S/Inputs/resource_dir %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT-IGNORELIST-HWASAN --implicit-check-not=fdepfile-entry --implicit-check-not=-fsanitize-ignorelist=
// CHECK-DEFAULT-IGNORELIST-HWASAN: -fsanitize-system-ignorelist={{.*}}hwasan_ignorelist.txt
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=integer -resource-dir=%S/Inputs/resource_dir %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT-UBSAN-IGNORELIST --implicit-check-not=fdepfile-entry --implicit-check-not=-fsanitize-ignorelist=
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=nullability -resource-dir=%S/Inputs/resource_dir %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT-UBSAN-IGNORELIST --implicit-check-not=fdepfile-entry --implicit-check-not=-fsanitize-ignorelist=
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=undefined -resource-dir=%S/Inputs/resource_dir %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT-UBSAN-IGNORELIST --implicit-check-not=fdepfile-entry --implicit-check-not=-fsanitize-ignorelist=
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=alignment -resource-dir=%S/Inputs/resource_dir %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT-UBSAN-IGNORELIST --implicit-check-not=fdepfile-entry --implicit-check-not=-fsanitize-ignorelist=
-// RUN: %clang -target %itanium_abi_triple -fsanitize=float-divide-by-zero -resource-dir=%S/Inputs/resource_dir %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT-UBSAN-IGNORELIST --implicit-check-not=fdepfile-entry --implicit-check-not=-fsanitize-ignorelist=
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=integer -resource-dir=%S/Inputs/resource_dir %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT-UBSAN-IGNORELIST --implicit-check-not=fdepfile-entry --implicit-check-not=-fsanitize-ignorelist=
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=nullability -resource-dir=%S/Inputs/resource_dir %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT-UBSAN-IGNORELIST --implicit-check-not=fdepfile-entry --implicit-check-not=-fsanitize-ignorelist=
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -resource-dir=%S/Inputs/resource_dir %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT-UBSAN-IGNORELIST --implicit-check-not=fdepfile-entry --implicit-check-not=-fsanitize-ignorelist=
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=alignment -resource-dir=%S/Inputs/resource_dir %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT-UBSAN-IGNORELIST --implicit-check-not=fdepfile-entry --implicit-check-not=-fsanitize-ignorelist=
+// RUN: %clang --target=%itanium_abi_triple -fsanitize=float-divide-by-zero -resource-dir=%S/Inputs/resource_dir %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT-UBSAN-IGNORELIST --implicit-check-not=fdepfile-entry --implicit-check-not=-fsanitize-ignorelist=
// CHECK-DEFAULT-UBSAN-IGNORELIST: -fsanitize-system-ignorelist={{.*}}ubsan_ignorelist.txt
// Check that combining ubsan and another sanitizer results in both ignorelists being used.
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=undefined,address -resource-dir=%S/Inputs/resource_dir %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT-UBSAN-IGNORELIST --check-prefix=CHECK-DEFAULT-IGNORELIST-ASAN --implicit-check-not=fdepfile-entry --implicit-check-not=-fsanitize-ignorelist=
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined,address -resource-dir=%S/Inputs/resource_dir %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT-UBSAN-IGNORELIST --check-prefix=CHECK-DEFAULT-IGNORELIST-ASAN --implicit-check-not=fdepfile-entry --implicit-check-not=-fsanitize-ignorelist=
// Ignore -fsanitize-ignorelist flag if there is no -fsanitize flag.
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-ignorelist=%t.good %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-SANITIZE --check-prefix=DELIMITERS
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-ignorelist=%t.good %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-SANITIZE --check-prefix=DELIMITERS
// CHECK-NO-SANITIZE-NOT: -fsanitize-ignorelist
// Ignore -fsanitize-ignorelist flag if there is no -fsanitize flag.
// Now, check for the absence of -fdepfile-entry flags.
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-ignorelist=%t.good %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-SANITIZE2 --check-prefix=DELIMITERS
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-ignorelist=%t.good %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-SANITIZE2 --check-prefix=DELIMITERS
// CHECK-NO-SANITIZE2-NOT: -fdepfile-entry
// Flag -fno-sanitize-ignorelist wins if it is specified later.
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-ignorelist=%t.good -fno-sanitize-ignorelist %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IGNORELIST --check-prefix=DELIMITERS
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-ignorelist=%t.good -fno-sanitize-ignorelist %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IGNORELIST --check-prefix=DELIMITERS
// CHECK-NO-IGNORELIST-NOT: -fsanitize-ignorelist
// Driver barks on nonexistent ignorelist files.
@@ -53,13 +53,13 @@
// CHECK-BAD-IGNORELIST: error: malformed sanitizer ignorelist: 'error parsing file '{{.*}}.bad': malformed line 1: 'badline''
// -fno-sanitize-ignorelist disables all ignorelists specified earlier.
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-ignorelist=%t.good -fno-sanitize-ignorelist -fsanitize-ignorelist=%t.second %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ONLY-FIRST-DISABLED --implicit-check-not=-fsanitize-ignorelist=
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-ignorelist=%t.good -fno-sanitize-ignorelist -fsanitize-ignorelist=%t.second %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ONLY-FIRST-DISABLED --implicit-check-not=-fsanitize-ignorelist=
// CHECK-ONLY_FIRST-DISABLED-NOT: good
// CHECK-ONLY-FIRST-DISABLED: -fsanitize-ignorelist={{.*}}.second
// CHECK-ONLY_FIRST-DISABLED-NOT: good
// -fno-sanitize-ignorelist disables the system ignorelists.
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fno-sanitize-ignorelist %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DISABLED-SYSTEM --check-prefix=DELIMITERS
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fno-sanitize-ignorelist %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DISABLED-SYSTEM --check-prefix=DELIMITERS
// CHECK-DISABLED-SYSTEM-NOT: -fsanitize-system-ignorelist
// If cfi_ignorelist.txt cannot be found in the resource dir, driver should fail.
@@ -67,7 +67,7 @@
// CHECK-MISSING-CFI-IGNORELIST: error: missing sanitizer ignorelist: '{{.*}}cfi_ignorelist.txt'
// -fno-sanitize-ignorelist disables checking for cfi_ignorelist.txt in the resource dir.
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=cfi -flto -fvisibility=default -fno-sanitize-ignorelist -resource-dir=/dev/null %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-MISSING-CFI-NO-IGNORELIST
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=cfi -flto -fvisibility=default -fno-sanitize-ignorelist -resource-dir=/dev/null %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-MISSING-CFI-NO-IGNORELIST
// CHECK-MISSING-CFI-NO-IGNORELIST-NOT: error: no such file or directory: '{{.*}}cfi_ignorelist.txt'
// DELIMITERS: {{^ *"}}
diff --git a/clang/test/Driver/fsanitize-memory-param-retval.c b/clang/test/Driver/fsanitize-memory-param-retval.c
index 79ade32..99d8cb7 100644
--- a/clang/test/Driver/fsanitize-memory-param-retval.c
+++ b/clang/test/Driver/fsanitize-memory-param-retval.c
@@ -1,14 +1,14 @@
-// RUN: %clang -target i386-gnu-linux %s -fsanitize=memory -fno-sanitize-memory-param-retval -c -### 2>&1 | FileCheck %s
-// RUN: %clang -target x86_64-linux-gnu %s -fsanitize=memory -fno-sanitize-memory-param-retval -c -### 2>&1 | FileCheck %s
-// RUN: %clang -target aarch64-linux-gnu %s -fsanitize=memory -fno-sanitize-memory-param-retval -c -### 2>&1 | FileCheck %s
-// RUN: %clang -target riscv32-linux-gnu %s -fsanitize=memory -fno-sanitize-memory-param-retval -c -### 2>&1 | FileCheck %s
-// RUN: %clang -target riscv64-linux-gnu %s -fsanitize=memory -fno-sanitize-memory-param-retval -c -### 2>&1 | FileCheck %s
-// RUN: %clang -target x86_64-linux-gnu %s -fsanitize=kernel-memory -fno-sanitize-memory-param-retval -c -### 2>&1 | FileCheck %s
+// RUN: %clang --target=i386-gnu-linux %s -fsanitize=memory -fno-sanitize-memory-param-retval -c -### 2>&1 | FileCheck %s
+// RUN: %clang --target=x86_64-linux-gnu %s -fsanitize=memory -fno-sanitize-memory-param-retval -c -### 2>&1 | FileCheck %s
+// RUN: %clang --target=aarch64-linux-gnu %s -fsanitize=memory -fno-sanitize-memory-param-retval -c -### 2>&1 | FileCheck %s
+// RUN: %clang --target=riscv32-linux-gnu %s -fsanitize=memory -fno-sanitize-memory-param-retval -c -### 2>&1 | FileCheck %s
+// RUN: %clang --target=riscv64-linux-gnu %s -fsanitize=memory -fno-sanitize-memory-param-retval -c -### 2>&1 | FileCheck %s
+// RUN: %clang --target=x86_64-linux-gnu %s -fsanitize=kernel-memory -fno-sanitize-memory-param-retval -c -### 2>&1 | FileCheck %s
// CHECK: "-fno-sanitize-memory-param-retval"
-// RUN: %clang -target aarch64-linux-gnu -fsyntax-only %s -fsanitize=memory -fno-sanitize-memory-param-retval -c -### 2>&1 | FileCheck --check-prefix=11 %s
+// RUN: %clang --target=aarch64-linux-gnu -fsyntax-only %s -fsanitize=memory -fno-sanitize-memory-param-retval -c -### 2>&1 | FileCheck --check-prefix=11 %s
// 11: "-fno-sanitize-memory-param-retval"
-// RUN: not %clang -target x86_64-linux-gnu -fsyntax-only %s -fsanitize=memory -fno-sanitize-memory-param-retval=1 2>&1 | FileCheck --check-prefix=EXCESS %s
+// RUN: not %clang --target=x86_64-linux-gnu -fsyntax-only %s -fsanitize=memory -fno-sanitize-memory-param-retval=1 2>&1 | FileCheck --check-prefix=EXCESS %s
// EXCESS: error: unknown argument: '-fno-sanitize-memory-param-retval=
diff --git a/clang/test/Driver/fsanitize-metadata-ignorelist.c b/clang/test/Driver/fsanitize-metadata-ignorelist.c
index 65a45cc..ad5f4be 100644
--- a/clang/test/Driver/fsanitize-metadata-ignorelist.c
+++ b/clang/test/Driver/fsanitize-metadata-ignorelist.c
@@ -3,12 +3,12 @@
// RUN: echo "fun:foo" > %t.1
// RUN: echo "fun:bar" > %t.2
-// RUN: %clang -target x86_64-linux-gnu -fexperimental-sanitize-metadata=all -fexperimental-sanitize-metadata-ignorelist=%t.1 -fexperimental-sanitize-metadata-ignorelist=%t.2 %s -### 2>&1 | FileCheck %s
-// RUN: %clang -target aarch64-linux-gnu -fexperimental-sanitize-metadata=atomics -fexperimental-sanitize-metadata-ignorelist=%t.1 -fexperimental-sanitize-metadata-ignorelist=%t.2 %s -### 2>&1 | FileCheck %s
+// RUN: %clang --target=x86_64-linux-gnu -fexperimental-sanitize-metadata=all -fexperimental-sanitize-metadata-ignorelist=%t.1 -fexperimental-sanitize-metadata-ignorelist=%t.2 %s -### 2>&1 | FileCheck %s
+// RUN: %clang --target=aarch64-linux-gnu -fexperimental-sanitize-metadata=atomics -fexperimental-sanitize-metadata-ignorelist=%t.1 -fexperimental-sanitize-metadata-ignorelist=%t.2 %s -### 2>&1 | FileCheck %s
// CHECK: "-fexperimental-sanitize-metadata-ignorelist={{.*}}.1" "-fexperimental-sanitize-metadata-ignorelist={{.*}}.2"
// Verify -fsanitize-metadata-ignorelist flag not passed if there is no -fsanitize-metadata flag.
-// RUN: %clang -target x86_64-linux-gnu -fexperimental-sanitize-metadata-ignorelist=%t.1 -fexperimental-sanitize-metadata-ignorelist=%t.2 %s -### 2>&1 | FileCheck %s --check-prefix=NOSANMD
-// RUN: %clang -target aarch64-linux-gnu -fexperimental-sanitize-metadata-ignorelist=%t.1 -fexperimental-sanitize-metadata-ignorelist=%t.2 %s -### 2>&1 | FileCheck %s --check-prefix=NOSANMD
+// RUN: %clang --target=x86_64-linux-gnu -fexperimental-sanitize-metadata-ignorelist=%t.1 -fexperimental-sanitize-metadata-ignorelist=%t.2 %s -### 2>&1 | FileCheck %s --check-prefix=NOSANMD
+// RUN: %clang --target=aarch64-linux-gnu -fexperimental-sanitize-metadata-ignorelist=%t.1 -fexperimental-sanitize-metadata-ignorelist=%t.2 %s -### 2>&1 | FileCheck %s --check-prefix=NOSANMD
// NOSANMD: warning: argument unused during compilation: '-fexperimental-sanitize-metadata-ignorelist
// NOSANMD-NOT: "-fexperimental-sanitize-metadata-ignorelist
diff --git a/clang/test/Driver/fsanitize-object-size.c b/clang/test/Driver/fsanitize-object-size.c
index 50c6783..78c7202 100644
--- a/clang/test/Driver/fsanitize-object-size.c
+++ b/clang/test/Driver/fsanitize-object-size.c
@@ -1,27 +1,27 @@
// Check that the object size check is disabled at -O0.
//
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=object-size %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OSIZE
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=object-size %s -O0 -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OSIZE
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=null,object-size %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OSIZE
-// RUN: %clang -target x86_64-linux-gnu -Werror -fsanitize=null,object-size %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OSIZE
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=undefined %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OSIZE-NO-WARNING
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=object-size %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OSIZE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=object-size %s -O0 -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OSIZE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=null,object-size %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OSIZE
+// RUN: %clang --target=x86_64-linux-gnu -Werror -fsanitize=null,object-size %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OSIZE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OSIZE-NO-WARNING
// Check that the object size check is enabled at other optimization levels.
//
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=undefined -O1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=object-size -O2 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=object-size -O3 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=object-size -O4 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=object-size -Ofast %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=object-size -Os %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=object-size -Oz %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=object-size -Og %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -O1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=object-size -O2 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=object-size -O3 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=object-size -O4 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=object-size -Ofast %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=object-size -Os %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=object-size -Oz %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=object-size -Og %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
// Use of trap mode shouldn't affect the object size check.
//
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined -O1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=undefined-trap -O1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=undefined-trap -fsanitize-undefined-trap-on-error -O1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined -O1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined-trap -O1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined-trap -fsanitize-undefined-trap-on-error -O1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
// CHECK-HAS-OSIZE-NOT: warning: the object size sanitizer
// CHECK-HAS-OSIZE: -fsanitize={{[^ ]*}}object-size
diff --git a/clang/test/Driver/fsemantic-interposition.c b/clang/test/Driver/fsemantic-interposition.c
index 0ee0dbb..aaa4487 100644
--- a/clang/test/Driver/fsemantic-interposition.c
+++ b/clang/test/Driver/fsemantic-interposition.c
@@ -1,20 +1,20 @@
-// RUN: %clang --sysroot=%S/Inputs -target x86_64 %s -Werror -fpic -fsemantic-interposition -c -### 2>&1 | FileCheck %s
-// RUN: %clang --sysroot=%S/Inputs -target x86_64 %s -Werror -fPIC -fsemantic-interposition -c -### 2>&1 | FileCheck %s
+// RUN: %clang --sysroot=%S/Inputs --target=x86_64 %s -Werror -fpic -fsemantic-interposition -c -### 2>&1 | FileCheck %s
+// RUN: %clang --sysroot=%S/Inputs --target=x86_64 %s -Werror -fPIC -fsemantic-interposition -c -### 2>&1 | FileCheck %s
// CHECK: "-fsemantic-interposition"
/// No-op for -fno-pic/-fpie.
-// RUN: %clang --sysroot=%S/Inputs -target x86_64 %s -Werror -fsemantic-interposition -c -### 2>&1 | FileCheck --check-prefix=NOOP %s
-// RUN: %clang --sysroot=%S/Inputs -target x86_64 %s -Werror -fPIE -fsemantic-interposition -c -### 2>&1 | FileCheck --check-prefix=NOOP %s
+// RUN: %clang --sysroot=%S/Inputs --target=x86_64 %s -Werror -fsemantic-interposition -c -### 2>&1 | FileCheck --check-prefix=NOOP %s
+// RUN: %clang --sysroot=%S/Inputs --target=x86_64 %s -Werror -fPIE -fsemantic-interposition -c -### 2>&1 | FileCheck --check-prefix=NOOP %s
// NOOP-NOT: "-fsemantic-interposition"
// NOOP-NOT: "-fno-semantic-interposition"
/// If -fno-semantic-interposition is specified and the target supports local
/// aliases, neither CC1 option is set.
-// RUN: %clang --sysroot=%S/Inputs -target aarch64 %s -Werror -fPIC -fno-semantic-interposition -c -### 2>&1 | FileCheck --check-prefix=NO %s
-// RUN: %clang --sysroot=%S/Inputs -target riscv32 %s -Werror -fPIC -fno-semantic-interposition -c -### 2>&1 | FileCheck --check-prefix=NO %s
-// RUN: %clang --sysroot=%S/Inputs -target riscv64 %s -Werror -fPIC -fno-semantic-interposition -c -### 2>&1 | FileCheck --check-prefix=NO %s
-// RUN: %clang --sysroot=%S/Inputs -target i386 %s -Werror -fPIC -fno-semantic-interposition -c -### 2>&1 | FileCheck --check-prefix=NO %s
-// RUN: %clang --sysroot=%S/Inputs -target x86_64 %s -Werror -fPIC -fno-semantic-interposition -c -### 2>&1 | FileCheck --check-prefix=NO %s
+// RUN: %clang --sysroot=%S/Inputs --target=aarch64 %s -Werror -fPIC -fno-semantic-interposition -c -### 2>&1 | FileCheck --check-prefix=NO %s
+// RUN: %clang --sysroot=%S/Inputs --target=riscv32 %s -Werror -fPIC -fno-semantic-interposition -c -### 2>&1 | FileCheck --check-prefix=NO %s
+// RUN: %clang --sysroot=%S/Inputs --target=riscv64 %s -Werror -fPIC -fno-semantic-interposition -c -### 2>&1 | FileCheck --check-prefix=NO %s
+// RUN: %clang --sysroot=%S/Inputs --target=i386 %s -Werror -fPIC -fno-semantic-interposition -c -### 2>&1 | FileCheck --check-prefix=NO %s
+// RUN: %clang --sysroot=%S/Inputs --target=x86_64 %s -Werror -fPIC -fno-semantic-interposition -c -### 2>&1 | FileCheck --check-prefix=NO %s
// NO-NOT: "-fsemantic-interposition"
// NO-NOT: "-fhalf-no-semantic-interposition"
@@ -23,8 +23,8 @@
/// local aliases, use the traditional half-baked behavior: interprocedural
/// optimizations are allowed but local aliases are not used. If references are
/// not optimized out, semantic interposition at runtime is possible.
-// RUN: %clang --sysroot=%S/Inputs -target ppc64le %s -Werror -fPIC -fno-semantic-interposition -c -### 2>&1 | FileCheck --check-prefix=HALF %s
+// RUN: %clang --sysroot=%S/Inputs --target=ppc64le %s -Werror -fPIC -fno-semantic-interposition -c -### 2>&1 | FileCheck --check-prefix=HALF %s
-// RUN: %clang --sysroot=%S/Inputs -target x86_64 %s -Werror -fPIC -c -### 2>&1 | FileCheck --check-prefix=HALF %s
+// RUN: %clang --sysroot=%S/Inputs --target=x86_64 %s -Werror -fPIC -c -### 2>&1 | FileCheck --check-prefix=HALF %s
//
// HALF: "-fhalf-no-semantic-interposition"
diff --git a/clang/test/Driver/fsjlj-exceptions.c b/clang/test/Driver/fsjlj-exceptions.c
index fd16a51..122513f 100644
--- a/clang/test/Driver/fsjlj-exceptions.c
+++ b/clang/test/Driver/fsjlj-exceptions.c
@@ -1,6 +1,6 @@
// RUN: %clang -target armv7-apple-ios -fexceptions -c %s -o /dev/null -### 2>&1 | FileCheck -check-prefix CHECK-IOS %s
-// RUN: %clang -target i686-windows-gnu -fexceptions -c %s -o /dev/null -### 2>&1 | FileCheck -check-prefix CHECK-MINGW-DEFAULT %s
-// RUN: %clang -target i686-windows-gnu -fexceptions -fsjlj-exceptions -c %s -o /dev/null -### 2>&1 | FileCheck -check-prefix CHECK-MINGW-SJLJ %s
+// RUN: %clang --target=i686-windows-gnu -fexceptions -c %s -o /dev/null -### 2>&1 | FileCheck --check-prefix=CHECK-MINGW-DEFAULT %s
+// RUN: %clang --target=i686-windows-gnu -fexceptions -fsjlj-exceptions -c %s -o /dev/null -### 2>&1 | FileCheck --check-prefix=CHECK-MINGW-SJLJ %s
// CHECK-IOS: -exception-model=sjlj
// CHECK-MINGW-DEFAULT-NOT: -exception-model=sjlj
diff --git a/clang/test/Driver/fuse-ld-windows.c b/clang/test/Driver/fuse-ld-windows.c
index 089f296..8a5af61c 100644
--- a/clang/test/Driver/fuse-ld-windows.c
+++ b/clang/test/Driver/fuse-ld-windows.c
@@ -1,23 +1,23 @@
// REQUIRES: system-windows
// We used to require adding ".exe" suffix when cross-compiling on Windows.
-// RUN: %clang %s -### -o %t.o -target i386-unknown-linux \
+// RUN: %clang %s -### -o %t.o --target=i386-unknown-linux \
// RUN: -B %S/Inputs/fuse_ld_windows -fuse-ld=foo 2>&1 \
// RUN: | FileCheck %s
// Check that the old variant still works.
-// RUN: %clang %s -### -o %t.o -target i386-unknown-linux \
+// RUN: %clang %s -### -o %t.o --target=i386-unknown-linux \
// RUN: -B %S/Inputs/fuse_ld_windows -fuse-ld=foo.exe 2>&1 \
// RUN: | FileCheck %s
// With the full path, the extension can be omitted, too,
// because Windows allows that.
-// RUN: %clang %s -### -o %t.o -target i386-unknown-linux \
+// RUN: %clang %s -### -o %t.o --target=i386-unknown-linux \
// RUN: -fuse-ld=%S/Inputs/fuse_ld_windows/ld.foo 2>&1 \
// RUN: | FileCheck %s
// Check that the full path with the extension works too.
-// RUN: %clang %s -### -o %t.o -target i386-unknown-linux \
+// RUN: %clang %s -### -o %t.o --target=i386-unknown-linux \
// RUN: -fuse-ld=%S/Inputs/fuse_ld_windows/ld.foo.exe 2>&1 \
// RUN: | FileCheck %s
diff --git a/clang/test/Driver/fuse-ld.c b/clang/test/Driver/fuse-ld.c
index ef2f8c9..f807434 100644
--- a/clang/test/Driver/fuse-ld.c
+++ b/clang/test/Driver/fuse-ld.c
@@ -15,88 +15,88 @@
// CHECK-NO-WARN-NOT: warning:
// RUN: %clang %s -### \
-// RUN: -target x86_64-unknown-freebsd 2>&1 \
+// RUN: --target=x86_64-unknown-freebsd 2>&1 \
// RUN: | FileCheck %s --check-prefix=CHECK-FREEBSD-LD
// CHECK-FREEBSD-LD: ld
// RUN: %clang %s -### -fuse-ld=bfd \
// RUN: --sysroot=%S/Inputs/basic_freebsd_tree \
-// RUN: -target x86_64-unknown-freebsd \
+// RUN: --target=x86_64-unknown-freebsd \
// RUN: -B%S/Inputs/basic_freebsd_tree/usr/bin 2>&1 \
// RUN: | FileCheck %s -check-prefix=CHECK-FREEBSD-BFD
// CHECK-FREEBSD-BFD: Inputs/basic_freebsd_tree/usr/bin{{/|\\+}}ld.bfd
// RUN: %clang %s -### -fuse-ld=gold \
// RUN: --sysroot=%S/Inputs/basic_freebsd_tree \
-// RUN: -target x86_64-unknown-freebsd \
+// RUN: --target=x86_64-unknown-freebsd \
// RUN: -B%S/Inputs/basic_freebsd_tree/usr/bin 2>&1 \
// RUN: | FileCheck %s -check-prefix=CHECK-FREEBSD-GOLD
// CHECK-FREEBSD-GOLD: Inputs/basic_freebsd_tree/usr/bin{{/|\\+}}ld.gold
// RUN: not %clang %s -### -fuse-ld=plib \
// RUN: --sysroot=%S/Inputs/basic_freebsd_tree \
-// RUN: -target x86_64-unknown-freebsd \
+// RUN: --target=x86_64-unknown-freebsd \
// RUN: -B%S/Inputs/basic_freebsd_tree/usr/bin 2>&1 \
// RUN: | FileCheck %s -check-prefix=CHECK-FREEBSD-PLIB
// CHECK-FREEBSD-PLIB: error: invalid linker name
// RUN: %clang %s -### -fuse-ld=ld \
-// RUN: -target arm-linux-androideabi \
+// RUN: --target=arm-linux-androideabi \
// RUN: -B%S/Inputs/basic_android_tree/bin/arm-linux-androideabi- 2>&1 \
// RUN: | FileCheck %s --check-prefix=CHECK-ANDROID-ARM-LD
// CHECK-ANDROID-ARM-LD: ld.lld
// RUN: %clang %s -### -fuse-ld=bfd \
-// RUN: -target arm-linux-androideabi \
+// RUN: --target=arm-linux-androideabi \
// RUN: -B%S/Inputs/basic_android_tree/bin/arm-linux-androideabi- 2>&1 \
// RUN: | FileCheck %s -check-prefix=CHECK-ANDROID-ARM-BFD
// CHECK-ANDROID-ARM-BFD: Inputs/basic_android_tree/bin{{/|\\+}}arm-linux-androideabi-ld.bfd
// RUN: %clang %s -### -fuse-ld=gold \
-// RUN: -target arm-linux-androideabi \
+// RUN: --target=arm-linux-androideabi \
// RUN: -B%S/Inputs/basic_android_tree/bin/arm-linux-androideabi- 2>&1 \
// RUN: | FileCheck %s -check-prefix=CHECK-ANDROID-ARM-GOLD
// CHECK-ANDROID-ARM-GOLD: Inputs/basic_android_tree/bin{{/|\\+}}arm-linux-androideabi-ld.gold
// RUN: %clang %s -### -fuse-ld=ld \
-// RUN: -target arm-linux-androideabi \
+// RUN: --target=arm-linux-androideabi \
// RUN: --gcc-toolchain=%S/Inputs/basic_android_tree 2>&1 \
// RUN: | FileCheck %s --check-prefix=CHECK-ANDROID-ARM-LD-TC
// CHECK-ANDROID-ARM-LD-TC: ld.lld
// RUN: %clang %s -### -fuse-ld=bfd \
-// RUN: -target arm-linux-androideabi \
+// RUN: --target=arm-linux-androideabi \
// RUN: --gcc-toolchain=%S/Inputs/basic_android_tree 2>&1 \
// RUN: | FileCheck %s -check-prefix=CHECK-ANDROID-ARM-BFD-TC
// CHECK-ANDROID-ARM-BFD-TC: Inputs/basic_android_tree/lib/gcc/arm-linux-androideabi/4.4.3/../../../../arm-linux-androideabi/bin{{/|\\+}}ld.bfd
// RUN: %clang %s -### -fuse-ld=gold \
-// RUN: -target arm-linux-androideabi \
+// RUN: --target=arm-linux-androideabi \
// RUN: --gcc-toolchain=%S/Inputs/basic_android_tree 2>&1 \
// RUN: | FileCheck %s -check-prefix=CHECK-ANDROID-ARM-GOLD-TC
// CHECK-ANDROID-ARM-GOLD-TC: Inputs/basic_android_tree/lib/gcc/arm-linux-androideabi/4.4.3/../../../../arm-linux-androideabi/bin{{/|\\+}}ld.gold
// RUN: %clang %s -### -fuse-ld=link \
-// RUN: -target i686-unknown-windows-msvc 2>&1 \
+// RUN: --target=i686-unknown-windows-msvc 2>&1 \
// RUN: | FileCheck %s --check-prefix CHECK-WINDOWS-MSVC-LINK
// CHECK-WINDOWS-MSVC-LINK: "{{.*}}link.exe"
// CHECK-WINDOWS-MSVC-LINK-SAME: "-out:{{.*}}"
// RUN: %clang %s -### -fuse-ld=lld \
-// RUN: -target i686-unknown-windows-msvc 2>&1 \
+// RUN: --target=i686-unknown-windows-msvc 2>&1 \
// RUN: | FileCheck %s --check-prefix CHECK-WINDOWS-MSVC-LLD
// CHECK-WINDOWS-MSVC-LLD: "{{.*}}lld-link{{\.exe"|"}}
// CHECK-WINDOWS-MSVC-LLD-SAME: "-out:{{.*}}"
// RUN: %clang %s -### -fuse-ld=lld-link \
-// RUN: -target i686-unknown-windows-msvc 2>&1 \
+// RUN: --target=i686-unknown-windows-msvc 2>&1 \
// RUN: | FileCheck %s --check-prefix CHECK-WINDOWS-MSVC-LLD-LINK
// CHECK-WINDOWS-MSVC-LLD-LINK: "{{.*}}lld-link{{\.exe"|"}}
// CHECK-WINDOWS-MSVC-LLD-LINK-SAME: "-out:{{.*}}"
// RUN: %clang %s -### -fuse-ld=bfd \
-// RUN: -target i686-unknown-windows-msvc \
+// RUN: --target=i686-unknown-windows-msvc \
// RUN: -B %S/Inputs/Windows/usr/bin 2>&1 \
// RUN: | FileCheck %s --check-prefix CHECK-WINDOWS-MSVC-BFD
// CHECK-WINDOWS-MSVC-BFD: "{{.*}}ld.bfd"
diff --git a/clang/test/Driver/fuzzer.c b/clang/test/Driver/fuzzer.c
index 14caf76..409fbfac 100644
--- a/clang/test/Driver/fuzzer.c
+++ b/clang/test/Driver/fuzzer.c
@@ -8,7 +8,7 @@
// CHECK-COVERAGE-SAME: -fsanitize-coverage-pc-table
// CHECK-FUZZER-LIB: libclang_rt.fuzzer
-// RUN: %clang -fsanitize=fuzzer -target i386-unknown-linux -stdlib=platform %s -### 2>&1 | FileCheck --check-prefixes=CHECK-LIBCXX-LINUX %s
+// RUN: %clang -fsanitize=fuzzer --target=i386-unknown-linux -stdlib=platform %s -### 2>&1 | FileCheck --check-prefixes=CHECK-LIBCXX-LINUX %s
//
// CHECK-LIBCXX-LINUX: -lstdc++
@@ -29,18 +29,18 @@
// Check that we respect whether the standard library should be linked
// statically.
//
-// RUN: %clang -fsanitize=fuzzer -target i386-unknown-linux -stdlib=libstdc++ %s -### 2>&1 | FileCheck --check-prefixes=CHECK-LIBSTDCXX-DYNAMIC %s
+// RUN: %clang -fsanitize=fuzzer --target=i386-unknown-linux -stdlib=libstdc++ %s -### 2>&1 | FileCheck --check-prefixes=CHECK-LIBSTDCXX-DYNAMIC %s
// CHECK-LIBSTDCXX-DYNAMIC-NOT: -Bstatic
// CHECK-LIBSTDCXX-DYNAMIC: -lstdc++
//
-// RUN: %clang -fsanitize=fuzzer -target i386-unknown-linux -stdlib=libstdc++ -static-libstdc++ %s -### 2>&1 | FileCheck --check-prefixes=CHECK-LIBSTDCXX-STATIC %s
+// RUN: %clang -fsanitize=fuzzer --target=i386-unknown-linux -stdlib=libstdc++ -static-libstdc++ %s -### 2>&1 | FileCheck --check-prefixes=CHECK-LIBSTDCXX-STATIC %s
// CHECK-LIBSTDCXX-STATIC: "-Bstatic" "-lstdc++"
//
-// RUN: %clang -fsanitize=fuzzer -target i386-unknown-linux -stdlib=libc++ %s -### 2>&1 | FileCheck --check-prefixes=CHECK-LIBCXX-DYNAMIC %s
+// RUN: %clang -fsanitize=fuzzer --target=i386-unknown-linux -stdlib=libc++ %s -### 2>&1 | FileCheck --check-prefixes=CHECK-LIBCXX-DYNAMIC %s
// CHECK-LIBCXX-DYNAMIC-NOT: -Bstatic
// CHECK-LIBCXX-DYNAMIC: -lc++
//
-// RUN: %clang -fsanitize=fuzzer -target i386-unknown-linux -stdlib=libc++ -static-libstdc++ %s -### 2>&1 | FileCheck --check-prefixes=CHECK-LIBCXX-STATIC %s
+// RUN: %clang -fsanitize=fuzzer --target=i386-unknown-linux -stdlib=libc++ -static-libstdc++ %s -### 2>&1 | FileCheck --check-prefixes=CHECK-LIBCXX-STATIC %s
// CHECK-LIBCXX-STATIC: "-Bstatic" "-lc++"
int LLVMFuzzerTestOneInput(const char *Data, long Size) {
diff --git a/clang/test/Driver/fveclib.c b/clang/test/Driver/fveclib.c
index 8a23028..9b0f1ce 100644
--- a/clang/test/Driver/fveclib.c
+++ b/clang/test/Driver/fveclib.c
@@ -1,11 +1,11 @@
-// RUN: %clang -### -c -fveclib=none %s 2>&1 | FileCheck -check-prefix CHECK-NOLIB %s
-// RUN: %clang -### -c -fveclib=Accelerate %s 2>&1 | FileCheck -check-prefix CHECK-ACCELERATE %s
-// RUN: %clang -### -c -fveclib=libmvec %s 2>&1 | FileCheck -check-prefix CHECK-libmvec %s
-// RUN: %clang -### -c -fveclib=MASSV %s 2>&1 | FileCheck -check-prefix CHECK-MASSV %s
-// RUN: %clang -### -c -fveclib=Darwin_libsystem_m %s 2>&1 | FileCheck -check-prefix CHECK-DARWIN_LIBSYSTEM_M %s
-// RUN: %clang -### -c --target=aarch64-none-none -fveclib=SLEEF %s 2>&1 | FileCheck -check-prefix CHECK-SLEEF %s
-// RUN: %clang -### -c --target=aarch64-none-none -fveclib=ArmPL %s 2>&1 | FileCheck -check-prefix CHECK-ARMPL %s
-// RUN: not %clang -c -fveclib=something %s 2>&1 | FileCheck -check-prefix CHECK-INVALID %s
+// RUN: %clang -### -c -fveclib=none %s 2>&1 | FileCheck --check-prefix=CHECK-NOLIB %s
+// RUN: %clang -### -c -fveclib=Accelerate %s 2>&1 | FileCheck --check-prefix=CHECK-ACCELERATE %s
+// RUN: %clang -### -c -fveclib=libmvec %s 2>&1 | FileCheck --check-prefix=CHECK-libmvec %s
+// RUN: %clang -### -c -fveclib=MASSV %s 2>&1 | FileCheck --check-prefix=CHECK-MASSV %s
+// RUN: %clang -### -c -fveclib=Darwin_libsystem_m %s 2>&1 | FileCheck --check-prefix=CHECK-DARWIN_LIBSYSTEM_M %s
+// RUN: %clang -### -c --target=aarch64 -fveclib=SLEEF %s 2>&1 | FileCheck --check-prefix=CHECK-SLEEF %s
+// RUN: %clang -### -c --target=aarch64 -fveclib=ArmPL %s 2>&1 | FileCheck --check-prefix=CHECK-ARMPL %s
+// RUN: not %clang -c -fveclib=something %s 2>&1 | FileCheck --check-prefix=CHECK-INVALID %s
// CHECK-NOLIB: "-fveclib=none"
// CHECK-ACCELERATE: "-fveclib=Accelerate"
@@ -17,10 +17,10 @@
// CHECK-INVALID: error: invalid value 'something' in '-fveclib=something'
-// RUN: not %clang --target=x86-none-none -c -fveclib=SLEEF %s 2>&1 | FileCheck -check-prefix CHECK-ERROR %s
-// RUN: not %clang --target=x86-none-none -c -fveclib=ArmPL %s 2>&1 | FileCheck -check-prefix CHECK-ERROR %s
-// RUN: not %clang --target=aarch64-none-none -c -fveclib=LIBMVEC-X86 %s 2>&1 | FileCheck -check-prefix CHECK-ERROR %s
-// RUN: not %clang --target=aarch64-none-none -c -fveclib=SVML %s 2>&1 | FileCheck -check-prefix CHECK-ERROR %s
+// RUN: not %clang --target=x86 -c -fveclib=SLEEF %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s
+// RUN: not %clang --target=x86 -c -fveclib=ArmPL %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s
+// RUN: not %clang --target=aarch64 -c -fveclib=LIBMVEC-X86 %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s
+// RUN: not %clang --target=aarch64 -c -fveclib=SVML %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s
// CHECK-ERROR: unsupported option {{.*}} for target
// RUN: %clang -fveclib=Accelerate %s -target arm64-apple-ios8.0.0 -### 2>&1 | FileCheck --check-prefix=CHECK-LINK %s
@@ -35,17 +35,17 @@
/* Verify that the correct vector library is passed to LTO flags. */
-// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fveclib=LIBMVEC -flto %s 2>&1 | FileCheck -check-prefix CHECK-LTO-LIBMVEC %s
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fveclib=LIBMVEC -flto %s 2>&1 | FileCheck --check-prefix=CHECK-LTO-LIBMVEC %s
// CHECK-LTO-LIBMVEC: "-plugin-opt=-vector-library=LIBMVEC-X86"
-// RUN: %clang -### --target=powerpc64-unknown-linux-gnu -fveclib=MASSV -flto %s 2>&1 | FileCheck -check-prefix CHECK-LTO-MASSV %s
+// RUN: %clang -### --target=powerpc64-unknown-linux-gnu -fveclib=MASSV -flto %s 2>&1 | FileCheck --check-prefix=CHECK-LTO-MASSV %s
// CHECK-LTO-MASSV: "-plugin-opt=-vector-library=MASSV"
-// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fveclib=SVML -flto %s 2>&1 | FileCheck -check-prefix CHECK-LTO-SVML %s
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fveclib=SVML -flto %s 2>&1 | FileCheck --check-prefix=CHECK-LTO-SVML %s
// CHECK-LTO-SVML: "-plugin-opt=-vector-library=SVML"
-// RUN: %clang -### --target=aarch64-linux-gnu -fveclib=SLEEF -flto %s 2>&1 | FileCheck -check-prefix CHECK-LTO-SLEEF %s
+// RUN: %clang -### --target=aarch64-linux-gnu -fveclib=SLEEF -flto %s 2>&1 | FileCheck --check-prefix=CHECK-LTO-SLEEF %s
// CHECK-LTO-SLEEF: "-plugin-opt=-vector-library=sleefgnuabi"
-// RUN: %clang -### --target=aarch64-linux-gnu -fveclib=ArmPL -flto %s 2>&1 | FileCheck -check-prefix CHECK-LTO-ARMPL %s
+// RUN: %clang -### --target=aarch64-linux-gnu -fveclib=ArmPL -flto %s 2>&1 | FileCheck --check-prefix=CHECK-LTO-ARMPL %s
// CHECK-LTO-ARMPL: "-plugin-opt=-vector-library=ArmPL"
diff --git a/clang/test/Driver/loongarch-mlasx-error.c b/clang/test/Driver/loongarch-mlasx-error.c
index e66f277..1d88f0f 100644
--- a/clang/test/Driver/loongarch-mlasx-error.c
+++ b/clang/test/Driver/loongarch-mlasx-error.c
@@ -11,5 +11,5 @@
// RUN: not %clang --target=loongarch64 %s -fsyntax-only -mlasx -mno-lsx 2>&1 \
// RUN: FileCheck --check-prefix=ERROR_LASX_FPU128 %s
-// ERROR_LASX_FPU64: error: wrong fpu width; LASX depends on 64-bit FPU.
-// ERROR_LASX_FPU128: error: invalid option combination; LASX depends on LSX.
+// ERROR_LASX_FPU64: error: wrong fpu width; LASX depends on 64-bit FPU
+// ERROR_LASX_FPU128: error: invalid option combination; LASX depends on LSX
diff --git a/clang/test/Driver/loongarch-mlsx-error.c b/clang/test/Driver/loongarch-mlsx-error.c
index bd6b8e2..db1f6fb 100644
--- a/clang/test/Driver/loongarch-mlsx-error.c
+++ b/clang/test/Driver/loongarch-mlsx-error.c
@@ -9,4 +9,4 @@
// RUN: not %clang --target=loongarch64 %s -fsyntax-only -mlsx -mfpu=none 2>&1 \
// RUN: FileCheck --check-prefix=ERROR_LSX_FPU64 %s
-// ERROR_LSX_FPU64: error: wrong fpu width; LSX depends on 64-bit FPU.
+// ERROR_LSX_FPU64: error: wrong fpu width; LSX depends on 64-bit FPU
diff --git a/clang/test/Driver/ms-define-stdc.c b/clang/test/Driver/ms-define-stdc.c
new file mode 100644
index 0000000..d5e873d
--- /dev/null
+++ b/clang/test/Driver/ms-define-stdc.c
@@ -0,0 +1,11 @@
+// Note: %s must be preceded by --, otherwise it may be interpreted as a
+// command-line option, e.g. on Mac where %s is commonly under /Users.
+//
+// Note: see also cl-zc.cpp
+
+// RUN: %clang_cl /TC /dev/null /E -Xclang -dM /Zc:__STDC__- 2>&1 | FileCheck %s --check-prefix=ZCSTDCIGNORED
+// ZCSTDCIGNORED-NOT: #define __STDC__ 1
+// ZCSTDCIGNORED: argument unused during compilation
+
+// RUN: not %clang -Xclang -fno-ms-define-stdc %s 2>&1 | FileCheck %s --check-prefix="NOARG"
+// NOARG: error: unknown argument: '-fno-ms-define-stdc'
diff --git a/clang/test/Driver/openmp-offload-infer.c b/clang/test/Driver/openmp-offload-infer.c
index 5033329..388860a 100644
--- a/clang/test/Driver/openmp-offload-infer.c
+++ b/clang/test/Driver/openmp-offload-infer.c
@@ -43,7 +43,7 @@
// RUN: --offload-arch=sm_70 --offload-arch=gfx908 --offload-arch=skylake \
// RUN: -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-FAILED
-// CHECK-FAILED: error: failed to deduce triple for target architecture 'skylake'; specify the triple using '-fopenmp-targets' and '-Xopenmp-target' instead.
+// CHECK-FAILED: error: failed to deduce triple for target architecture 'skylake'; specify the triple using '-fopenmp-targets' and '-Xopenmp-target' instead
// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -fopenmp=libomp \
// RUN: --offload-arch=sm_70 --offload-arch=gfx908 -fno-openmp \
diff --git a/clang/test/Driver/openmp-system-arch.c b/clang/test/Driver/openmp-system-arch.c
index 4e024e6..a48c1e7 100644
--- a/clang/test/Driver/openmp-system-arch.c
+++ b/clang/test/Driver/openmp-system-arch.c
@@ -31,7 +31,7 @@
// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -nogpulib -fopenmp=libomp --offload-arch= \
// RUN: --nvptx-arch-tool=%t/nvptx_arch_empty --amdgpu-arch-tool=%t/amdgpu_arch_empty %s 2>&1 \
// RUN: | FileCheck %s --check-prefix=NO-OUTPUT-ERROR
-// NO-OUTPUT-ERROR: error: failed to deduce triple for target architecture 'native'; specify the triple using '-fopenmp-targets' and '-Xopenmp-target' instead.
+// NO-OUTPUT-ERROR: error: failed to deduce triple for target architecture 'native'; specify the triple using '-fopenmp-targets' and '-Xopenmp-target' instead
// case when amdgpu-arch succeeds.
// RUN: %clang -### --target=x86_64-unknown-linux-gnu -nogpulib -fopenmp=libomp --offload-arch=native \
diff --git a/clang/test/Driver/tocdata-cc1.c b/clang/test/Driver/tocdata-cc1.c
index fe0d97e..e00383d 100644
--- a/clang/test/Driver/tocdata-cc1.c
+++ b/clang/test/Driver/tocdata-cc1.c
@@ -1,16 +1,13 @@
// RUN: %clang -### --target=powerpc-ibm-aix-xcoff -mcmodel=medium -mtocdata %s 2>&1 \
-// RUN: | FileCheck -check-prefix=CHECK-NOTOC %s
+// RUN: | FileCheck %s
// RUN: %clang -### --target=powerpc-ibm-aix-xcoff -mcmodel=large -mtocdata %s 2>&1 \
-// RUN: | FileCheck -check-prefix=CHECK-NOTOC %s
+// RUN: | FileCheck %s
// RUN: %clang -### --target=powerpc-ibm-aix-xcoff -mtocdata %s 2>&1 \
-// RUN: | FileCheck -check-prefix=CHECK-TOC %s
+// RUN: | FileCheck %s
// RUN: %clang -### --target=powerpc64-ibm-aix-xcoff -mcmodel=medium -mtocdata %s 2>&1 \
-// RUN: | FileCheck -check-prefix=CHECK-NOTOC %s
+// RUN: | FileCheck %s
// RUN: %clang -### --target=powerpc64-ibm-aix-xcoff -mcmodel=large -mtocdata %s 2>&1 \
-// RUN: | FileCheck -check-prefix=CHECK-NOTOC %s
+// RUN: | FileCheck %s
// RUN: %clang -### --target=powerpc64-ibm-aix-xcoff -mtocdata %s 2>&1 \
-// RUN: | FileCheck -check-prefix=CHECK-TOC %s
-// CHECK-NOTOC: warning: ignoring '-mtocdata' as it is only supported for -mcmodel=small
-// CHECK-NOTOC-NOT: "-cc1"{{.*}}" "-mtocdata"
-// CHECK-TOC: "-cc1"{{.*}}" "-mtocdata"
-// CHECK-TOC-NOT: warning: ignoring '-mtocdata' as it is only supported for -mcmodel=small
+// RUN: | FileCheck %s
+// CHECK: "-cc1"{{.*}}" "-mtocdata"
diff --git a/clang/test/Driver/x-args.c b/clang/test/Driver/x-args.c
index 17bb5d9..06c9c7a 100644
--- a/clang/test/Driver/x-args.c
+++ b/clang/test/Driver/x-args.c
@@ -6,6 +6,4 @@
// RUN: %clang -fsyntax-only %s -xc %s -xc++ -fsyntax-only 2>&1 | FileCheck %s
// CHECK: '-x c++' after last input file has no effect
-// RUN: not %clang_cl /WX /clang:-xc /clang:-E /clang:-dM -- %s 2>&1 | FileCheck --implicit-check-not="error:" -check-prefix=CL %s
-// RUN: not %clang_cl /TC /WX /clang:-xc /clang:-E /clang:-dM -- %s 2>&1 | FileCheck --implicit-check-not="error:" -check-prefix=CL %s
-// CL: error: unsupported option '-x c'; did you mean '/TC' or '/TP'?
+// RUN: %clang_cl -fsyntax-only /WX -xc++ -- %s
diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c
index 25f8f66..1d5f001 100644
--- a/clang/test/Driver/x86-target-features.c
+++ b/clang/test/Driver/x86-target-features.c
@@ -21,10 +21,10 @@
// SSE4-AES: "-target-feature" "+sse4.2" "-target-feature" "+aes"
// NO-SSE4-AES: "-target-feature" "-sse4.1" "-target-feature" "-aes"
-// RUN: %clang --target=i386 -march=i386 -mavx -mavx2 -mavx512f -mavx512cd -mavx512er -mavx512pf -mavx512dq -mavx512bw -mavx512vl -mavx512vbmi -mavx512vbmi2 -mavx512ifma %s -### 2>&1 | FileCheck -check-prefix=AVX %s
-// RUN: %clang --target=i386 -march=i386 -mno-avx -mno-avx2 -mno-avx512f -mno-avx512cd -mno-avx512er -mno-avx512pf -mno-avx512dq -mno-avx512bw -mno-avx512vl -mno-avx512vbmi -mno-avx512vbmi2 -mno-avx512ifma %s -### 2>&1 | FileCheck -check-prefix=NO-AVX %s
-// AVX: "-target-feature" "+avx" "-target-feature" "+avx2" "-target-feature" "+avx512f" "-target-feature" "+avx512cd" "-target-feature" "+avx512er" "-target-feature" "+avx512pf" "-target-feature" "+avx512dq" "-target-feature" "+avx512bw" "-target-feature" "+avx512vl" "-target-feature" "+avx512vbmi" "-target-feature" "+avx512vbmi2" "-target-feature" "+avx512ifma"
-// NO-AVX: "-target-feature" "-avx" "-target-feature" "-avx2" "-target-feature" "-avx512f" "-target-feature" "-avx512cd" "-target-feature" "-avx512er" "-target-feature" "-avx512pf" "-target-feature" "-avx512dq" "-target-feature" "-avx512bw" "-target-feature" "-avx512vl" "-target-feature" "-avx512vbmi" "-target-feature" "-avx512vbmi2" "-target-feature" "-avx512ifma"
+// RUN: %clang --target=i386 -march=i386 -mavx -mavx2 -mavx512f -mavx512cd -mavx512dq -mavx512bw -mavx512vl -mavx512vbmi -mavx512vbmi2 -mavx512ifma %s -### 2>&1 | FileCheck -check-prefix=AVX %s
+// RUN: %clang --target=i386 -march=i386 -mno-avx -mno-avx2 -mno-avx512f -mno-avx512cd -mno-avx512dq -mno-avx512bw -mno-avx512vl -mno-avx512vbmi -mno-avx512vbmi2 -mno-avx512ifma %s -### 2>&1 | FileCheck -check-prefix=NO-AVX %s
+// AVX: "-target-feature" "+avx" "-target-feature" "+avx2" "-target-feature" "+avx512f" "-target-feature" "+avx512cd" "-target-feature" "+avx512dq" "-target-feature" "+avx512bw" "-target-feature" "+avx512vl" "-target-feature" "+avx512vbmi" "-target-feature" "+avx512vbmi2" "-target-feature" "+avx512ifma"
+// NO-AVX: "-target-feature" "-avx" "-target-feature" "-avx2" "-target-feature" "-avx512f" "-target-feature" "-avx512cd" "-target-feature" "-avx512dq" "-target-feature" "-avx512bw" "-target-feature" "-avx512vl" "-target-feature" "-avx512vbmi" "-target-feature" "-avx512vbmi2" "-target-feature" "-avx512ifma"
// RUN: %clang --target=i386 -march=i386 -mpclmul -mrdrnd -mfsgsbase -mbmi -mbmi2 %s -### 2>&1 | FileCheck -check-prefix=BMI %s
// RUN: %clang --target=i386 -march=i386 -mno-pclmul -mno-rdrnd -mno-fsgsbase -mno-bmi -mno-bmi2 %s -### 2>&1 | FileCheck -check-prefix=NO-BMI %s
@@ -86,11 +86,6 @@
// SGX: "-target-feature" "+sgx"
// NO-SGX: "-target-feature" "-sgx"
-// RUN: %clang --target=i386 -march=i386 -mprefetchwt1 %s -### 2>&1 | FileCheck -check-prefix=PREFETCHWT1 %s
-// RUN: %clang --target=i386 -march=i386 -mno-prefetchwt1 %s -### 2>&1 | FileCheck -check-prefix=NO-PREFETCHWT1 %s
-// PREFETCHWT1: "-target-feature" "+prefetchwt1"
-// NO-PREFETCHWT1: "-target-feature" "-prefetchwt1"
-
// RUN: %clang --target=i386 -march=i386 -mprefetchi %s -### -o %t.o 2>&1 | FileCheck -check-prefix=PREFETCHI %s
// RUN: %clang --target=i386 -march=i386 -mno-prefetchi %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-PREFETCHI %s
// PREFETCHI: "-target-feature" "+prefetchi"
diff --git a/clang/test/ExtractAPI/non_type_template.cpp b/clang/test/ExtractAPI/non_type_template.cpp
index 4e65eb7..85f38e3 100644
--- a/clang/test/ExtractAPI/non_type_template.cpp
+++ b/clang/test/ExtractAPI/non_type_template.cpp
@@ -310,4 +310,48 @@ NestedTemplateTemplateParamPack<Bar, Bar> var;
// VAR-NEXT: }
// VAR-NEXT: ]
+template <typename T>
+class TypeContainer {
+ public:
+ // RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix TYPE
+ typedef Foo<T> Type;
+// TYPE-LABEL: "!testLabel": "c:non_type_template.cpp@ST>1#T@TypeContainer@T@Type",
+// TYPE: "declarationFragments": [
+// TYPE-NEXT: {
+// TYPE-NEXT: "kind": "keyword",
+// TYPE-NEXT: "spelling": "typedef"
+// TYPE-NEXT: },
+// TYPE-NEXT: {
+// TYPE-NEXT: "kind": "text",
+// TYPE-NEXT: "spelling": " "
+// TYPE-NEXT: },
+// TYPE-NEXT: {
+// TYPE-NEXT: "kind": "typeIdentifier",
+// TYPE-NEXT: "preciseIdentifier": "c:@ST>2#T#NI@Foo",
+// TYPE-NEXT: "spelling": "Foo"
+// TYPE-NEXT: },
+// TYPE-NEXT: {
+// TYPE-NEXT: "kind": "text",
+// TYPE-NEXT: "spelling": "<"
+// TYPE-NEXT: },
+// TYPE-NEXT: {
+// TYPE-NEXT: "kind": "typeIdentifier",
+// TYPE-NEXT: "preciseIdentifier": "c:t0.0",
+// TYPE-NEXT: "spelling": "T"
+// TYPE-NEXT: },
+// TYPE-NEXT: {
+// TYPE-NEXT: "kind": "text",
+// TYPE-NEXT: "spelling": "> "
+// TYPE-NEXT: },
+// TYPE-NEXT: {
+// TYPE-NEXT: "kind": "identifier",
+// TYPE-NEXT: "spelling": "Type"
+// TYPE-NEXT: },
+// TYPE-NEXT: {
+// TYPE-NEXT: "kind": "text",
+// TYPE-NEXT: "spelling": ";"
+// TYPE-NEXT: }
+// TYPE-NEXT: ]
+};
+
// expected-no-diagnostics
diff --git a/clang/test/Frontend/optimization-remark-options.c b/clang/test/Frontend/optimization-remark-options.c
index 96e480d..357273a 100644
--- a/clang/test/Frontend/optimization-remark-options.c
+++ b/clang/test/Frontend/optimization-remark-options.c
@@ -1,7 +1,7 @@
// REQUIRES: x86-registered-target
// RUN: %clang -O1 -fvectorize -target x86_64-unknown-unknown -mllvm -vectorize-memory-check-threshold=8 -Rpass-analysis=loop-vectorize -emit-llvm -S %s -o - 2>&1 | FileCheck %s
-// CHECK: {{.*}}:10:11: remark: loop not vectorized: cannot prove it is safe to reorder floating-point operations; allow reordering by specifying '#pragma clang loop vectorize(enable)' before the loop or by providing the compiler option '-ffast-math'.
+// CHECK: {{.*}}:10:11: remark: loop not vectorized: cannot prove it is safe to reorder floating-point operations; allow reordering by specifying '#pragma clang loop vectorize(enable)' before the loop or by providing the compiler option '-ffast-math'
double foo(int N) {
double v = 0.0;
@@ -12,7 +12,7 @@ double foo(int N) {
return v;
}
-// CHECK: {{.*}}:18:3: remark: loop not vectorized: cannot prove it is safe to reorder memory operations; allow reordering by specifying '#pragma clang loop vectorize(enable)' before the loop. If the arrays will always be independent specify '#pragma clang loop vectorize(assume_safety)' before the loop or provide the '__restrict__' qualifier with the independent array arguments. Erroneous results will occur if these options are incorrectly applied!
+// CHECK: {{.*}}:18:3: remark: loop not vectorized: cannot prove it is safe to reorder memory operations; allow reordering by specifying '#pragma clang loop vectorize(enable)' before the loop; if the arrays will always be independent, specify '#pragma clang loop vectorize(assume_safety)' before the loop or provide the '__restrict__' qualifier with the independent array arguments -- erroneous results will occur if these options are incorrectly applied
void foo2(int *dw, int *uw, int *A, int *B, int *C, int *D, int N) {
for (long i = 0; i < N; i++) {
diff --git a/clang/test/Frontend/x86-target-cpu.c b/clang/test/Frontend/x86-target-cpu.c
index 6b99b2c..6c8502a 100644
--- a/clang/test/Frontend/x86-target-cpu.c
+++ b/clang/test/Frontend/x86-target-cpu.c
@@ -15,14 +15,8 @@
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu cannonlake -verify %s
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu icelake-client -verify %s
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu icelake-server -verify %s
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu knl -verify=knl %s
-// knl-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}}
-// knl-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}}
-// knl-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}}
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu knm -verify=knm %s
-// knm-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}}
-// knm-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}}
-// knm-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}}
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu knl -verify %s
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu knm -verify %s
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu bonnell -verify %s
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu silvermont -verify %s
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu k8 -verify %s
diff --git a/clang/test/InstallAPI/binary-attributes.test b/clang/test/InstallAPI/binary-attributes.test
index b28e99f..fd9ff12 100644
--- a/clang/test/InstallAPI/binary-attributes.test
+++ b/clang/test/InstallAPI/binary-attributes.test
@@ -30,13 +30,13 @@
; RUN: -install_name /System/Library/Frameworks/Simple.framework/Versions/A/Simple \
; RUN: -current_version 1.2.3 -compatibility_version 1 -fapplication-extension \
; RUN: -o tmp.tbd --verify-against=%t/Simple 2>&1 | FileCheck -check-prefix=APPEXTSAFE %s
-; APPEXTSAFE: error: ApplicationExtensionSafe flag does not match: 'true' (provided) vs 'false' (found)
+; APPEXTSAFE: error: the ApplicationExtensionSafe flag does not match: 'true' (provided) vs 'false' (found)
; RUN: not clang-installapi -target x86_64-apple-macos10.12 \
; RUN: -install_name /System/Library/Frameworks/Simple.framework/Versions/A/Simple \
; RUN: -current_version 1.2.3 -compatibility_version 1 -not_for_dyld_shared_cache \
; RUN: -o tmp.tbd --verify-against=%t/Simple 2>&1 | FileCheck -check-prefix=SHARED_CACHE %s
-; SHARED_CACHE: error: NotForDyldSharedCache flag does not match: 'true' (provided) vs 'false' (found)
+; SHARED_CACHE: error: the NotForDyldSharedCache flag does not match: 'true' (provided) vs 'false' (found)
; RUN: not clang-installapi -target x86_64-apple-macos10.12 \
; RUN: -install_name /System/Library/Frameworks/Simple.framework/Versions/A/Simple \
diff --git a/clang/test/Lexer/cxx-features.cpp b/clang/test/Lexer/cxx-features.cpp
index 41550cf..4c2aa3ae 100644
--- a/clang/test/Lexer/cxx-features.cpp
+++ b/clang/test/Lexer/cxx-features.cpp
@@ -1,17 +1,17 @@
// RUN: %clang_cc1 -std=c++98 -fcxx-exceptions -verify %s
// RUN: %clang_cc1 -std=c++11 -fcxx-exceptions -verify %s
-// RUN: %clang_cc1 -std=c++14 -fcxx-exceptions -fsized-deallocation -verify %s
-// RUN: %clang_cc1 -std=c++17 -fcxx-exceptions -fsized-deallocation -verify %s
-// RUN: %clang_cc1 -std=c++20 -fcxx-exceptions -fsized-deallocation -verify %s
-// RUN: %clang_cc1 -std=c++23 -fcxx-exceptions -fsized-deallocation -verify %s
-// RUN: %clang_cc1 -std=c++2c -fcxx-exceptions -fsized-deallocation -verify %s
+// RUN: %clang_cc1 -std=c++14 -fcxx-exceptions -verify %s
+// RUN: %clang_cc1 -std=c++17 -fcxx-exceptions -verify %s
+// RUN: %clang_cc1 -std=c++20 -fcxx-exceptions -verify %s
+// RUN: %clang_cc1 -std=c++23 -fcxx-exceptions -verify %s
+// RUN: %clang_cc1 -std=c++2c -fcxx-exceptions -verify %s
//
-// RUN: %clang_cc1 -std=c++17 -fcxx-exceptions -fsized-deallocation -fno-relaxed-template-template-args -DNO_RELAXED_TEMPLATE_TEMPLATE_ARGS=1 -verify %s
-// RUN: %clang_cc1 -std=c++17 -fcxx-exceptions -fsized-deallocation -DCONCEPTS_TS=1 -verify %s
-// RUN: %clang_cc1 -std=c++14 -fno-rtti -fno-threadsafe-statics -verify %s -DNO_EXCEPTIONS -DNO_RTTI -DNO_THREADSAFE_STATICS -fsized-deallocation
-// RUN: %clang_cc1 -std=c++14 -fchar8_t -DNO_EXCEPTIONS -DCHAR8_T -verify -fsized-deallocation %s
-// RUN: %clang_cc1 -std=c++2a -fno-char8_t -DNO_EXCEPTIONS -DNO_CHAR8_T -verify -fsized-deallocation %s
+// RUN: %clang_cc1 -std=c++17 -fcxx-exceptions -fno-relaxed-template-template-args -DNO_RELAXED_TEMPLATE_TEMPLATE_ARGS=1 -verify %s
+// RUN: %clang_cc1 -std=c++17 -fcxx-exceptions -DCONCEPTS_TS=1 -verify %s
+// RUN: %clang_cc1 -std=c++14 -fno-rtti -fno-threadsafe-statics -verify %s -DNO_EXCEPTIONS -DNO_RTTI -DNO_THREADSAFE_STATICS
+// RUN: %clang_cc1 -std=c++14 -fchar8_t -DNO_EXCEPTIONS -DCHAR8_T -verify %s
+// RUN: %clang_cc1 -std=c++2a -fno-char8_t -DNO_EXCEPTIONS -DNO_CHAR8_T -verify %s
// expected-no-diagnostics
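
The RUN-line change above drops -fsized-deallocation, presumably because sized deallocation is now enabled by default for C++14 and later in this revision, so these feature-test checks no longer need the explicit flag. A minimal sketch of what the feature controls, under that assumption (the operator replacements below are illustrative and not taken from the test):

    #include <cstdio>
    #include <cstdlib>
    #include <new>

    // Replace global new/delete so the sized call is observable.
    void *operator new(std::size_t n) {
      if (void *p = std::malloc(n))
        return p;
      throw std::bad_alloc();
    }

    void operator delete(void *p) noexcept { std::free(p); } // unsized fallback

    // With sized deallocation enabled (the assumed new default), `delete p` on a
    // complete type prefers this overload and receives the object size.
    void operator delete(void *p, std::size_t n) noexcept {
      std::printf("releasing %zu bytes\n", n);
      std::free(p);
    }

    int main() {
      int *p = new int(42);
      delete p; // dispatches to the sized overload when -fsized-deallocation is in effect
    }
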
diff --git a/clang/test/Misc/diag-template-diffing.cpp b/clang/test/Misc/diag-template-diffing-cxx11.cpp
index eefeb0b..eefeb0b 100644
--- a/clang/test/Misc/diag-template-diffing.cpp
+++ b/clang/test/Misc/diag-template-diffing-cxx11.cpp
diff --git a/clang/test/Misc/diag-template-diffing-cxx26.cpp b/clang/test/Misc/diag-template-diffing-cxx26.cpp
new file mode 100644
index 0000000..2b6dd86
--- /dev/null
+++ b/clang/test/Misc/diag-template-diffing-cxx26.cpp
@@ -0,0 +1,49 @@
+// RUN: %clang_cc1 -fsyntax-only %s -std=c++26 -verify=expected,notree
+// RUN: %clang_cc1 -fsyntax-only %s -std=c++26 -fno-elide-type -verify=expected,notree
+// RUN: %clang_cc1 -fsyntax-only %s -std=c++26 -fdiagnostics-show-template-tree -verify=expected,tree
+// RUN: %clang_cc1 -fsyntax-only %s -std=c++26 -fno-elide-type -fdiagnostics-show-template-tree -verify=expected,tree
+
+namespace GH93068 {
+ int n[2];
+
+ template <auto> struct A {}; // #A
+
+ namespace t1 {
+ // notree-error@#1 {{no viable conversion from 'A<0>' to 'A<n + 1>'}}
+
+ /* tree-error@#1 {{no viable conversion
+ A<
+ [0 != n + 1]>}}*/
+
+ A<n + 1> v1 = A<0>(); // #1
+ // expected-note@#A {{no known conversion from 'A<0>' to 'const A<&n[1]> &' for 1st argument}}
+ // expected-note@#A {{no known conversion from 'A<0>' to 'A<&n[1]> &&' for 1st argument}}
+
+ // notree-error@#2 {{no viable conversion from 'A<n>' to 'A<n + 1>'}}
+ /* tree-error@#2 {{no viable conversion
+ A<
+ [n != n + 1]>}}*/
+
+ A<n + 1> v2 = A<n>(); // #2
+ // expected-note@#A {{no known conversion from 'A<n>' to 'const A<&n[1]> &' for 1st argument}}
+ // expected-note@#A {{no known conversion from 'A<n>' to 'A<&n[1]> &&' for 1st argument}}
+ } // namespace t1
+
+ namespace t2 {
+ A<n> v1;
+ A<n + 1> v2;
+
+ // notree-note@#A {{no known conversion from 'A<n>' to 'const A<(no argument)>' for 1st argument}}
+ // notree-note@#A {{no known conversion from 'A<n>' to 'A<(no argument)>' for 1st argument}}
+
+ /* tree-note@#A {{no known conversion from argument type to parameter type for 1st argument
+ [(no qualifiers) != const] A<
+ [n != (no argument)]>}}*/
+
+ /* tree-note@#A {{no known conversion from argument type to parameter type for 1st argument
+ A<
+ [n != (no argument)]>}}*/
+
+ void f() { v2 = v1; } // expected-error {{no viable overloaded '='}}
+ } // namespace t2
+} // namespace GH93068
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index fd0e6d7..9973269 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -63,7 +63,6 @@
// CHECK-NEXT: CoroOnlyDestroyWhenComplete (SubjectMatchRule_record)
// CHECK-NEXT: CoroReturnType (SubjectMatchRule_record)
// CHECK-NEXT: CoroWrapper (SubjectMatchRule_function)
-// CHECK-NEXT: CountedBy (SubjectMatchRule_field)
// CHECK-NEXT: DLLExport (SubjectMatchRule_function, SubjectMatchRule_variable, SubjectMatchRule_record, SubjectMatchRule_objc_interface)
// CHECK-NEXT: DLLImport (SubjectMatchRule_function, SubjectMatchRule_variable, SubjectMatchRule_record, SubjectMatchRule_objc_interface)
// CHECK-NEXT: Destructor (SubjectMatchRule_function)
diff --git a/clang/test/Modules/implicit-module-remap.cpp b/clang/test/Modules/implicit-module-remap.cpp
new file mode 100644
index 0000000..47927b9
--- /dev/null
+++ b/clang/test/Modules/implicit-module-remap.cpp
@@ -0,0 +1,21 @@
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+// RUN: cd %t
+//
+// RUN: %clang_cc1 -fmodules -fmodule-map-file=module.modulemap -fmodules-cache-path=%t -remap-file "test.cpp;%t/test.cpp" %t/test.cpp
+
+//--- a.h
+#define FOO
+
+//--- module.modulemap
+module a {
+ header "a.h"
+}
+
+//--- test.cpp
+#include "a.h"
+
+#ifndef FOO
+#error foo
+#endif
+
diff --git a/clang/test/OpenMP/assumes_codegen.cpp b/clang/test/OpenMP/assumes_codegen.cpp
index 4a2518a..4206e5a 100644
--- a/clang/test/OpenMP/assumes_codegen.cpp
+++ b/clang/test/OpenMP/assumes_codegen.cpp
@@ -67,46 +67,46 @@ int lambda_outer() {
}
#pragma omp end assumes
-// AST: void foo() __attribute__((assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses"))) __attribute__((assume("omp_no_openmp"))) {
-// AST-NEXT: }
-// AST-NEXT: class BAR {
-// AST-NEXT: public:
-// AST-NEXT: __attribute__((assume("ompx_range_bar_only"))) __attribute__((assume("ompx_range_bar_only_2"))) __attribute__((assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses"))) __attribute__((assume("omp_no_openmp"))) BAR() {
-// AST-NEXT: }
-// AST-NEXT: __attribute__((assume("ompx_range_bar_only"))) __attribute__((assume("ompx_range_bar_only_2"))) __attribute__((assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses"))) __attribute__((assume("omp_no_openmp"))) void bar1() {
-// AST-NEXT: }
-// AST-NEXT: __attribute__((assume("ompx_range_bar_only"))) __attribute__((assume("ompx_range_bar_only_2"))) __attribute__((assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses"))) __attribute__((assume("omp_no_openmp"))) static void bar2() {
-// AST-NEXT: }
-// AST-NEXT: };
-// AST-NEXT: __attribute__((assume("ompx_range_bar_only"))) __attribute__((assume("ompx_range_bar_only_2"))) __attribute__((assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses"))) __attribute__((assume("omp_no_openmp"))) void bar() {
-// AST-NEXT: BAR b;
-// AST-NEXT: }
-// AST-NEXT: __attribute__((assume("ompx_1234"))) __attribute__((assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses"))) __attribute__((assume("omp_no_openmp"))) void baz();
-// AST-NEXT: template <typename T> class BAZ {
-// AST-NEXT: public:
-// AST-NEXT: __attribute__((assume("ompx_1234"))) __attribute__((assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses"))) __attribute__((assume("omp_no_openmp"))) BAZ<T>() {
-// AST-NEXT: }
-// AST-NEXT: __attribute__((assume("ompx_1234"))) __attribute__((assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses"))) __attribute__((assume("omp_no_openmp"))) void baz1() {
-// AST-NEXT: }
-// AST-NEXT: __attribute__((assume("ompx_1234"))) __attribute__((assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses"))) __attribute__((assume("omp_no_openmp"))) static void baz2() {
-// AST-NEXT: }
-// AST-NEXT: };
-// AST-NEXT: template<> class BAZ<float> {
-// AST-NEXT: public:
-// AST-NEXT: __attribute__((assume("ompx_1234"))) __attribute__((assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses"))) __attribute__((assume("omp_no_openmp"))) __attribute__((assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses"))) __attribute__((assume("omp_no_openmp"))) BAZ() {
-// AST-NEXT: }
-// AST-NEXT: __attribute__((assume("ompx_1234"))) __attribute__((assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses"))) __attribute__((assume("omp_no_openmp"))) __attribute__((assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses"))) __attribute__((assume("omp_no_openmp"))) void baz1();
-// AST-NEXT: __attribute__((assume("ompx_1234"))) __attribute__((assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses"))) __attribute__((assume("omp_no_openmp"))) __attribute__((assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses"))) __attribute__((assume("omp_no_openmp"))) static void baz2();
-// AST-NEXT: };
-// AST-NEXT: __attribute__((assume("ompx_1234"))) __attribute__((assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses"))) __attribute__((assume("omp_no_openmp"))) void baz() {
-// AST-NEXT: BAZ<float> b;
-// AST-NEXT: }
-// AST-NEXT: __attribute__((assume("ompx_lambda_assumption"))) __attribute__((assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses"))) __attribute__((assume("omp_no_openmp"))) int lambda_outer() {
-// AST-NEXT: auto lambda_inner = []() {
-// AST-NEXT: return 42;
-// AST-NEXT: };
-// AST-NEXT: return lambda_inner();
-// AST-NEXT: }
+// AST{LITERAL}: void foo() [[omp::assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses")]] [[omp::assume("omp_no_openmp")]] {
+// AST-NEXT{LITERAL}: }
+// AST-NEXT{LITERAL}: class BAR {
+// AST-NEXT{LITERAL}: public:
+// AST-NEXT{LITERAL}: [[omp::assume("ompx_range_bar_only")]] [[omp::assume("ompx_range_bar_only_2")]] [[omp::assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses")]] [[omp::assume("omp_no_openmp")]] BAR() {
+// AST-NEXT{LITERAL}: }
+// AST-NEXT{LITERAL}: [[omp::assume("ompx_range_bar_only")]] [[omp::assume("ompx_range_bar_only_2")]] [[omp::assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses")]] [[omp::assume("omp_no_openmp")]] void bar1() {
+// AST-NEXT{LITERAL}: }
+// AST-NEXT{LITERAL}: [[omp::assume("ompx_range_bar_only")]] [[omp::assume("ompx_range_bar_only_2")]] [[omp::assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses")]] [[omp::assume("omp_no_openmp")]] static void bar2() {
+// AST-NEXT{LITERAL}: }
+// AST-NEXT{LITERAL}: };
+// AST-NEXT{LITERAL}: [[omp::assume("ompx_range_bar_only")]] [[omp::assume("ompx_range_bar_only_2")]] [[omp::assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses")]] [[omp::assume("omp_no_openmp")]] void bar() {
+// AST-NEXT{LITERAL}: BAR b;
+// AST-NEXT{LITERAL}: }
+// AST-NEXT{LITERAL}: [[omp::assume("ompx_1234")]] [[omp::assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses")]] [[omp::assume("omp_no_openmp")]] void baz();
+// AST-NEXT{LITERAL}: template <typename T> class BAZ {
+// AST-NEXT{LITERAL}: public:
+// AST-NEXT{LITERAL}: [[omp::assume("ompx_1234")]] [[omp::assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses")]] [[omp::assume("omp_no_openmp")]] BAZ<T>() {
+// AST-NEXT{LITERAL}: }
+// AST-NEXT{LITERAL}: [[omp::assume("ompx_1234")]] [[omp::assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses")]] [[omp::assume("omp_no_openmp")]] void baz1() {
+// AST-NEXT{LITERAL}: }
+// AST-NEXT{LITERAL}: [[omp::assume("ompx_1234")]] [[omp::assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses")]] [[omp::assume("omp_no_openmp")]] static void baz2() {
+// AST-NEXT{LITERAL}: }
+// AST-NEXT{LITERAL}: };
+// AST-NEXT{LITERAL}: template<> class BAZ<float> {
+// AST-NEXT{LITERAL}: public:
+// AST-NEXT{LITERAL}: [[omp::assume("ompx_1234")]] [[omp::assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses")]] [[omp::assume("omp_no_openmp")]] [[omp::assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses")]] [[omp::assume("omp_no_openmp")]] BAZ() {
+// AST-NEXT{LITERAL}: }
+// AST-NEXT{LITERAL}: [[omp::assume("ompx_1234")]] [[omp::assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses")]] [[omp::assume("omp_no_openmp")]] [[omp::assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses")]] [[omp::assume("omp_no_openmp")]] void baz1();
+// AST-NEXT{LITERAL}: [[omp::assume("ompx_1234")]] [[omp::assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses")]] [[omp::assume("omp_no_openmp")]] [[omp::assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses")]] [[omp::assume("omp_no_openmp")]] static void baz2();
+// AST-NEXT{LITERAL}: };
+// AST-NEXT{LITERAL}: [[omp::assume("ompx_1234")]] [[omp::assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses")]] [[omp::assume("omp_no_openmp")]] void baz() {
+// AST-NEXT{LITERAL}: BAZ<float> b;
+// AST-NEXT{LITERAL}: }
+// AST-NEXT{LITERAL}: [[omp::assume("ompx_lambda_assumption")]] [[omp::assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses")]] [[omp::assume("omp_no_openmp")]] int lambda_outer() {
+// AST-NEXT{LITERAL}: auto lambda_inner = []() {
+// AST-NEXT{LITERAL}: return 42;
+// AST-NEXT{LITERAL}: };
+// AST-NEXT{LITERAL}: return lambda_inner();
+// AST-NEXT{LITERAL}: }
#endif
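
The CHECK-line rewrites above (and in the assumes_print / assumes_template_print tests that follow) track a change in how Clang prints OpenMP assumption attributes: the AST printer now emits the C++11 spelling [[omp::assume("...")]] instead of __attribute__((assume("..."))), and the {LITERAL} FileCheck modifier keeps the literal [[ ]] brackets from being treated as regex syntax. A minimal sketch of the construct being printed, assuming a Clang invocation with OpenMP enabled (the function name h is hypothetical):

    #pragma omp begin assumes no_openmp
    // AST-printed as: void h() [[omp::assume("omp_no_openmp")]]
    void h() {}
    #pragma omp end assumes
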
diff --git a/clang/test/OpenMP/assumes_print.cpp b/clang/test/OpenMP/assumes_print.cpp
index d8bdaaa..9254c29 100644
--- a/clang/test/OpenMP/assumes_print.cpp
+++ b/clang/test/OpenMP/assumes_print.cpp
@@ -37,8 +37,8 @@ void baz() {
}
#pragma omp end assumes
-// CHECK: void foo() __attribute__((assume("omp_no_openmp_routines"))) __attribute__((assume("omp_no_openmp")))
-// CHECK: __attribute__((assume("ompx_range_bar_only"))) __attribute__((assume("ompx_range_bar_only_2"))) __attribute__((assume("omp_no_openmp_routines"))) __attribute__((assume("omp_no_openmp"))) void bar()
-// CHECK: __attribute__((assume("ompx_1234"))) __attribute__((assume("omp_no_openmp_routines"))) __attribute__((assume("omp_no_openmp"))) void baz()
+// CHECK{LITERAL}: void foo() [[omp::assume("omp_no_openmp_routines")]] [[omp::assume("omp_no_openmp")]]
+// CHECK{LITERAL}: [[omp::assume("ompx_range_bar_only")]] [[omp::assume("ompx_range_bar_only_2")]] [[omp::assume("omp_no_openmp_routines")]] [[omp::assume("omp_no_openmp")]] void bar()
+// CHECK{LITERAL}: [[omp::assume("ompx_1234")]] [[omp::assume("omp_no_openmp_routines")]] [[omp::assume("omp_no_openmp")]] void baz()
#endif
diff --git a/clang/test/OpenMP/assumes_template_print.cpp b/clang/test/OpenMP/assumes_template_print.cpp
index 614138b..f8857ff 100644
--- a/clang/test/OpenMP/assumes_template_print.cpp
+++ b/clang/test/OpenMP/assumes_template_print.cpp
@@ -17,7 +17,7 @@ template <typename T>
struct S {
int a;
// CHECK: template <typename T> struct S {
-// CHECK: void foo() __attribute__((assume("ompx_global_assumption"))) {
+// CHECK{LITERAL}: void foo() [[omp::assume("ompx_global_assumption")]] {
void foo() {
#pragma omp parallel
{}
@@ -25,15 +25,15 @@ struct S {
};
// CHECK: template<> struct S<int> {
-// CHECK: void foo() __attribute__((assume("ompx_global_assumption"))) {
+// CHECK{LITERAL}: void foo() [[omp::assume("ompx_global_assumption")]] {
#pragma omp begin assumes no_openmp
-// CHECK: __attribute__((assume("omp_no_openmp"))) void S_with_assumes_no_call() __attribute__((assume("ompx_global_assumption"))) {
+// CHECK{LITERAL}: [[omp::assume("omp_no_openmp")]] void S_with_assumes_no_call() [[omp::assume("ompx_global_assumption")]] {
void S_with_assumes_no_call() {
S<int> s;
s.a = 0;
}
-// CHECK: __attribute__((assume("omp_no_openmp"))) void S_with_assumes_call() __attribute__((assume("ompx_global_assumption"))) {
+// CHECK{LITERAL}: [[omp::assume("omp_no_openmp")]] void S_with_assumes_call() [[omp::assume("ompx_global_assumption")]] {
void S_with_assumes_call() {
S<int> s;
s.a = 0;
@@ -42,7 +42,7 @@ void S_with_assumes_call() {
}
#pragma omp end assumes
-// CHECK: void S_without_assumes() __attribute__((assume("ompx_global_assumption"))) {
+// CHECK{LITERAL}: void S_without_assumes() [[omp::assume("ompx_global_assumption")]] {
void S_without_assumes() {
S<int> s;
s.foo();
@@ -54,7 +54,7 @@ void S_without_assumes() {
template <typename T>
struct P {
// CHECK: template <typename T> struct P {
-// CHECK: __attribute__((assume("ompx_global_assumption"))) void foo() {
+// CHECK{LITERAL}: [[omp::assume("ompx_global_assumption")]] void foo() {
int a;
void foo() {
#pragma omp parallel
@@ -65,21 +65,21 @@ struct P {
// TODO: Avoid the duplication here:
// CHECK: template<> struct P<int> {
-// CHECK: __attribute__((assume("ompx_global_assumption"))) __attribute__((assume("ompx_global_assumption"))) void foo() {
+// CHECK{LITERAL}: [[omp::assume("ompx_global_assumption")]] [[omp::assume("ompx_global_assumption")]] void foo() {
-// CHECK: __attribute__((assume("ompx_global_assumption"))) void P_without_assumes() {
+// CHECK{LITERAL}: [[omp::assume("ompx_global_assumption")]] void P_without_assumes() {
void P_without_assumes() {
P<int> p;
p.foo();
}
#pragma omp begin assumes no_openmp
-// CHECK: __attribute__((assume("omp_no_openmp"))) __attribute__((assume("ompx_global_assumption"))) void P_with_assumes_no_call() {
+// CHECK{LITERAL}: [[omp::assume("omp_no_openmp")]] [[omp::assume("ompx_global_assumption")]] void P_with_assumes_no_call() {
void P_with_assumes_no_call() {
P<int> p;
p.a = 0;
}
-// CHECK: __attribute__((assume("omp_no_openmp"))) __attribute__((assume("ompx_global_assumption"))) void P_with_assumes_call() {
+// CHECK{LITERAL}: [[omp::assume("omp_no_openmp")]] [[omp::assume("ompx_global_assumption")]] void P_with_assumes_call() {
void P_with_assumes_call() {
P<int> p;
p.a = 0;
diff --git a/clang/test/OpenMP/atomic_messages.c b/clang/test/OpenMP/atomic_messages.c
index 9f6662a..f4e7db5 100644
--- a/clang/test/OpenMP/atomic_messages.c
+++ b/clang/test/OpenMP/atomic_messages.c
@@ -405,67 +405,67 @@ void compare(void) {
int x = 0;
int d = 0;
int e = 0;
-// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected compound statement}}
#pragma omp atomic compare
{}
-// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected exactly one expression statement}}
#pragma omp atomic compare
{
x = d;
x = e;
}
-// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected assignment statement}}
#pragma omp atomic compare
{ x += d; }
-// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected assignment statement}}
#pragma omp atomic compare
{ bbar(); }
-// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected conditional operator}}
#pragma omp atomic compare
{ x = d; }
-// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expect binary operator in conditional expression}}
#pragma omp atomic compare
{ x = ffoo() ? e : x; }
-// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expect '<', '>' or '==' as order operator}}
#pragma omp atomic compare
{ x = x >= e ? e : x; }
-// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expect comparison in a form of 'x == e', 'e == x', 'x ordop expr', or 'expr ordop x'}}
#pragma omp atomic compare
{ x = d > e ? e : x; }
-// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expect result value to be at false expression}}
#pragma omp atomic compare
{ x = d > x ? e : d; }
-// omp51-error@+4 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+4 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+3 {{expect binary operator in conditional expression}}
#pragma omp atomic compare
{
if (foo())
x = d;
}
-// omp51-error@+4 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+4 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+3 {{expect '<', '>' or '==' as order operator}}
#pragma omp atomic compare
{
if (x >= d)
x = d;
}
-// omp51-error@+4 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+4 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+3 {{expect comparison in a form of 'x == e', 'e == x', 'x ordop expr', or 'expr ordop x'}}
#pragma omp atomic compare
{
if (e > d)
x = d;
}
-// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected exactly one expression statement}}
#pragma omp atomic compare
{
@@ -473,7 +473,7 @@ void compare(void) {
x = e;
d = e;
}
-// omp51-error@+7 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+7 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+6 {{unexpected 'else' statement}}
#pragma omp atomic compare
{
@@ -491,61 +491,61 @@ void compare_capture(void) {
int v = 0;
int r = 0;
float dr = 0.0;
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected compound statement}}
#pragma omp atomic compare capture
if (x == e) {}
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected exactly one expression statement}}
#pragma omp atomic compare capture
if (x == e) {
x = d;
v = x;
}
-// omp51-error@+4 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+4 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+3 {{expected assignment statement}}
#pragma omp atomic compare capture
if (x == e) {
bbar();
}
-// omp51-error@+4 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+4 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+3 {{expected assignment statement}}
#pragma omp atomic compare capture
if (x == e) {
x += d;
}
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expect binary operator in conditional expression}}
#pragma omp atomic compare capture
if (ffoo()) {
x = d;
}
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expect '==' operator}}
#pragma omp atomic compare capture
if (x > e) {
x = d;
}
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expect comparison in a form of 'x == e', 'e == x', 'x ordop expr', or 'expr ordop x'}}
#pragma omp atomic compare capture
if (d == e) {
x = d;
}
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expect 'else' statement}}
#pragma omp atomic compare capture
if (x == e) {
x = d;
}
-// omp51-error@+5 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+5 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+4 {{expected compound statement}}
#pragma omp atomic compare capture
if (x == e) {
x = d;
} else {
}
-// omp51-error@+5 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+5 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+4 {{expected exactly one expression statement}}
#pragma omp atomic compare capture
if (x == e) {
@@ -554,7 +554,7 @@ void compare_capture(void) {
v = x;
d = e;
}
-// omp51-error@+6 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+6 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+5 {{expected assignment statement}}
#pragma omp atomic compare capture
if (x == e) {
@@ -562,7 +562,7 @@ void compare_capture(void) {
} else {
bbar();
}
-// omp51-error@+6 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+6 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+5 {{expected assignment statement}}
#pragma omp atomic compare capture
if (x == e) {
@@ -570,7 +570,7 @@ void compare_capture(void) {
} else {
v += x;
}
-// omp51-error@+6 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+6 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+5 {{expect an assignment statement 'v = x'}}
#pragma omp atomic compare capture
if (x == e) {
@@ -578,35 +578,35 @@ void compare_capture(void) {
} else {
v = d;
}
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected compound statement}}
#pragma omp atomic compare capture
{}
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expect a compound statement}}
#pragma omp atomic compare capture
x = x > e ? e : x;
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expect a 'if' statement}}
#pragma omp atomic compare capture
{ x = x > e ? e : x; }
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expect a form 'r = x == e; if (r) ...'}}
#pragma omp atomic compare capture
{ r = x == e; if (x == d) { x = e; } }
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected assignment statement}}
#pragma omp atomic compare capture
{ r = x == e; if (r) { bbar(); } }
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected assignment statement}}
#pragma omp atomic compare capture
{ r = x == e; if (r) { x += d; } }
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected compound statement}}
#pragma omp atomic compare capture
{ r = x == e; if (r) {} }
-// omp51-error@+5 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+5 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+4 {{expected exactly one expression statement}}
#pragma omp atomic compare capture
{
@@ -616,19 +616,19 @@ void compare_capture(void) {
v = x;
}
}
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expect '==' operator}}
#pragma omp atomic compare capture
{ r = x > e; if (r) { x = d; } }
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expect comparison in a form of 'x == e', 'e == x', 'x ordop expr', or 'expr ordop x'}}
#pragma omp atomic compare capture
{ r = d == e; if (r) { x = d; } }
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected compound statement}}
#pragma omp atomic compare capture
{ r = x == e; if (r) { x = d; } else {} }
-// omp51-error@+7 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+7 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+6 {{expected exactly one expression statement}}
#pragma omp atomic compare capture
{
@@ -640,40 +640,40 @@ void compare_capture(void) {
d = e;
}
}
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected assignment statement}}
#pragma omp atomic compare capture
{ r = x == e; if (r) { x = d; } else { bbar(); } }
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected assignment statement}}
#pragma omp atomic compare capture
{ r = x == e; if (r) { x = d; } else { v += x; } }
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expect an assignment statement 'v = x'}}
#pragma omp atomic compare capture
{ r = x == e; if (r) { x = d; } else { v = d; } }
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected assignment statement}}
#pragma omp atomic compare capture
{ v += x; if (x == e) { x = d; } }
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected assignment statement}}
#pragma omp atomic compare capture
{ if (x == e) { x = d; } v += x; }
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expect an assignment statement 'v = x'}}
#pragma omp atomic compare capture
{ v = d; if (x == e) { x = d; } }
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expect an assignment statement 'v = x'}}
#pragma omp atomic compare capture
{ if (x == e) { x = d; } v = d; }
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected assignment statement}}
#pragma omp atomic compare capture
{ v = x; bbar(); }
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expect integer value}}
#pragma omp atomic compare capture
{ dr = x == e; if (dr) { x = d; } }
diff --git a/clang/test/OpenMP/distribute_firstprivate_messages.cpp b/clang/test/OpenMP/distribute_firstprivate_messages.cpp
index 30fa8be..f507c86 100644
--- a/clang/test/OpenMP/distribute_firstprivate_messages.cpp
+++ b/clang/test/OpenMP/distribute_firstprivate_messages.cpp
@@ -95,7 +95,7 @@ int main(int argc, char **argv) {
for (i = 0; i < argc; ++i) foo();
#pragma omp target
#pragma omp teams
- #pragma omp distribute firstprivate (a, b, c, d, f) // expected-error {{firstprivate variable with incomplete type 'S1'}} expected-warning {{Type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}} expected-error {{incomplete type 'S1' where a complete type is required}} expected-error {{no matching constructor for initialization of 'S3'}}
+ #pragma omp distribute firstprivate (a, b, c, d, f) // expected-error {{firstprivate variable with incomplete type 'S1'}} expected-warning {{type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}} expected-error {{incomplete type 'S1' where a complete type is required}} expected-error {{no matching constructor for initialization of 'S3'}}
for (i = 0; i < argc; ++i) foo();
#pragma omp target
#pragma omp teams
@@ -103,11 +103,11 @@ int main(int argc, char **argv) {
for (i = 0; i < argc; ++i) foo();
#pragma omp target
#pragma omp teams
- #pragma omp distribute firstprivate(ba) // expected-warning {{Type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+ #pragma omp distribute firstprivate(ba) // expected-warning {{type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i) foo();
#pragma omp target
#pragma omp teams
- #pragma omp distribute firstprivate(ca) // expected-error {{no matching constructor for initialization of 'S3'}} expected-warning {{Type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+ #pragma omp distribute firstprivate(ca) // expected-error {{no matching constructor for initialization of 'S3'}} expected-warning {{type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i) foo();
#pragma omp target
#pragma omp teams
diff --git a/clang/test/OpenMP/distribute_parallel_for_firstprivate_messages.cpp b/clang/test/OpenMP/distribute_parallel_for_firstprivate_messages.cpp
index 84d6337..4bed1fe 100644
--- a/clang/test/OpenMP/distribute_parallel_for_firstprivate_messages.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_firstprivate_messages.cpp
@@ -119,7 +119,7 @@ int foomain(int argc, char **argv) {
++k;
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for firstprivate(a, b) // expected-error {{firstprivate variable with incomplete type 'S1'}} expected-warning {{Type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for firstprivate(a, b) // expected-error {{firstprivate variable with incomplete type 'S1'}} expected-warning {{type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int k = 0; k < argc; ++k)
++k;
#pragma omp target
@@ -129,7 +129,7 @@ int foomain(int argc, char **argv) {
++k;
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}} expected-warning {{Type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}} expected-warning {{type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int k = 0; k < argc; ++k)
++k;
#pragma omp target
@@ -241,7 +241,7 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for firstprivate(a, b, c, d, f) // expected-error {{firstprivate variable with incomplete type 'S1'}} expected-error {{incomplete type 'S1' where a complete type is required}} expected-warning {{Type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for firstprivate(a, b, c, d, f) // expected-error {{firstprivate variable with incomplete type 'S1'}} expected-error {{incomplete type 'S1' where a complete type is required}} expected-warning {{type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
@@ -256,12 +256,12 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for firstprivate(ba) // expected-warning {{Type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for firstprivate(ba) // expected-warning {{type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for firstprivate(ca) // expected-warning {{Type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for firstprivate(ca) // expected-warning {{type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
@@ -292,12 +292,12 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}} expected-warning {{Type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}} expected-warning {{type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for firstprivate(m) // expected-warning {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for firstprivate(m) // expected-warning {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
@@ -329,13 +329,13 @@ int main(int argc, char **argv) {
// expected-error@+3 {{lastprivate variable cannot be firstprivate}} expected-note@+3 {{defined as lastprivate}}
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for lastprivate(g) firstprivate(g) // expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for lastprivate(g) firstprivate(g) // expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
// expected-error@+3 {{lastprivate variable cannot be firstprivate}} expected-note@+3 {{defined as lastprivate}}
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for lastprivate(n) firstprivate(n) // expected-error {{calling a private constructor of class 'S6'}} expected-warning {{Type 'S6' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for lastprivate(n) firstprivate(n) // expected-error {{calling a private constructor of class 'S6'}} expected-warning {{type 'S6' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp parallel
diff --git a/clang/test/OpenMP/distribute_parallel_for_lastprivate_messages.cpp b/clang/test/OpenMP/distribute_parallel_for_lastprivate_messages.cpp
index f403922..0a0962e 100644
--- a/clang/test/OpenMP/distribute_parallel_for_lastprivate_messages.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_lastprivate_messages.cpp
@@ -119,7 +119,7 @@ int foomain(int argc, char **argv) {
++k;
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for lastprivate(a, b) // expected-error {{lastprivate variable with incomplete type 'S1'}} expected-warning {{Type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for lastprivate(a, b) // expected-error {{lastprivate variable with incomplete type 'S1'}} expected-warning {{type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int k = 0; k < argc; ++k)
++k;
#pragma omp target
@@ -129,7 +129,7 @@ int foomain(int argc, char **argv) {
++k;
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for lastprivate(e, g) // expected-error 2 {{calling a private constructor of class 'S4'}} expected-warning 2 {{Type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for lastprivate(e, g) // expected-error 2 {{calling a private constructor of class 'S4'}} expected-warning 2 {{type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int k = 0; k < argc; ++k)
++k;
#pragma omp target
@@ -228,7 +228,7 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for lastprivate(a, b, c, d, f) // expected-error {{lastprivate variable with incomplete type 'S1'}} expected-error 1 {{const-qualified variable without mutable fields cannot be lastprivate}} expected-error 2 {{const-qualified variable cannot be lastprivate}} expected-error {{incomplete type 'S1' where a complete type is required}} expected-warning {{Type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for lastprivate(a, b, c, d, f) // expected-error {{lastprivate variable with incomplete type 'S1'}} expected-error 1 {{const-qualified variable without mutable fields cannot be lastprivate}} expected-error 2 {{const-qualified variable cannot be lastprivate}} expected-error {{incomplete type 'S1' where a complete type is required}} expected-warning {{type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
@@ -243,12 +243,12 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for lastprivate(ba) // expected-warning {{Type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for lastprivate(ba) // expected-warning {{type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for lastprivate(ca) // expected-error {{const-qualified variable without mutable fields cannot be lastprivate}} expected-warning {{Type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for lastprivate(ca) // expected-error {{const-qualified variable without mutable fields cannot be lastprivate}} expected-warning {{type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
@@ -279,12 +279,12 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for lastprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}} expected-warning {{Type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for lastprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}} expected-warning {{type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}} expected-warning {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}} expected-warning {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
@@ -325,13 +325,13 @@ int main(int argc, char **argv) {
// expected-error@+3 {{firstprivate variable cannot be lastprivate}} expected-note@+3 {{defined as firstprivate}}
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for firstprivate(m) lastprivate(m) // expected-warning {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for firstprivate(m) lastprivate(m) // expected-warning {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
// expected-error@+3 {{lastprivate variable cannot be firstprivate}} expected-note@+3 {{defined as lastprivate}}
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for lastprivate(n) firstprivate(n) // expected-error {{calling a private constructor of class 'S6'}} expected-warning {{Type 'S6' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for lastprivate(n) firstprivate(n) // expected-error {{calling a private constructor of class 'S6'}} expected-warning {{type 'S6' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
static int si;
diff --git a/clang/test/OpenMP/distribute_parallel_for_private_messages.cpp b/clang/test/OpenMP/distribute_parallel_for_private_messages.cpp
index d25598e..2e0e750 100644
--- a/clang/test/OpenMP/distribute_parallel_for_private_messages.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_private_messages.cpp
@@ -50,7 +50,7 @@ public:
#pragma omp target
#pragma omp teams
#pragma omp distribute parallel for private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
- for (int k = 0; k < s.a; ++k) // expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (int k = 0; k < s.a; ++k) // expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
++s.a;
return *this;
}
diff --git a/clang/test/OpenMP/distribute_parallel_for_reduction_messages.cpp b/clang/test/OpenMP/distribute_parallel_for_reduction_messages.cpp
index 6b3d9da..864fb59 100644
--- a/clang/test/OpenMP/distribute_parallel_for_reduction_messages.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_reduction_messages.cpp
@@ -187,7 +187,7 @@ T tmain(T argc) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified variable cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}} expected-warning 2 {{Type 'S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning 2 {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified variable cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}} expected-warning 2 {{type 'S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning 2 {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
@@ -232,7 +232,7 @@ T tmain(T argc) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}} expected-warning 2 {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}} expected-warning 2 {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
@@ -371,12 +371,12 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{const-qualified variable cannot be reduction}} expected-error {{'operator+' is a private member of 'S2'}} expected-error {{incomplete type 'S1' where a complete type is required}} expected-warning {{Type 'S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{const-qualified variable cannot be reduction}} expected-error {{'operator+' is a private member of 'S2'}} expected-error {{incomplete type 'S1' where a complete type is required}} expected-warning {{type 'S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 2 {{const-qualified variable cannot be reduction}} expected-error {{incomplete type 'S1' where a complete type is required}} expected-warning {{Type 'S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 2 {{const-qualified variable cannot be reduction}} expected-error {{incomplete type 'S1' where a complete type is required}} expected-warning {{type 'S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
@@ -386,12 +386,12 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for reduction(+ : ba) // expected-error {{const-qualified variable cannot be reduction}} expected-warning {{Type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for reduction(+ : ba) // expected-error {{const-qualified variable cannot be reduction}} expected-warning {{type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for reduction(* : ca) // expected-error {{const-qualified variable cannot be reduction}} expected-warning {{Type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for reduction(* : ca) // expected-error {{const-qualified variable cannot be reduction}} expected-warning {{type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
@@ -416,12 +416,12 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for reduction(& : e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}} expected-error {{invalid operands to binary expression ('S5' and 'S5')}} expected-warning {{Type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for reduction(& : e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}} expected-error {{invalid operands to binary expression ('S5' and 'S5')}} expected-warning {{type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for reduction(+ : h, k, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be reduction}} expected-warning {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for reduction(+ : h, k, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be reduction}} expected-warning {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
@@ -437,12 +437,12 @@ int main(int argc, char **argv) {
#pragma omp parallel private(k)
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}} expected-warning {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}} expected-warning {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for reduction(+ : p), reduction(+ : p) // expected-error {{variable can appear only once in OpenMP 'reduction' clause}} expected-note {{previously referenced here}} expected-warning {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for reduction(+ : p), reduction(+ : p) // expected-error {{variable can appear only once in OpenMP 'reduction' clause}} expected-note {{previously referenced here}} expected-warning {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_private_messages.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_private_messages.cpp
index 43bc6ad..0cb8c01 100644
--- a/clang/test/OpenMP/distribute_parallel_for_simd_private_messages.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_simd_private_messages.cpp
@@ -50,7 +50,7 @@ public:
#pragma omp target
#pragma omp teams
#pragma omp distribute parallel for simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
- for (int k = 0; k < s.a; ++k) // expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (int k = 0; k < s.a; ++k) // expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
++s.a;
return *this;
}
diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_shared_messages.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_shared_messages.cpp
index 7c83e4c..6dc6e77 100644
--- a/clang/test/OpenMP/distribute_parallel_for_simd_shared_messages.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_simd_shared_messages.cpp
@@ -117,7 +117,7 @@ T tmain(T argc, S **argv) {
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for simd shared (a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-warning {{Type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for simd shared (a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-warning {{type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for(int k = 0 ; k < n ; k++) {
acc++;
}
@@ -131,14 +131,14 @@ T tmain(T argc, S **argv) {
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for simd shared(ba) // expected-warning {{Type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for simd shared(ba) // expected-warning {{type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for(int k = 0 ; k < n ; k++) {
acc++;
}
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for simd shared(ca) // expected-warning {{Type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for simd shared(ca) // expected-warning {{type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for(int k = 0 ; k < n ; k++) {
acc++;
}
@@ -152,7 +152,7 @@ T tmain(T argc, S **argv) {
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for simd shared(e, g) // expected-warning {{Type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for simd shared(e, g) // expected-warning {{type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
for(int k = 0 ; k < n ; k++) {
acc++;
}
@@ -291,7 +291,7 @@ int main(int argc, char **argv) {
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for simd shared (a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-warning {{Type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for simd shared (a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-warning {{type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for(int k = 0 ; k < n ; k++) {
acc++;
}
@@ -305,14 +305,14 @@ int main(int argc, char **argv) {
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for simd shared(ba) // expected-warning {{Type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for simd shared(ba) // expected-warning {{type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for(int k = 0 ; k < n ; k++) {
acc++;
}
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for simd shared(ca) // expected-warning {{Type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for simd shared(ca) // expected-warning {{type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for(int k = 0 ; k < n ; k++) {
acc++;
}
@@ -326,7 +326,7 @@ int main(int argc, char **argv) {
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for simd shared(e, g) // expected-warning {{Type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for simd shared(e, g) // expected-warning {{type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
for(int k = 0 ; k < n ; k++) {
acc++;
}
diff --git a/clang/test/OpenMP/distribute_simd_firstprivate_messages.cpp b/clang/test/OpenMP/distribute_simd_firstprivate_messages.cpp
index 43057fe..bc1dfcf 100644
--- a/clang/test/OpenMP/distribute_simd_firstprivate_messages.cpp
+++ b/clang/test/OpenMP/distribute_simd_firstprivate_messages.cpp
@@ -111,7 +111,7 @@ int foomain(int argc, char **argv) {
++k;
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd firstprivate(z, a, b) // expected-error {{firstprivate variable with incomplete type 'S1'}} expected-warning {{Type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd firstprivate(z, a, b) // expected-error {{firstprivate variable with incomplete type 'S1'}} expected-warning {{type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int k = 0; k < argc; ++k)
++k;
#pragma omp target
@@ -121,7 +121,7 @@ int foomain(int argc, char **argv) {
++k;
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}} expected-warning {{Type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}} expected-warning {{type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int k = 0; k < argc; ++k)
++k;
#pragma omp target
@@ -233,7 +233,7 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd firstprivate(a, b, c, d, f) // expected-error {{firstprivate variable with incomplete type 'S1'}} expected-error {{incomplete type 'S1' where a complete type is required}} expected-warning {{Type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd firstprivate(a, b, c, d, f) // expected-error {{firstprivate variable with incomplete type 'S1'}} expected-error {{incomplete type 'S1' where a complete type is required}} expected-warning {{type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
@@ -248,12 +248,12 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd firstprivate(ba) // expected-warning {{Type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd firstprivate(ba) // expected-warning {{type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd firstprivate(ca) // expected-warning {{Type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd firstprivate(ca) // expected-warning {{type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
@@ -284,12 +284,12 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}} expected-warning {{Type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}} expected-warning {{type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd firstprivate(m) // expected-warning {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd firstprivate(m) // expected-warning {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
@@ -321,13 +321,13 @@ int main(int argc, char **argv) {
// expected-error@+3 {{lastprivate variable cannot be firstprivate}} expected-note@+3 {{defined as lastprivate}}
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd lastprivate(g) firstprivate(g) //expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd lastprivate(g) firstprivate(g) //expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
// expected-error@+3 {{lastprivate variable cannot be firstprivate}} expected-note@+3 {{defined as lastprivate}}
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd lastprivate(n) firstprivate(n) // expected-error {{calling a private constructor of class 'S6'}} expected-warning {{Type 'S6' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd lastprivate(n) firstprivate(n) // expected-error {{calling a private constructor of class 'S6'}} expected-warning {{type 'S6' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp parallel
diff --git a/clang/test/OpenMP/distribute_simd_lastprivate_messages.cpp b/clang/test/OpenMP/distribute_simd_lastprivate_messages.cpp
index 7658288..379f575 100644
--- a/clang/test/OpenMP/distribute_simd_lastprivate_messages.cpp
+++ b/clang/test/OpenMP/distribute_simd_lastprivate_messages.cpp
@@ -120,7 +120,7 @@ int foomain(int argc, char **argv) {
++k;
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd lastprivate(a, b) // expected-error {{lastprivate variable with incomplete type 'S1'}} expected-warning {{Type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd lastprivate(a, b) // expected-error {{lastprivate variable with incomplete type 'S1'}} expected-warning {{type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int k = 0; k < argc; ++k)
++k;
#pragma omp target
@@ -130,7 +130,7 @@ int foomain(int argc, char **argv) {
++k;
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd lastprivate(e, g) // expected-error 2 {{calling a private constructor of class 'S4'}} expected-warning 2 {{Type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd lastprivate(e, g) // expected-error 2 {{calling a private constructor of class 'S4'}} expected-warning 2 {{type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int k = 0; k < argc; ++k)
++k;
#pragma omp target
@@ -229,7 +229,7 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd lastprivate(a, b, c, d, f) // expected-error {{lastprivate variable with incomplete type 'S1'}} expected-error 1 {{const-qualified variable without mutable fields cannot be lastprivate}} expected-error 2 {{const-qualified variable cannot be lastprivate}} expected-error {{incomplete type 'S1' where a complete type is required}} expected-warning {{Type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd lastprivate(a, b, c, d, f) // expected-error {{lastprivate variable with incomplete type 'S1'}} expected-error 1 {{const-qualified variable without mutable fields cannot be lastprivate}} expected-error 2 {{const-qualified variable cannot be lastprivate}} expected-error {{incomplete type 'S1' where a complete type is required}} expected-warning {{type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
@@ -244,12 +244,12 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd lastprivate(ba) // expected-warning {{Type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd lastprivate(ba) // expected-warning {{type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd lastprivate(ca) // expected-error {{const-qualified variable without mutable fields cannot be lastprivate}} expected-warning {{Type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd lastprivate(ca) // expected-error {{const-qualified variable without mutable fields cannot be lastprivate}} expected-warning {{type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
@@ -280,12 +280,12 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd lastprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}} expected-warning {{Type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd lastprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}} expected-warning {{type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}} expected-warning {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}} expected-warning {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
@@ -326,13 +326,13 @@ int main(int argc, char **argv) {
// expected-error@+3 {{firstprivate variable cannot be lastprivate}} expected-note@+3 {{defined as firstprivate}}
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd firstprivate(m) lastprivate(m) // expected-warning {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd firstprivate(m) lastprivate(m) // expected-warning {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
// expected-error@+3 {{lastprivate variable cannot be firstprivate}} expected-note@+3 {{defined as lastprivate}}
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd lastprivate(n) firstprivate(n) // expected-error {{calling a private constructor of class 'S6'}} expected-warning {{Type 'S6' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd lastprivate(n) firstprivate(n) // expected-error {{calling a private constructor of class 'S6'}} expected-warning {{type 'S6' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
static int si;
diff --git a/clang/test/OpenMP/distribute_simd_loop_messages.cpp b/clang/test/OpenMP/distribute_simd_loop_messages.cpp
index 5a55f95..e56c7df 100644
--- a/clang/test/OpenMP/distribute_simd_loop_messages.cpp
+++ b/clang/test/OpenMP/distribute_simd_loop_messages.cpp
@@ -14,7 +14,7 @@ public:
#pragma omp target
#pragma omp teams
#pragma omp distribute simd
- for (int k = 0; k < s.a; ++k) // expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (int k = 0; k < s.a; ++k) // expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
++s.a;
return *this;
}
@@ -490,7 +490,7 @@ int test_with_random_access_iterator() {
#pragma omp target
#pragma omp teams
#pragma omp distribute simd
- for (GoodIter I = begin; I < end; ++I) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I = begin; I < end; ++I) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams
@@ -501,41 +501,41 @@ int test_with_random_access_iterator() {
#pragma omp target
#pragma omp teams
#pragma omp distribute simd
- for (GoodIter I = begin; I >= end; --I) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I = begin; I >= end; --I) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams
// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
#pragma omp distribute simd
- for (GoodIter I(begin); I < end; ++I) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(begin); I < end; ++I) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams
// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
#pragma omp distribute simd
- for (GoodIter I(nullptr); I < end; ++I) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(nullptr); I < end; ++I) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams
// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
#pragma omp distribute simd
- for (GoodIter I(0); I < end; ++I) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(0); I < end; ++I) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams
// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
#pragma omp distribute simd
- for (GoodIter I(1,2); I < end; ++I) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(1,2); I < end; ++I) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams
#pragma omp distribute simd
- for (begin = GoodIter(0); begin < end; ++begin) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (begin = GoodIter(0); begin < end; ++begin) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++begin;
#pragma omp target
#pragma omp teams
#pragma omp distribute simd
- for (begin = GoodIter(1,2); begin < end; ++begin) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (begin = GoodIter(1,2); begin < end; ++begin) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++begin;
#pragma omp target
#pragma omp teams
@@ -546,7 +546,7 @@ int test_with_random_access_iterator() {
#pragma omp target
#pragma omp teams
#pragma omp distribute simd
- for (begin = end; begin < end; ++begin) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (begin = end; begin < end; ++begin) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++begin;
#pragma omp target
#pragma omp teams
@@ -576,7 +576,7 @@ int test_with_random_access_iterator() {
#pragma omp target
#pragma omp teams
#pragma omp distribute simd
- for (GoodIter I = begin; I >= end; I = I - 1) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I = begin; I >= end; I = I - 1) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams
@@ -600,7 +600,7 @@ int test_with_random_access_iterator() {
#pragma omp target
#pragma omp teams
#pragma omp distribute simd
- for (Iter0 I = begin0; I < end0; ++I) // expected-warning 2 {{Type 'Iter0' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (Iter0 I = begin0; I < end0; ++I) // expected-warning 2 {{type 'Iter0' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
@@ -608,7 +608,7 @@ int test_with_random_access_iterator() {
// Initializer is constructor without params.
// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
#pragma omp distribute simd
- for (Iter0 I; I < end0; ++I) // expected-warning {{Type 'Iter0' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (Iter0 I; I < end0; ++I) // expected-warning {{type 'Iter0' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
Iter1 begin1, end1;
@@ -654,7 +654,7 @@ template <typename IT, int ST> class TC {
// expected-note@+3 {{loop step is expected to be positive due to this condition}}
// expected-error@+2 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
#pragma omp distribute simd
- for (IT I = begin; I <= end; I += ST) { // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I <= end; I += ST) { // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
#pragma omp distribute simd
@@ -697,7 +697,7 @@ template <typename IT, int ST=0> int dotest_gt(IT begin, IT end) {
#pragma omp target
#pragma omp teams
#pragma omp distribute simd
- for (IT I = begin; I < end; I+=TC<int,ST>::step()) { // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I < end; I+=TC<int,ST>::step()) { // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
}
diff --git a/clang/test/OpenMP/distribute_simd_private_messages.cpp b/clang/test/OpenMP/distribute_simd_private_messages.cpp
index 261a46a..8be7193 100644
--- a/clang/test/OpenMP/distribute_simd_private_messages.cpp
+++ b/clang/test/OpenMP/distribute_simd_private_messages.cpp
@@ -50,7 +50,7 @@ public:
#pragma omp target
#pragma omp teams
#pragma omp distribute simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
- for (int k = 0; k < s.a; ++k) // expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (int k = 0; k < s.a; ++k) // expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
++s.a;
return *this;
}
diff --git a/clang/test/OpenMP/distribute_simd_reduction_messages.cpp b/clang/test/OpenMP/distribute_simd_reduction_messages.cpp
index d27360a..03b6ee5 100644
--- a/clang/test/OpenMP/distribute_simd_reduction_messages.cpp
+++ b/clang/test/OpenMP/distribute_simd_reduction_messages.cpp
@@ -187,7 +187,7 @@ T tmain(T argc) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified variable cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}} expected-warning 2 {{Type 'S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning 2 {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified variable cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}} expected-warning 2 {{type 'S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning 2 {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
@@ -232,7 +232,7 @@ T tmain(T argc) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}} expected-warning 2 {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}} expected-warning 2 {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
@@ -376,12 +376,12 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{const-qualified variable cannot be reduction}} expected-error {{'operator+' is a private member of 'S2'}} expected-warning {{Type 'S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}} expected-error {{incomplete type 'S1' where a complete type is required}}
+#pragma omp distribute simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{const-qualified variable cannot be reduction}} expected-error {{'operator+' is a private member of 'S2'}} expected-warning {{type 'S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}} expected-error {{incomplete type 'S1' where a complete type is required}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 2 {{const-qualified variable cannot be reduction}} expected-warning {{Type 'S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}} expected-error {{incomplete type 'S1' where a complete type is required}}
+#pragma omp distribute simd reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 2 {{const-qualified variable cannot be reduction}} expected-warning {{type 'S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}} expected-error {{incomplete type 'S1' where a complete type is required}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
@@ -391,12 +391,12 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd reduction(+ : ba) // expected-error {{const-qualified variable cannot be reduction}} expected-warning {{Type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd reduction(+ : ba) // expected-error {{const-qualified variable cannot be reduction}} expected-warning {{type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd reduction(* : ca) // expected-error {{const-qualified variable cannot be reduction}} expected-warning {{Type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd reduction(* : ca) // expected-error {{const-qualified variable cannot be reduction}} expected-warning {{type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
@@ -421,12 +421,12 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd reduction(& : e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}} expected-error {{invalid operands to binary expression ('S5' and 'S5')}} expected-warning {{Type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}}
+#pragma omp distribute simd reduction(& : e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}} expected-error {{invalid operands to binary expression ('S5' and 'S5')}} expected-warning {{type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd reduction(+ : h, k, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be reduction}} expected-warning {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd reduction(+ : h, k, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be reduction}} expected-warning {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
@@ -440,7 +440,7 @@ int main(int argc, char **argv) {
for (int i = 0; i < 10; ++i)
foo();
#if __cplusplus < 201103L // < C++11
-// expected-warning@+5 {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+// expected-warning@+5 {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
#endif
#pragma omp parallel private(k)
#pragma omp target
@@ -449,7 +449,7 @@ int main(int argc, char **argv) {
for (int i = 0; i < 10; ++i)
foo();
#if __cplusplus < 201103L // < C++11
-// expected-warning@+4 {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+// expected-warning@+4 {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
#endif
#pragma omp target
#pragma omp teams
diff --git a/clang/test/OpenMP/reduction_implicit_map.cpp b/clang/test/OpenMP/reduction_implicit_map.cpp
index 0f67cdc..765e90b 100644
--- a/clang/test/OpenMP/reduction_implicit_map.cpp
+++ b/clang/test/OpenMP/reduction_implicit_map.cpp
@@ -47,7 +47,7 @@ int bar() {
S2 o[5];
//warning "copyable and not guaranteed to be mapped correctly" and
//implicit map generated.
-#pragma omp target parallel reduction(+:o[0]) //expected-warning {{Type 'S2' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp target parallel reduction(+:o[0]) //expected-warning {{type 'S2' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 10; i++);
double b[10][10][10];
//no error no implicit map generated, the map for b is generated but not
diff --git a/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c b/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c
index 2f829d2..1afedc6 100644
--- a/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c
+++ b/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c
@@ -4,7 +4,7 @@
// host-no-diagnostics
-void baz(void) __attribute__((assume("omp_no_openmp")));
+[[omp::assume("omp_no_openmp")]] void baz(void);
void bar1(void) {
#pragma omp parallel // #0
@@ -24,7 +24,7 @@ void foo1(void) {
// all-remark@#2 {{Rewriting generic-mode kernel with a customized state machine. [OMP131]}}
{
- baz(); // all-remark {{Value has potential side effects preventing SPMD-mode execution. Add `__attribute__((assume("ompx_spmd_amenable")))` to the called function to override. [OMP121]}}
+ baz(); // all-remark {{Value has potential side effects preventing SPMD-mode execution. Add `[[omp::assume("ompx_spmd_amenable")]]` to the called function to override. [OMP121]}}
#pragma omp parallel // #3
{
}
@@ -39,7 +39,7 @@ void foo2(void) {
#pragma omp target teams // #5
// all-remark@#5 {{Rewriting generic-mode kernel with a customized state machine. [OMP131]}}
{
- baz(); // all-remark {{Value has potential side effects preventing SPMD-mode execution. Add `__attribute__((assume("ompx_spmd_amenable")))` to the called function to override. [OMP121]}}
+ baz(); // all-remark {{Value has potential side effects preventing SPMD-mode execution. Add `[[omp::assume("ompx_spmd_amenable")]]` to the called function to override. [OMP121]}}
#pragma omp parallel // #6
{
}
@@ -57,7 +57,7 @@ void foo3(void) {
#pragma omp target teams // #8
// all-remark@#8 {{Rewriting generic-mode kernel with a customized state machine. [OMP131]}}
{
- baz(); // all-remark {{Value has potential side effects preventing SPMD-mode execution. Add `__attribute__((assume("ompx_spmd_amenable")))` to the called function to override. [OMP121]}}
+ baz(); // all-remark {{Value has potential side effects preventing SPMD-mode execution. Add `[[omp::assume("ompx_spmd_amenable")]]` to the called function to override. [OMP121]}}
#pragma omp parallel // #9
{
}
diff --git a/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c b/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
index c48a4b9..5ce8f1f 100644
--- a/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
+++ b/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
@@ -3,7 +3,7 @@
// host-no-diagnostics
-void baz(void) __attribute__((assume("omp_no_openmp")));
+[[omp::assume("omp_no_openmp")]] void baz(void);
void bar(void) {
#pragma omp parallel // #1 \
@@ -16,7 +16,7 @@ void foo(void) {
#pragma omp target teams // #2
// expected-remark@#2 {{Rewriting generic-mode kernel with a customized state machine. [OMP131]}}
{
- baz(); // expected-remark {{Value has potential side effects preventing SPMD-mode execution. Add `__attribute__((assume("ompx_spmd_amenable")))` to the called function to override. [OMP121]}}
+ baz(); // expected-remark {{Value has potential side effects preventing SPMD-mode execution. Add `[[omp::assume("ompx_spmd_amenable")]]` to the called function to override. [OMP121]}}
#pragma omp parallel
{
}
diff --git a/clang/test/OpenMP/requires_default_atomic_mem_order_messages.cpp b/clang/test/OpenMP/requires_default_atomic_mem_order_messages.cpp
index 19f6ede..5160fbb 100644
--- a/clang/test/OpenMP/requires_default_atomic_mem_order_messages.cpp
+++ b/clang/test/OpenMP/requires_default_atomic_mem_order_messages.cpp
@@ -7,6 +7,6 @@ void foo2() {
}
#pragma omp requires atomic_default_mem_order(seq_cst) // expected-error {{'atomic' region encountered before requires directive with 'atomic_default_mem_order' clause}} expected-note 2 {{atomic_default_mem_order clause previously used here}}
-#pragma omp requires atomic_default_mem_order(acq_rel) // expected-error {{'atomic' region encountered before requires directive with 'atomic_default_mem_order' clause}} expected-error {{Only one atomic_default_mem_order clause can appear on a requires directive in a single translation unit}}
-#pragma omp requires atomic_default_mem_order(relaxed) // expected-error {{'atomic' region encountered before requires directive with 'atomic_default_mem_order' clause}} expected-error {{Only one atomic_default_mem_order clause can appear on a requires directive in a single translation unit}}
+#pragma omp requires atomic_default_mem_order(acq_rel) // expected-error {{'atomic' region encountered before requires directive with 'atomic_default_mem_order' clause}} expected-error {{only one atomic_default_mem_order clause can appear on a requires directive in a single translation unit}}
+#pragma omp requires atomic_default_mem_order(relaxed) // expected-error {{'atomic' region encountered before requires directive with 'atomic_default_mem_order' clause}} expected-error {{only one atomic_default_mem_order clause can appear on a requires directive in a single translation unit}}
#pragma omp requires atomic_default_mem_order(release) // expected-error {{expected 'seq_cst', 'acq_rel' or 'relaxed' in OpenMP clause 'atomic_default_mem_order'}} expected-error {{expected at least one clause on '#pragma omp requires' directive}}
diff --git a/clang/test/OpenMP/requires_messages.cpp b/clang/test/OpenMP/requires_messages.cpp
index 10d3116..dbb2b31 100644
--- a/clang/test/OpenMP/requires_messages.cpp
+++ b/clang/test/OpenMP/requires_messages.cpp
@@ -6,39 +6,39 @@ int a;
#pragma omp requires unified_shared_memory // rev-note {{unified_shared_memory clause previously used here}} expected-note{{unified_shared_memory clause previously used here}}
-#pragma omp requires unified_shared_memory, unified_shared_memory // expected-error {{Only one unified_shared_memory clause can appear on a requires directive in a single translation unit}} expected-error {{directive '#pragma omp requires' cannot contain more than one 'unified_shared_memory' clause}}
+#pragma omp requires unified_shared_memory, unified_shared_memory // expected-error {{only one unified_shared_memory clause can appear on a requires directive in a single translation unit}} expected-error {{directive '#pragma omp requires' cannot contain more than one 'unified_shared_memory' clause}}
-#pragma omp requires unified_address // expected-error {{Only one unified_address clause can appear on a requires directive in a single translation unit}}
+#pragma omp requires unified_address // expected-error {{only one unified_address clause can appear on a requires directive in a single translation unit}}
-#pragma omp requires unified_address, unified_address // expected-error {{Only one unified_address clause can appear on a requires directive in a single translation unit}} expected-error {{directive '#pragma omp requires' cannot contain more than one 'unified_address' clause}}
+#pragma omp requires unified_address, unified_address // expected-error {{only one unified_address clause can appear on a requires directive in a single translation unit}} expected-error {{directive '#pragma omp requires' cannot contain more than one 'unified_address' clause}}
#ifdef OMP99
#pragma omp requires reverse_offload // rev-note {{reverse_offload clause previously used here}} rev-note {{reverse_offload clause previously used here}}
-#pragma omp requires reverse_offload, reverse_offload // rev-error {{Only one reverse_offload clause can appear on a requires directive in a single translation unit}} rev-error {{directive '#pragma omp requires' cannot contain more than one 'reverse_offload' clause}}
+#pragma omp requires reverse_offload, reverse_offload // rev-error {{only one reverse_offload clause can appear on a requires directive in a single translation unit}} rev-error {{directive '#pragma omp requires' cannot contain more than one 'reverse_offload' clause}}
#endif
#pragma omp requires dynamic_allocators // rev-note {{dynamic_allocators clause previously used here}} expected-note {{dynamic_allocators clause previously used here}}
-#pragma omp requires dynamic_allocators, dynamic_allocators // expected-error {{Only one dynamic_allocators clause can appear on a requires directive in a single translation unit}} expected-error {{directive '#pragma omp requires' cannot contain more than one 'dynamic_allocators' clause}}
+#pragma omp requires dynamic_allocators, dynamic_allocators // expected-error {{only one dynamic_allocators clause can appear on a requires directive in a single translation unit}} expected-error {{directive '#pragma omp requires' cannot contain more than one 'dynamic_allocators' clause}}
#pragma omp requires atomic_default_mem_order(seq_cst) // rev-note {{atomic_default_mem_order clause previously used here}} expected-note {{atomic_default_mem_order clause previously used here}} expected-note {{atomic_default_mem_order clause previously used here}} expected-note {{atomic_default_mem_order clause previously used here}} expected-note {{atomic_default_mem_order clause previously used here}}
-#pragma omp requires atomic_default_mem_order(acq_rel) // expected-error {{Only one atomic_default_mem_order clause can appear on a requires directive in a single translation unit}}
+#pragma omp requires atomic_default_mem_order(acq_rel) // expected-error {{only one atomic_default_mem_order clause can appear on a requires directive in a single translation unit}}
-#pragma omp requires atomic_default_mem_order(relaxed) // expected-error {{Only one atomic_default_mem_order clause can appear on a requires directive in a single translation unit}}
+#pragma omp requires atomic_default_mem_order(relaxed) // expected-error {{only one atomic_default_mem_order clause can appear on a requires directive in a single translation unit}}
#pragma omp requires atomic_default_mem_order // expected-error {{expected '(' after 'atomic_default_mem_order'}} expected-error {{expected at least one clause on '#pragma omp requires' directive}}
#pragma omp requires atomic_default_mem_order( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected 'seq_cst', 'acq_rel' or 'relaxed' in OpenMP clause 'atomic_default_mem_order'}} expected-error {{expected at least one clause on '#pragma omp requires' directive}}
-#pragma omp requires atomic_default_mem_order(seq_cst // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{Only one atomic_default_mem_order clause can appear on a requires directive in a single translation unit}}
+#pragma omp requires atomic_default_mem_order(seq_cst // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{only one atomic_default_mem_order clause can appear on a requires directive in a single translation unit}}
#pragma omp requires atomic_default_mem_order(invalid_modifier) // expected-error {{expected 'seq_cst', 'acq_rel' or 'relaxed' in OpenMP clause 'atomic_default_mem_order'}} expected-error {{expected at least one clause on '#pragma omp requires' directive}}
#pragma omp requires atomic_default_mem_order(shared) // expected-error {{expected 'seq_cst', 'acq_rel' or 'relaxed' in OpenMP clause 'atomic_default_mem_order'}} expected-error {{expected at least one clause on '#pragma omp requires' directive}}
-#pragma omp requires atomic_default_mem_order(acq_rel), atomic_default_mem_order(relaxed) // expected-error {{directive '#pragma omp requires' cannot contain more than one 'atomic_default_mem_order' claus}} expected-error {{Only one atomic_default_mem_order clause can appear on a requires directive in a single translation unit}}
+#pragma omp requires atomic_default_mem_order(acq_rel), atomic_default_mem_order(relaxed) // expected-error {{directive '#pragma omp requires' cannot contain more than one 'atomic_default_mem_order' claus}} expected-error {{only one atomic_default_mem_order clause can appear on a requires directive in a single translation unit}}
#pragma omp requires // expected-error {{expected at least one clause on '#pragma omp requires' directive}}
@@ -46,18 +46,18 @@ int a;
#pragma omp requires nowait // expected-error {{unexpected OpenMP clause 'nowait' in directive '#pragma omp requires'}} expected-error {{expected at least one clause on '#pragma omp requires' directive}}
-#pragma omp requires unified_address, invalid_clause // expected-warning {{extra tokens at the end of '#pragma omp requires' are ignored}} expected-error {{Only one unified_address clause can appear on a requires directive in a single translation unit}}
+#pragma omp requires unified_address, invalid_clause // expected-warning {{extra tokens at the end of '#pragma omp requires' are ignored}} expected-error {{only one unified_address clause can appear on a requires directive in a single translation unit}}
#pragma omp requires invalid_clause unified_address // expected-warning {{extra tokens at the end of '#pragma omp requires' are ignored}} expected-error {{expected at least one clause on '#pragma omp requires' directive}}
#ifdef OMP99
-#pragma omp requires unified_shared_memory, unified_address, reverse_offload, dynamic_allocators, atomic_default_mem_order(seq_cst) // rev-error {{Only one unified_shared_memory clause can appear on a requires directive in a single translation unit}} rev-error{{Only one unified_address clause can appear on a requires directive in a single translation unit}} rev-error{{Only one reverse_offload clause can appear on a requires directive in a single translation unit}} rev-error{{Only one dynamic_allocators clause can appear on a requires directive in a single translation unit}} rev-error {{Only one atomic_default_mem_order clause can appear on a requires directive in a single translation unit}}
+#pragma omp requires unified_shared_memory, unified_address, reverse_offload, dynamic_allocators, atomic_default_mem_order(seq_cst) // rev-error {{only one unified_shared_memory clause can appear on a requires directive in a single translation unit}} rev-error{{only one unified_address clause can appear on a requires directive in a single translation unit}} rev-error{{only one reverse_offload clause can appear on a requires directive in a single translation unit}} rev-error{{only one dynamic_allocators clause can appear on a requires directive in a single translation unit}} rev-error {{only one atomic_default_mem_order clause can appear on a requires directive in a single translation unit}}
#endif
namespace A {
- #pragma omp requires unified_address // expected-error {{Only one unified_address clause can appear on a requires directive in a single translation unit}}
+ #pragma omp requires unified_address // expected-error {{only one unified_address clause can appear on a requires directive in a single translation unit}}
namespace B {
- #pragma omp requires unified_address // expected-error {{Only one unified_address clause can appear on a requires directive in a single translation unit}}
+ #pragma omp requires unified_address // expected-error {{only one unified_address clause can appear on a requires directive in a single translation unit}}
}
}
diff --git a/clang/test/OpenMP/target_device_ancestor_messages.cpp b/clang/test/OpenMP/target_device_ancestor_messages.cpp
index bc1d668..e6705b36 100644
--- a/clang/test/OpenMP/target_device_ancestor_messages.cpp
+++ b/clang/test/OpenMP/target_device_ancestor_messages.cpp
@@ -2,6 +2,6 @@
// RUN: %clang_cc1 -triple=x86_64 -verify -fopenmp-simd -fopenmp-targets=x86_64 -x c++ -fexceptions -fcxx-exceptions %s
void bar() {
-#pragma omp target device(ancestor : 1) // expected-error {{Device clause with ancestor device-modifier used without specifying 'requires reverse_offload'}}
+#pragma omp target device(ancestor : 1) // expected-error {{device clause with ancestor device-modifier used without specifying 'requires reverse_offload'}}
;
}
diff --git a/clang/test/OpenMP/target_firstprivate_messages.cpp b/clang/test/OpenMP/target_firstprivate_messages.cpp
index 9b21129..2eafb36 100644
--- a/clang/test/OpenMP/target_firstprivate_messages.cpp
+++ b/clang/test/OpenMP/target_firstprivate_messages.cpp
@@ -56,7 +56,7 @@ public:
S5(int v) : a(v) {}
S5 &operator=(S5 &s) {
#pragma omp target firstprivate(a) firstprivate(this->a) firstprivate(s.a) // expected-error {{expected variable name or data member of current class}}
- for (int k = 0; k < s.a; ++k) // expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (int k = 0; k < s.a; ++k) // expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
++s.a;
return *this;
}
diff --git a/clang/test/OpenMP/target_map_messages.cpp b/clang/test/OpenMP/target_map_messages.cpp
index 3bd432b..10f4668 100644
--- a/clang/test/OpenMP/target_map_messages.cpp
+++ b/clang/test/OpenMP/target_map_messages.cpp
@@ -681,13 +681,13 @@ T tmain(T argc) {
#pragma omp target data map(tofrom: argc > 0 ? x : y) // lt50-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}} ge50-error 2 {{expected addressable lvalue in 'map' clause}}
#pragma omp target data map(argc)
#pragma omp target data map(S1) // expected-error {{'S1' does not refer to a value}}
-#pragma omp target data map(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} warn-warning 2 {{Type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} warn-warning 2 {{Type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}}
-#pragma omp target data map(ba) // warn-warning 2 {{Type 'const S2 [5]' is not trivially copyable and not guaranteed to be mapped correctly}}
-#pragma omp target data map(ca) // warn-warning 2 {{Type 'const S3 [5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp target data map(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} warn-warning 2 {{type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} warn-warning 2 {{type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp target data map(ba) // warn-warning 2 {{type 'const S2 [5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp target data map(ca) // warn-warning 2 {{type 'const S3 [5]' is not trivially copyable and not guaranteed to be mapped correctly}}
#pragma omp target data map(da)
#pragma omp target data map(S2::S2s)
#pragma omp target data map(S2::S2sc)
-#pragma omp target data map(e, g) // warn-warning 2 {{Type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} warn-warning 2 {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp target data map(e, g) // warn-warning 2 {{type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} warn-warning 2 {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
#pragma omp target data map(h) // expected-error {{threadprivate variables are not allowed in 'map' clause}}
#pragma omp target data map(k) map(k) // lt50-error 2 {{variable already marked as mapped in current construct}} lt50-note 2 {{used here}}
#pragma omp target map(k), map(k[:5]) // lt50-error 2 {{pointer cannot be mapped along with a section derived from itself}} lt50-note 2 {{used here}}
@@ -815,14 +815,14 @@ int main(int argc, char **argv) {
#pragma omp target data map(tofrom: argc > 0 ? argv[1] : argv[2]) // lt50-error {{expected expression containing only member accesses and/or array sections based on named variables}} ge50-error {{expected addressable lvalue in 'map' clause}}
#pragma omp target data map(argc)
#pragma omp target data map(S1) // expected-error {{'S1' does not refer to a value}}
-#pragma omp target data map(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} warn-warning {{Type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} warn-warning {{Type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp target data map(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} warn-warning {{type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} warn-warning {{type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}}
#pragma omp target data map(argv[1])
-#pragma omp target data map(ba) // warn-warning {{Type 'const S2 [5]' is not trivially copyable and not guaranteed to be mapped correctly}}
-#pragma omp target data map(ca) // warn-warning {{Type 'const S3 [5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp target data map(ba) // warn-warning {{type 'const S2 [5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp target data map(ca) // warn-warning {{type 'const S3 [5]' is not trivially copyable and not guaranteed to be mapped correctly}}
#pragma omp target data map(da)
#pragma omp target data map(S2::S2s)
#pragma omp target data map(S2::S2sc)
-#pragma omp target data map(e, g) // warn-warning {{Type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} warn-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp target data map(e, g) // warn-warning {{type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} warn-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
#pragma omp target data map(h) // expected-error {{threadprivate variables are not allowed in 'map' clause}}
#pragma omp target data map(k), map(k) // lt50-error {{variable already marked as mapped in current construct}} lt50-note {{used here}}
#pragma omp target map(k), map(k[:5]) // lt50-error {{pointer cannot be mapped along with a section derived from itself}} lt50-note {{used here}}
@@ -872,7 +872,7 @@ int main(int argc, char **argv) {
{}
#pragma omp target firstprivate(j) map(j) // expected-error {{firstprivate variable cannot be in a map clause in '#pragma omp target' directive}} expected-note {{defined as firstprivate}}
{}
-#pragma omp target map(m) // warn-warning {{Type 'S6<int>' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp target map(m) // warn-warning {{type 'S6<int>' is not trivially copyable and not guaranteed to be mapped correctly}}
{}
#pragma omp target
{ s.a++; }
@@ -920,7 +920,7 @@ int main(int argc, char **argv) {
{ s.a++; }
#pragma omp target map(s.s.s.b[:2])
{ s.s.s.b[0]++; }
-#pragma omp target map(s8[0:1], s9) // warn-warning {{Type 'class S8' is not trivially copyable and not guaranteed to be mapped correctly}} warn-warning {{Type 'class S9' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp target map(s8[0:1], s9) // warn-warning {{type 'class S8' is not trivially copyable and not guaranteed to be mapped correctly}} warn-warning {{type 'class S9' is not trivially copyable and not guaranteed to be mapped correctly}}
{}
int **BB, *offset, *a;
diff --git a/clang/test/OpenMP/target_parallel_for_private_messages.cpp b/clang/test/OpenMP/target_parallel_for_private_messages.cpp
index 1c31bad..81b4be4 100644
--- a/clang/test/OpenMP/target_parallel_for_private_messages.cpp
+++ b/clang/test/OpenMP/target_parallel_for_private_messages.cpp
@@ -56,7 +56,7 @@ public:
S5(int v) : a(v) {}
S5 &operator=(S5 &s) {
#pragma omp target parallel for private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
- for (int k = 0; k < s.a; ++k) // expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (int k = 0; k < s.a; ++k) // expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
++s.a;
return *this;
}
diff --git a/clang/test/OpenMP/target_parallel_for_simd_private_messages.cpp b/clang/test/OpenMP/target_parallel_for_simd_private_messages.cpp
index db9d495..c9b5bac 100644
--- a/clang/test/OpenMP/target_parallel_for_simd_private_messages.cpp
+++ b/clang/test/OpenMP/target_parallel_for_simd_private_messages.cpp
@@ -56,7 +56,7 @@ public:
S5(int v) : a(v) {}
S5 &operator=(S5 &s) {
#pragma omp target parallel for simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
- for (int k = 0; k < s.a; ++k) // expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (int k = 0; k < s.a; ++k) // expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
++s.a;
return *this;
}
diff --git a/clang/test/OpenMP/target_private_messages.cpp b/clang/test/OpenMP/target_private_messages.cpp
index 7ee0c8c..8cdd3a11 100644
--- a/clang/test/OpenMP/target_private_messages.cpp
+++ b/clang/test/OpenMP/target_private_messages.cpp
@@ -50,7 +50,7 @@ public:
S5(int v) : a(v) {}
S5 &operator=(S5 &s) {
#pragma omp target private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
- for (int k = 0; k < s.a; ++k) // expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (int k = 0; k < s.a; ++k) // expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
++s.a;
return *this;
}
diff --git a/clang/test/OpenMP/target_simd_private_messages.cpp b/clang/test/OpenMP/target_simd_private_messages.cpp
index 4a55a50..f6e4e71 100644
--- a/clang/test/OpenMP/target_simd_private_messages.cpp
+++ b/clang/test/OpenMP/target_simd_private_messages.cpp
@@ -56,7 +56,7 @@ public:
S5(int v) : a(v) {}
S5 &operator=(S5 &s) {
#pragma omp target simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
- for (int k = 0; k < s.a; ++k) // expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (int k = 0; k < s.a; ++k) // expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
++s.a;
return *this;
}
diff --git a/clang/test/OpenMP/target_teams_distribute_firstprivate_messages.cpp b/clang/test/OpenMP/target_teams_distribute_firstprivate_messages.cpp
index fccf551..195af52 100644
--- a/clang/test/OpenMP/target_teams_distribute_firstprivate_messages.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_firstprivate_messages.cpp
@@ -119,7 +119,7 @@ int main(int argc, char **argv) {
for (i = 0; i < argc; ++i) foo();
#pragma omp target
-#pragma omp teams distribute firstprivate(ca) // expected-error {{no matching constructor for initialization of 'S3'}} expected-warning {{Type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp teams distribute firstprivate(ca) // expected-error {{no matching constructor for initialization of 'S3'}} expected-warning {{type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i) foo();
#pragma omp target teams distribute firstprivate(da, z)
diff --git a/clang/test/OpenMP/target_update_messages.cpp b/clang/test/OpenMP/target_update_messages.cpp
index 2bf0ade..83191059 100644
--- a/clang/test/OpenMP/target_update_messages.cpp
+++ b/clang/test/OpenMP/target_update_messages.cpp
@@ -18,14 +18,14 @@ static int y;
#pragma omp declare target(y)
void yyy() {
-#pragma omp target update to(y) // expected-error {{the host cannot update a declare target variable that is not externally visible.}}
+#pragma omp target update to(y) // expected-error {{the host cannot update a declare target variable that is not externally visible}}
}
int __attribute__((visibility("hidden"))) z;
#pragma omp declare target(z)
void zzz() {
-#pragma omp target update from(z) // expected-error {{the host cannot update a declare target variable that is not externally visible.}}
+#pragma omp target update from(z) // expected-error {{the host cannot update a declare target variable that is not externally visible}}
}
void foo() {
diff --git a/clang/test/OpenMP/teams_distribute_loop_messages.cpp b/clang/test/OpenMP/teams_distribute_loop_messages.cpp
index 167f653e..e5f1466 100644
--- a/clang/test/OpenMP/teams_distribute_loop_messages.cpp
+++ b/clang/test/OpenMP/teams_distribute_loop_messages.cpp
@@ -416,7 +416,7 @@ int test_with_random_access_iterator() {
Iter0 begin0, end0;
#pragma omp target
#pragma omp teams distribute
- for (GoodIter I = begin; I < end; ++I) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I = begin; I < end; ++I) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute
@@ -425,31 +425,31 @@ int test_with_random_access_iterator() {
++I;
#pragma omp target
#pragma omp teams distribute
- for (GoodIter I = begin; I >= end; --I) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I = begin; I >= end; --I) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute
// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
- for (GoodIter I(begin); I < end; ++I) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(begin); I < end; ++I) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute
// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
- for (GoodIter I(nullptr); I < end; ++I) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(nullptr); I < end; ++I) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute
// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
- for (GoodIter I(0); I < end; ++I) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(0); I < end; ++I) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute
// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
- for (GoodIter I(1, 2); I < end; ++I) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(1, 2); I < end; ++I) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute
- for (begin = GoodIter(0); begin < end; ++begin) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (begin = GoodIter(0); begin < end; ++begin) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++begin;
#pragma omp target
#pragma omp teams distribute
@@ -464,7 +464,7 @@ int test_with_random_access_iterator() {
++begin;
#pragma omp target
#pragma omp teams distribute
- for (begin = end; begin < end; ++begin) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (begin = end; begin < end; ++begin) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++begin;
#pragma omp target
#pragma omp teams distribute
@@ -489,7 +489,7 @@ int test_with_random_access_iterator() {
++I;
#pragma omp target
#pragma omp teams distribute
- for (GoodIter I = begin; I >= end; I = I - 1) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I = begin; I >= end; I = I - 1) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute
@@ -551,19 +551,19 @@ public:
#pragma omp teams distribute
// expected-note@+2 {{loop step is expected to be positive due to this condition}}
// expected-error@+1 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
- for (IT I = begin; I < end; I = I + ST) { // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I < end; I = I + ST) { // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
#pragma omp target
#pragma omp teams distribute
// expected-note@+2 {{loop step is expected to be positive due to this condition}}
// expected-error@+1 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
- for (IT I = begin; I <= end; I += ST) { // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I <= end; I += ST) { // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
#pragma omp target
#pragma omp teams distribute
- for (IT I = begin; I < end; ++I) { // expected-warning 4 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I < end; ++I) { // expected-warning 4 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
}
@@ -599,7 +599,7 @@ int dotest_gt(IT begin, IT end) {
#pragma omp target
#pragma omp teams distribute
- for (IT I = begin; I < end; I += TC<int, ST>::step()) { // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I < end; I += TC<int, ST>::step()) { // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
}
@@ -702,7 +702,7 @@ void test_loop_firstprivate_lastprivate() {
S s(4);
// expected-error@+2 {{lastprivate variable cannot be firstprivate}} expected-note@+2 {{defined as lastprivate}}
#pragma omp target
-#pragma omp teams distribute lastprivate(s) firstprivate(s) // expected-error {{calling a private constructor of class 'S'}} expected-warning {{Type 'S' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp teams distribute lastprivate(s) firstprivate(s) // expected-error {{calling a private constructor of class 'S'}} expected-warning {{type 'S' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 16; ++i)
;
}
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_loop_messages.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_loop_messages.cpp
index cdfc5ea..67e3ce4 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_loop_messages.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_loop_messages.cpp
@@ -414,7 +414,7 @@ int test_with_random_access_iterator() {
Iter0 begin0, end0;
#pragma omp target
#pragma omp teams distribute parallel for
- for (GoodIter I = begin; I < end; ++I) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I = begin; I < end; ++I) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute parallel for
@@ -423,31 +423,31 @@ int test_with_random_access_iterator() {
++I;
#pragma omp target
#pragma omp teams distribute parallel for
- for (GoodIter I = begin; I >= end; --I) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I = begin; I >= end; --I) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute parallel for
// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
- for (GoodIter I(begin); I < end; ++I) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(begin); I < end; ++I) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute parallel for
// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
- for (GoodIter I(nullptr); I < end; ++I) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(nullptr); I < end; ++I) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute parallel for
// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
- for (GoodIter I(0); I < end; ++I) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(0); I < end; ++I) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute parallel for
// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
- for (GoodIter I(1, 2); I < end; ++I) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(1, 2); I < end; ++I) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute parallel for
- for (begin = GoodIter(0); begin < end; ++begin) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (begin = GoodIter(0); begin < end; ++begin) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++begin;
#pragma omp target
#pragma omp teams distribute parallel for
@@ -462,7 +462,7 @@ int test_with_random_access_iterator() {
++begin;
#pragma omp target
#pragma omp teams distribute parallel for
- for (begin = end; begin < end; ++begin) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (begin = end; begin < end; ++begin) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++begin;
#pragma omp target
#pragma omp teams distribute parallel for
@@ -487,7 +487,7 @@ int test_with_random_access_iterator() {
++I;
#pragma omp target
#pragma omp teams distribute parallel for
- for (GoodIter I = begin; I >= end; I = I - 1) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I = begin; I >= end; I = I - 1) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute parallel for
@@ -549,19 +549,19 @@ public:
#pragma omp teams distribute parallel for
// expected-note@+2 {{loop step is expected to be positive due to this condition}}
// expected-error@+1 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
- for (IT I = begin; I < end; I = I + ST) { // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I < end; I = I + ST) { // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
#pragma omp target
#pragma omp teams distribute parallel for
// expected-note@+2 {{loop step is expected to be positive due to this condition}}
// expected-error@+1 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
- for (IT I = begin; I <= end; I += ST) { // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I <= end; I += ST) { // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
#pragma omp target
#pragma omp teams distribute parallel for
- for (IT I = begin; I < end; ++I) { // expected-warning 4 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I < end; ++I) { // expected-warning 4 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
}
@@ -597,7 +597,7 @@ int dotest_gt(IT begin, IT end) {
#pragma omp target
#pragma omp teams distribute parallel for
- for (IT I = begin; I < end; I += TC<int, ST>::step()) { // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I < end; I += TC<int, ST>::step()) { // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
}
@@ -697,7 +697,7 @@ void test_loop_firstprivate_lastprivate() {
S s(4);
// expected-error@+2 {{lastprivate variable cannot be firstprivate}} expected-note@+2 {{defined as lastprivate}}
#pragma omp target
-#pragma omp teams distribute parallel for lastprivate(s) firstprivate(s) // expected-error {{calling a private constructor of class 'S'}} expected-warning {{Type 'S' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp teams distribute parallel for lastprivate(s) firstprivate(s) // expected-error {{calling a private constructor of class 'S'}} expected-warning {{type 'S' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 16; ++i)
;
}
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_loop_messages.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_loop_messages.cpp
index 645035a..7ee8b9c 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_loop_messages.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_loop_messages.cpp
@@ -416,7 +416,7 @@ int test_with_random_access_iterator() {
Iter0 begin0, end0;
#pragma omp target
#pragma omp teams distribute parallel for simd
- for (GoodIter I = begin; I < end; ++I) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I = begin; I < end; ++I) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute parallel for simd
@@ -425,31 +425,31 @@ int test_with_random_access_iterator() {
++I;
#pragma omp target
#pragma omp teams distribute parallel for simd
- for (GoodIter I = begin; I >= end; --I) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I = begin; I >= end; --I) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute parallel for simd
// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
- for (GoodIter I(begin); I < end; ++I) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(begin); I < end; ++I) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute parallel for simd
// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
- for (GoodIter I(nullptr); I < end; ++I) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(nullptr); I < end; ++I) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute parallel for simd
// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
- for (GoodIter I(0); I < end; ++I) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(0); I < end; ++I) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute parallel for simd
// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
- for (GoodIter I(1, 2); I < end; ++I) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(1, 2); I < end; ++I) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute parallel for simd
- for (begin = GoodIter(0); begin < end; ++begin) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (begin = GoodIter(0); begin < end; ++begin) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++begin;
#pragma omp target
#pragma omp teams distribute parallel for simd
@@ -464,7 +464,7 @@ int test_with_random_access_iterator() {
++begin;
#pragma omp target
#pragma omp teams distribute parallel for simd
- for (begin = end; begin < end; ++begin) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (begin = end; begin < end; ++begin) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++begin;
#pragma omp target
#pragma omp teams distribute parallel for simd
@@ -489,7 +489,7 @@ int test_with_random_access_iterator() {
++I;
#pragma omp target
#pragma omp teams distribute parallel for simd
- for (GoodIter I = begin; I >= end; I = I - 1) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I = begin; I >= end; I = I - 1) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute parallel for simd
@@ -551,19 +551,19 @@ public:
#pragma omp teams distribute parallel for simd
// expected-note@+2 {{loop step is expected to be positive due to this condition}}
// expected-error@+1 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
- for (IT I = begin; I < end; I = I + ST) { // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I < end; I = I + ST) { // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
#pragma omp target
#pragma omp teams distribute parallel for simd
// expected-note@+2 {{loop step is expected to be positive due to this condition}}
// expected-error@+1 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
- for (IT I = begin; I <= end; I += ST) { // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I <= end; I += ST) { // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
#pragma omp target
#pragma omp teams distribute parallel for simd
- for (IT I = begin; I < end; ++I) { // expected-warning 4 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I < end; ++I) { // expected-warning 4 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
}
@@ -599,7 +599,7 @@ int dotest_gt(IT begin, IT end) {
#pragma omp target
#pragma omp teams distribute parallel for simd
- for (IT I = begin; I < end; I += TC<int, ST>::step()) { // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I < end; I += TC<int, ST>::step()) { // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
}
@@ -699,7 +699,7 @@ void test_loop_firstprivate_lastprivate() {
S s(4);
// expected-error@+2 {{lastprivate variable cannot be firstprivate}} expected-note@+2 {{defined as lastprivate}}
#pragma omp target
-#pragma omp teams distribute parallel for simd lastprivate(s) firstprivate(s) // expected-error {{calling a private constructor of class 'S'}} expected-warning {{Type 'S' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp teams distribute parallel for simd lastprivate(s) firstprivate(s) // expected-error {{calling a private constructor of class 'S'}} expected-warning {{type 'S' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 16; ++i)
;
}
diff --git a/clang/test/OpenMP/teams_distribute_simd_loop_messages.cpp b/clang/test/OpenMP/teams_distribute_simd_loop_messages.cpp
index 13eef6a..8bfddbf 100644
--- a/clang/test/OpenMP/teams_distribute_simd_loop_messages.cpp
+++ b/clang/test/OpenMP/teams_distribute_simd_loop_messages.cpp
@@ -416,7 +416,7 @@ int test_with_random_access_iterator() {
Iter0 begin0, end0;
#pragma omp target
#pragma omp teams distribute simd
- for (GoodIter I = begin; I < end; ++I) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I = begin; I < end; ++I) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute simd
@@ -425,31 +425,31 @@ int test_with_random_access_iterator() {
++I;
#pragma omp target
#pragma omp teams distribute simd
- for (GoodIter I = begin; I >= end; --I) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I = begin; I >= end; --I) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute simd
// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
- for (GoodIter I(begin); I < end; ++I) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(begin); I < end; ++I) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute simd
// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
- for (GoodIter I(nullptr); I < end; ++I) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(nullptr); I < end; ++I) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute simd
// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
- for (GoodIter I(0); I < end; ++I) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(0); I < end; ++I) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute simd
// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
- for (GoodIter I(1, 2); I < end; ++I) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(1, 2); I < end; ++I) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute simd
- for (begin = GoodIter(0); begin < end; ++begin) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (begin = GoodIter(0); begin < end; ++begin) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++begin;
#pragma omp target
#pragma omp teams distribute simd
@@ -464,7 +464,7 @@ int test_with_random_access_iterator() {
++begin;
#pragma omp target
#pragma omp teams distribute simd
- for (begin = end; begin < end; ++begin) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (begin = end; begin < end; ++begin) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++begin;
#pragma omp target
#pragma omp teams distribute simd
@@ -489,7 +489,7 @@ int test_with_random_access_iterator() {
++I;
#pragma omp target
#pragma omp teams distribute simd
- for (GoodIter I = begin; I >= end; I = I - 1) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I = begin; I >= end; I = I - 1) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute simd
@@ -551,19 +551,19 @@ public:
#pragma omp teams distribute simd
// expected-note@+2 {{loop step is expected to be positive due to this condition}}
// expected-error@+1 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
- for (IT I = begin; I < end; I = I + ST) { // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I < end; I = I + ST) { // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
#pragma omp target
#pragma omp teams distribute simd
// expected-note@+2 {{loop step is expected to be positive due to this condition}}
// expected-error@+1 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
- for (IT I = begin; I <= end; I += ST) { // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I <= end; I += ST) { // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
#pragma omp target
#pragma omp teams distribute simd
- for (IT I = begin; I < end; ++I) { // expected-warning 4 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I < end; ++I) { // expected-warning 4 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
}
@@ -599,7 +599,7 @@ int dotest_gt(IT begin, IT end) {
#pragma omp target
#pragma omp teams distribute simd
- for (IT I = begin; I < end; I += TC<int, ST>::step()) { // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I < end; I += TC<int, ST>::step()) { // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
}
@@ -699,7 +699,7 @@ void test_loop_firstprivate_lastprivate() {
S s(4);
// expected-error@+2 {{lastprivate variable cannot be firstprivate}} expected-note@+2 {{defined as lastprivate}}
#pragma omp target
-#pragma omp teams distribute simd lastprivate(s) firstprivate(s) // expected-error {{calling a private constructor of class 'S'}} expected-warning {{Type 'S' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp teams distribute simd lastprivate(s) firstprivate(s) // expected-error {{calling a private constructor of class 'S'}} expected-warning {{type 'S' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 16; ++i)
;
}
diff --git a/clang/test/OpenMP/tile_codegen.cpp b/clang/test/OpenMP/tile_codegen.cpp
index 93a3a14..5fd5609 100644
--- a/clang/test/OpenMP/tile_codegen.cpp
+++ b/clang/test/OpenMP/tile_codegen.cpp
@@ -1,10 +1,10 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --version 4
// Check code generation
-// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fclang-abi-compat=latest -fopenmp -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fclang-abi-compat=latest -std=c++20 -fopenmp -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1
// Check same results after serialization round-trip
-// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fclang-abi-compat=latest -fopenmp -emit-pch -o %t %s
-// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fclang-abi-compat=latest -fopenmp -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK2
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fclang-abi-compat=latest -std=c++20 -fopenmp -emit-pch -o %t %s
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fclang-abi-compat=latest -std=c++20 -fopenmp -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK2
// expected-no-diagnostics
#ifndef HEADER
@@ -91,22 +91,38 @@ extern "C" void foo8(int a) {
}
+typedef struct { double array[12]; } data_t;
+extern "C" void foo9(data_t data) {
+#pragma omp tile sizes(5)
+ for (double v : data.array)
+ body(v);
+}
+
+
+extern "C" void foo10(data_t data) {
+#pragma omp tile sizes(5)
+ for (double c = 42.0; double v : data.array)
+ body(c, v);
+}
+
+
#endif /* HEADER */
-// CHECK1-LABEL: define {{[^@]+}}@body
-// CHECK1-SAME: (...) #[[ATTR0:[0-9]+]] {
+
+// CHECK1-LABEL: define dso_local void @body(
+// CHECK1-SAME: ...) #[[ATTR0:[0-9]+]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: ret void
//
//
-// CHECK1-LABEL: define {{[^@]+}}@__cxx_global_var_init
-// CHECK1-SAME: () #[[ATTR1:[0-9]+]] section ".text.startup" {
+// CHECK1-LABEL: define internal void @__cxx_global_var_init(
+// CHECK1-SAME: ) #[[ATTR1:[0-9]+]] section ".text.startup" {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: call void @_ZN1SC1Ev(ptr noundef nonnull align 4 dereferenceable(4) @s)
// CHECK1-NEXT: ret void
//
//
-// CHECK1-LABEL: define {{[^@]+}}@_ZN1SC1Ev
-// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 {
+// CHECK1-LABEL: define linkonce_odr void @_ZN1SC1Ev(
+// CHECK1-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
@@ -115,50 +131,52 @@ extern "C" void foo8(int a) {
// CHECK1-NEXT: ret void
//
//
-// CHECK1-LABEL: define {{[^@]+}}@_ZN1SC2Ev
-// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 {
+// CHECK1-LABEL: define linkonce_odr void @_ZN1SC2Ev(
+// CHECK1-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT: [[I:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[I2:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: [[DOTFLOOR_0_IV_I:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTTILE_0_IV_I:%.*]] = alloca i32, align 4
// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
-// CHECK1-NEXT: [[I2:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0
-// CHECK1-NEXT: store ptr [[I2]], ptr [[I]], align 8
+// CHECK1-NEXT: [[I:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK1-NEXT: store i32 7, ptr [[I]], align 4
+// CHECK1-NEXT: [[I3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[THIS1]], i32 0, i32 0
+// CHECK1-NEXT: store ptr [[I3]], ptr [[I2]], align 8
// CHECK1-NEXT: store i32 0, ptr [[DOTFLOOR_0_IV_I]], align 4
// CHECK1-NEXT: br label [[FOR_COND:%.*]]
// CHECK1: for.cond:
// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 4
-// CHECK1-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END11:%.*]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END12:%.*]]
// CHECK1: for.body:
// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK1-NEXT: br label [[FOR_COND3:%.*]]
-// CHECK1: for.cond3:
+// CHECK1-NEXT: br label [[FOR_COND4:%.*]]
+// CHECK1: for.cond4:
// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 5
-// CHECK1-NEXT: [[CMP4:%.*]] = icmp slt i32 4, [[ADD]]
-// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1-NEXT: [[CMP5:%.*]] = icmp slt i32 4, [[ADD]]
+// CHECK1-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK1: cond.true:
// CHECK1-NEXT: br label [[COND_END:%.*]]
// CHECK1: cond.false:
// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK1-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP4]], 5
+// CHECK1-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP4]], 5
// CHECK1-NEXT: br label [[COND_END]]
// CHECK1: cond.end:
-// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 4, [[COND_TRUE]] ], [ [[ADD5]], [[COND_FALSE]] ]
-// CHECK1-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP2]], [[COND]]
-// CHECK1-NEXT: br i1 [[CMP6]], label [[FOR_BODY7:%.*]], label [[FOR_END:%.*]]
-// CHECK1: for.body7:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 4, [[COND_TRUE]] ], [ [[ADD6]], [[COND_FALSE]] ]
+// CHECK1-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP2]], [[COND]]
+// CHECK1-NEXT: br i1 [[CMP7]], label [[FOR_BODY8:%.*]], label [[FOR_END:%.*]]
+// CHECK1: for.body8:
// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP5]], 3
-// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 7, [[MUL]]
-// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I]], align 8
-// CHECK1-NEXT: store i32 [[ADD8]], ptr [[TMP6]], align 4
-// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[I]], align 8
+// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 7, [[MUL]]
+// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I2]], align 8
+// CHECK1-NEXT: store i32 [[ADD9]], ptr [[TMP6]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[I2]], align 8
// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP8]])
// CHECK1-NEXT: br label [[FOR_INC:%.*]]
@@ -166,20 +184,20 @@ extern "C" void foo8(int a) {
// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP9]], 1
// CHECK1-NEXT: store i32 [[INC]], ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK1-NEXT: br label [[FOR_COND3]], !llvm.loop [[LOOP3:![0-9]+]]
+// CHECK1-NEXT: br label [[FOR_COND4]], !llvm.loop [[LOOP3:![0-9]+]]
// CHECK1: for.end:
-// CHECK1-NEXT: br label [[FOR_INC9:%.*]]
-// CHECK1: for.inc9:
+// CHECK1-NEXT: br label [[FOR_INC10:%.*]]
+// CHECK1: for.inc10:
// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP10]], 5
-// CHECK1-NEXT: store i32 [[ADD10]], ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP10]], 5
+// CHECK1-NEXT: store i32 [[ADD11]], ptr [[DOTFLOOR_0_IV_I]], align 4
// CHECK1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]]
-// CHECK1: for.end11:
+// CHECK1: for.end12:
// CHECK1-NEXT: ret void
//
//
-// CHECK1-LABEL: define {{[^@]+}}@foo1
-// CHECK1-SAME: (i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR0]] {
+// CHECK1-LABEL: define dso_local void @foo1(
+// CHECK1-SAME: i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[START_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[END_ADDR:%.*]] = alloca i32, align 4
@@ -195,81 +213,83 @@ extern "C" void foo8(int a) {
// CHECK1-NEXT: store i32 [[END]], ptr [[END_ADDR]], align 4
// CHECK1-NEXT: store i32 [[STEP]], ptr [[STEP_ADDR]], align 4
// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[START_ADDR]], align 4
-// CHECK1-NEXT: store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
-// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[END_ADDR]], align 4
-// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
-// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTNEW_STEP]], align 4
-// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
-// CHECK1-NEXT: [[SUB:%.*]] = sub i32 [[TMP3]], [[TMP4]]
+// CHECK1-NEXT: store i32 [[TMP0]], ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[END_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub i32 [[TMP4]], [[TMP5]]
// CHECK1-NEXT: [[SUB3:%.*]] = sub i32 [[SUB]], 1
-// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
-// CHECK1-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], [[TMP5]]
// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
-// CHECK1-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP6]]
+// CHECK1-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], [[TMP6]]
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP7]]
// CHECK1-NEXT: [[SUB4:%.*]] = sub i32 [[DIV]], 1
// CHECK1-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4
// CHECK1-NEXT: store i32 0, ptr [[DOTFLOOR_0_IV_I]], align 4
// CHECK1-NEXT: br label [[FOR_COND:%.*]]
// CHECK1: for.cond:
-// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK1-NEXT: [[ADD5:%.*]] = add i32 [[TMP8]], 1
-// CHECK1-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP7]], [[ADD5]]
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: [[ADD5:%.*]] = add i32 [[TMP9]], 1
+// CHECK1-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP8]], [[ADD5]]
// CHECK1-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END17:%.*]]
// CHECK1: for.body:
-// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK1-NEXT: store i32 [[TMP9]], ptr [[DOTTILE_0_IV_I]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK1-NEXT: store i32 [[TMP10]], ptr [[DOTTILE_0_IV_I]], align 4
// CHECK1-NEXT: br label [[FOR_COND6:%.*]]
// CHECK1: for.cond6:
-// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK1-NEXT: [[ADD7:%.*]] = add i32 [[TMP11]], 1
-// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP12]], 5
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: [[ADD7:%.*]] = add i32 [[TMP12]], 1
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK1-NEXT: [[ADD8:%.*]] = add i32 [[TMP13]], 5
// CHECK1-NEXT: [[CMP9:%.*]] = icmp ult i32 [[ADD7]], [[ADD8]]
// CHECK1-NEXT: br i1 [[CMP9]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK1: cond.true:
-// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK1-NEXT: [[ADD10:%.*]] = add i32 [[TMP13]], 1
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: [[ADD10:%.*]] = add i32 [[TMP14]], 1
// CHECK1-NEXT: br label [[COND_END:%.*]]
// CHECK1: cond.false:
-// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP14]], 5
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK1-NEXT: [[ADD11:%.*]] = add i32 [[TMP15]], 5
// CHECK1-NEXT: br label [[COND_END]]
// CHECK1: cond.end:
// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[ADD10]], [[COND_TRUE]] ], [ [[ADD11]], [[COND_FALSE]] ]
-// CHECK1-NEXT: [[CMP12:%.*]] = icmp ult i32 [[TMP10]], [[COND]]
+// CHECK1-NEXT: [[CMP12:%.*]] = icmp ult i32 [[TMP11]], [[COND]]
// CHECK1-NEXT: br i1 [[CMP12]], label [[FOR_BODY13:%.*]], label [[FOR_END:%.*]]
// CHECK1: for.body13:
-// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
-// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
-// CHECK1-NEXT: [[MUL:%.*]] = mul i32 [[TMP16]], [[TMP17]]
-// CHECK1-NEXT: [[ADD14:%.*]] = add i32 [[TMP15]], [[MUL]]
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul i32 [[TMP17]], [[TMP18]]
+// CHECK1-NEXT: [[ADD14:%.*]] = add i32 [[TMP16]], [[MUL]]
// CHECK1-NEXT: store i32 [[ADD14]], ptr [[I]], align 4
-// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4
-// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP18]])
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP19]])
// CHECK1-NEXT: br label [[FOR_INC:%.*]]
// CHECK1: for.inc:
-// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP19]], 1
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
+// CHECK1-NEXT: [[INC:%.*]] = add i32 [[TMP20]], 1
// CHECK1-NEXT: store i32 [[INC]], ptr [[DOTTILE_0_IV_I]], align 4
// CHECK1-NEXT: br label [[FOR_COND6]], !llvm.loop [[LOOP6:![0-9]+]]
// CHECK1: for.end:
// CHECK1-NEXT: br label [[FOR_INC15:%.*]]
// CHECK1: for.inc15:
-// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK1-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP20]], 5
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK1-NEXT: [[ADD16:%.*]] = add i32 [[TMP21]], 5
// CHECK1-NEXT: store i32 [[ADD16]], ptr [[DOTFLOOR_0_IV_I]], align 4
// CHECK1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]]
// CHECK1: for.end17:
// CHECK1-NEXT: ret void
//
//
-// CHECK1-LABEL: define {{[^@]+}}@foo2
-// CHECK1-SAME: (i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR0]] {
+// CHECK1-LABEL: define dso_local void @foo2(
+// CHECK1-SAME: i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[START_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[END_ADDR:%.*]] = alloca i32, align 4
@@ -381,8 +401,8 @@ extern "C" void foo8(int a) {
// CHECK1-NEXT: ret void
//
//
-// CHECK1-LABEL: define {{[^@]+}}@foo3
-// CHECK1-SAME: () #[[ATTR0]] {
+// CHECK1-LABEL: define dso_local void @foo3(
+// CHECK1-SAME: ) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
@@ -523,8 +543,8 @@ extern "C" void foo8(int a) {
// CHECK1-NEXT: ret void
//
//
-// CHECK1-LABEL: define {{[^@]+}}@foo4
-// CHECK1-SAME: () #[[ATTR0]] {
+// CHECK1-LABEL: define dso_local void @foo4(
+// CHECK1-SAME: ) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
@@ -676,8 +696,8 @@ extern "C" void foo8(int a) {
// CHECK1-NEXT: ret void
//
//
-// CHECK1-LABEL: define {{[^@]+}}@foo5
-// CHECK1-SAME: () #[[ATTR0]] {
+// CHECK1-LABEL: define dso_local void @foo5(
+// CHECK1-SAME: ) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
@@ -885,15 +905,15 @@ extern "C" void foo8(int a) {
// CHECK1-NEXT: ret void
//
//
-// CHECK1-LABEL: define {{[^@]+}}@foo6
-// CHECK1-SAME: () #[[ATTR0]] {
+// CHECK1-LABEL: define dso_local void @foo6(
+// CHECK1-SAME: ) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 0, ptr @foo6.omp_outlined)
// CHECK1-NEXT: ret void
//
//
-// CHECK1-LABEL: define {{[^@]+}}@foo6.omp_outlined
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4:[0-9]+]] {
+// CHECK1-LABEL: define internal void @foo6.omp_outlined(
+// CHECK1-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4:[0-9]+]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -988,15 +1008,15 @@ extern "C" void foo8(int a) {
// CHECK1-NEXT: ret void
//
//
-// CHECK1-LABEL: define {{[^@]+}}@tfoo7
-// CHECK1-SAME: () #[[ATTR0]] {
+// CHECK1-LABEL: define dso_local void @tfoo7(
+// CHECK1-SAME: ) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: call void @_Z4foo7IiTnT_Li3ETnS0_Li5EEvS0_S0_(i32 noundef 0, i32 noundef 42)
// CHECK1-NEXT: ret void
//
//
-// CHECK1-LABEL: define {{[^@]+}}@_Z4foo7IiTnT_Li3ETnS0_Li5EEvS0_S0_
-// CHECK1-SAME: (i32 noundef [[START:%.*]], i32 noundef [[END:%.*]]) #[[ATTR0]] comdat {
+// CHECK1-LABEL: define linkonce_odr void @_Z4foo7IiTnT_Li3ETnS0_Li5EEvS0_S0_(
+// CHECK1-SAME: i32 noundef [[START:%.*]], i32 noundef [[END:%.*]]) #[[ATTR0]] comdat {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[START_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[END_ADDR:%.*]] = alloca i32, align 4
@@ -1039,7 +1059,7 @@ extern "C" void foo8(int a) {
// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
// CHECK1-NEXT: [[ADD7:%.*]] = add i32 [[TMP9]], 1
// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP10]], 5
+// CHECK1-NEXT: [[ADD8:%.*]] = add i32 [[TMP10]], 5
// CHECK1-NEXT: [[CMP9:%.*]] = icmp ult i32 [[ADD7]], [[ADD8]]
// CHECK1-NEXT: br i1 [[CMP9]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK1: cond.true:
@@ -1048,7 +1068,7 @@ extern "C" void foo8(int a) {
// CHECK1-NEXT: br label [[COND_END:%.*]]
// CHECK1: cond.false:
// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP12]], 5
+// CHECK1-NEXT: [[ADD11:%.*]] = add i32 [[TMP12]], 5
// CHECK1-NEXT: br label [[COND_END]]
// CHECK1: cond.end:
// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[ADD10]], [[COND_TRUE]] ], [ [[ADD11]], [[COND_FALSE]] ]
@@ -1065,22 +1085,22 @@ extern "C" void foo8(int a) {
// CHECK1-NEXT: br label [[FOR_INC:%.*]]
// CHECK1: for.inc:
// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP16]], 1
+// CHECK1-NEXT: [[INC:%.*]] = add i32 [[TMP16]], 1
// CHECK1-NEXT: store i32 [[INC]], ptr [[DOTTILE_0_IV_I]], align 4
// CHECK1-NEXT: br label [[FOR_COND6]], !llvm.loop [[LOOP21:![0-9]+]]
// CHECK1: for.end:
// CHECK1-NEXT: br label [[FOR_INC15:%.*]]
// CHECK1: for.inc15:
// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK1-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP17]], 5
+// CHECK1-NEXT: [[ADD16:%.*]] = add i32 [[TMP17]], 5
// CHECK1-NEXT: store i32 [[ADD16]], ptr [[DOTFLOOR_0_IV_I]], align 4
// CHECK1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]]
// CHECK1: for.end17:
// CHECK1-NEXT: ret void
//
//
-// CHECK1-LABEL: define {{[^@]+}}@foo8
-// CHECK1-SAME: (i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK1-LABEL: define dso_local void @foo8(
+// CHECK1-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
@@ -1168,22 +1188,219 @@ extern "C" void foo8(int a) {
// CHECK1-NEXT: ret void
//
//
-// CHECK1-LABEL: define {{[^@]+}}@_GLOBAL__sub_I_tile_codegen.cpp
-// CHECK1-SAME: () #[[ATTR1]] section ".text.startup" {
+// CHECK1-LABEL: define dso_local void @foo9(
+// CHECK1-SAME: ptr noundef byval([[STRUCT_DATA_T:%.*]]) align 8 [[DATA:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__END2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_4:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTFLOOR_0_IV___BEGIN2:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTTILE_0_IV___BEGIN2:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[V:%.*]] = alloca double, align 8
+// CHECK1-NEXT: [[ARRAY:%.*]] = getelementptr inbounds [[STRUCT_DATA_T]], ptr [[DATA]], i32 0, i32 0
+// CHECK1-NEXT: store ptr [[ARRAY]], ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [12 x double], ptr [[TMP0]], i64 0, i64 0
+// CHECK1-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 12
+// CHECK1-NEXT: store ptr [[ADD_PTR]], ptr [[__END2]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY1:%.*]] = getelementptr inbounds [12 x double], ptr [[TMP1]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY1]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY2:%.*]] = getelementptr inbounds [12 x double], ptr [[TMP2]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY2]], ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[__END2]], align 8
+// CHECK1-NEXT: store ptr [[TMP3]], ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK1-NEXT: [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP4]] to i64
+// CHECK1-NEXT: [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP5]] to i64
+// CHECK1-NEXT: [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+// CHECK1-NEXT: [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i64 [[SUB]], 1
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i64 [[ADD]], 1
+// CHECK1-NEXT: [[SUB5:%.*]] = sub nsw i64 [[DIV]], 1
+// CHECK1-NEXT: store i64 [[SUB5]], ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK1-NEXT: store i64 0, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: br label [[FOR_COND:%.*]]
+// CHECK1: for.cond:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK1-NEXT: [[ADD6:%.*]] = add nsw i64 [[TMP7]], 1
+// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i64 [[TMP6]], [[ADD6]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END18:%.*]]
+// CHECK1: for.body:
+// CHECK1-NEXT: [[TMP8:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: store i64 [[TMP8]], ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: br label [[FOR_COND7:%.*]]
+// CHECK1: for.cond7:
+// CHECK1-NEXT: [[TMP9:%.*]] = load i64, ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP10:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i64 [[TMP10]], 1
+// CHECK1-NEXT: [[TMP11:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i64 [[TMP11]], 5
+// CHECK1-NEXT: [[CMP10:%.*]] = icmp slt i64 [[ADD8]], [[ADD9]]
+// CHECK1-NEXT: br i1 [[CMP10]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i64 [[TMP12]], 1
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP13:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: [[ADD12:%.*]] = add nsw i64 [[TMP13]], 5
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i64 [ [[ADD11]], [[COND_TRUE]] ], [ [[ADD12]], [[COND_FALSE]] ]
+// CHECK1-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP9]], [[COND]]
+// CHECK1-NEXT: br i1 [[CMP13]], label [[FOR_BODY14:%.*]], label [[FOR_END:%.*]]
+// CHECK1: for.body14:
+// CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = load i64, ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i64 [[TMP15]], 1
+// CHECK1-NEXT: [[ADD_PTR15:%.*]] = getelementptr inbounds double, ptr [[TMP14]], i64 [[MUL]]
+// CHECK1-NEXT: store ptr [[ADD_PTR15]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP16:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP17:%.*]] = load double, ptr [[TMP16]], align 8
+// CHECK1-NEXT: store double [[TMP17]], ptr [[V]], align 8
+// CHECK1-NEXT: [[TMP18:%.*]] = load double, ptr [[V]], align 8
+// CHECK1-NEXT: call void (...) @body(double noundef [[TMP18]])
+// CHECK1-NEXT: br label [[FOR_INC:%.*]]
+// CHECK1: for.inc:
+// CHECK1-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: [[INC:%.*]] = add nsw i64 [[TMP19]], 1
+// CHECK1-NEXT: store i64 [[INC]], ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: br label [[FOR_COND7]], !llvm.loop [[LOOP25:![0-9]+]]
+// CHECK1: for.end:
+// CHECK1-NEXT: br label [[FOR_INC16:%.*]]
+// CHECK1: for.inc16:
+// CHECK1-NEXT: [[TMP20:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: [[ADD17:%.*]] = add nsw i64 [[TMP20]], 5
+// CHECK1-NEXT: store i64 [[ADD17]], ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP26:![0-9]+]]
+// CHECK1: for.end18:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define dso_local void @foo10(
+// CHECK1-SAME: ptr noundef byval([[STRUCT_DATA_T:%.*]]) align 8 [[DATA:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[C:%.*]] = alloca double, align 8
+// CHECK1-NEXT: [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__END2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_4:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTFLOOR_0_IV___BEGIN2:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTTILE_0_IV___BEGIN2:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[V:%.*]] = alloca double, align 8
+// CHECK1-NEXT: store double 4.200000e+01, ptr [[C]], align 8
+// CHECK1-NEXT: [[ARRAY:%.*]] = getelementptr inbounds [[STRUCT_DATA_T]], ptr [[DATA]], i32 0, i32 0
+// CHECK1-NEXT: store ptr [[ARRAY]], ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [12 x double], ptr [[TMP0]], i64 0, i64 0
+// CHECK1-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 12
+// CHECK1-NEXT: store ptr [[ADD_PTR]], ptr [[__END2]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY1:%.*]] = getelementptr inbounds [12 x double], ptr [[TMP1]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY1]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY2:%.*]] = getelementptr inbounds [12 x double], ptr [[TMP2]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY2]], ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[__END2]], align 8
+// CHECK1-NEXT: store ptr [[TMP3]], ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK1-NEXT: [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP4]] to i64
+// CHECK1-NEXT: [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP5]] to i64
+// CHECK1-NEXT: [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+// CHECK1-NEXT: [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i64 [[SUB]], 1
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i64 [[ADD]], 1
+// CHECK1-NEXT: [[SUB5:%.*]] = sub nsw i64 [[DIV]], 1
+// CHECK1-NEXT: store i64 [[SUB5]], ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK1-NEXT: store i64 0, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: br label [[FOR_COND:%.*]]
+// CHECK1: for.cond:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK1-NEXT: [[ADD6:%.*]] = add nsw i64 [[TMP7]], 1
+// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i64 [[TMP6]], [[ADD6]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END18:%.*]]
+// CHECK1: for.body:
+// CHECK1-NEXT: [[TMP8:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: store i64 [[TMP8]], ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: br label [[FOR_COND7:%.*]]
+// CHECK1: for.cond7:
+// CHECK1-NEXT: [[TMP9:%.*]] = load i64, ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP10:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i64 [[TMP10]], 1
+// CHECK1-NEXT: [[TMP11:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i64 [[TMP11]], 5
+// CHECK1-NEXT: [[CMP10:%.*]] = icmp slt i64 [[ADD8]], [[ADD9]]
+// CHECK1-NEXT: br i1 [[CMP10]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i64 [[TMP12]], 1
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP13:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: [[ADD12:%.*]] = add nsw i64 [[TMP13]], 5
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i64 [ [[ADD11]], [[COND_TRUE]] ], [ [[ADD12]], [[COND_FALSE]] ]
+// CHECK1-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP9]], [[COND]]
+// CHECK1-NEXT: br i1 [[CMP13]], label [[FOR_BODY14:%.*]], label [[FOR_END:%.*]]
+// CHECK1: for.body14:
+// CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = load i64, ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i64 [[TMP15]], 1
+// CHECK1-NEXT: [[ADD_PTR15:%.*]] = getelementptr inbounds double, ptr [[TMP14]], i64 [[MUL]]
+// CHECK1-NEXT: store ptr [[ADD_PTR15]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP16:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP17:%.*]] = load double, ptr [[TMP16]], align 8
+// CHECK1-NEXT: store double [[TMP17]], ptr [[V]], align 8
+// CHECK1-NEXT: [[TMP18:%.*]] = load double, ptr [[C]], align 8
+// CHECK1-NEXT: [[TMP19:%.*]] = load double, ptr [[V]], align 8
+// CHECK1-NEXT: call void (...) @body(double noundef [[TMP18]], double noundef [[TMP19]])
+// CHECK1-NEXT: br label [[FOR_INC:%.*]]
+// CHECK1: for.inc:
+// CHECK1-NEXT: [[TMP20:%.*]] = load i64, ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: [[INC:%.*]] = add nsw i64 [[TMP20]], 1
+// CHECK1-NEXT: store i64 [[INC]], ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: br label [[FOR_COND7]], !llvm.loop [[LOOP27:![0-9]+]]
+// CHECK1: for.end:
+// CHECK1-NEXT: br label [[FOR_INC16:%.*]]
+// CHECK1: for.inc16:
+// CHECK1-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: [[ADD17:%.*]] = add nsw i64 [[TMP21]], 5
+// CHECK1-NEXT: store i64 [[ADD17]], ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]]
+// CHECK1: for.end18:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define internal void @_GLOBAL__sub_I_tile_codegen.cpp(
+// CHECK1-SAME: ) #[[ATTR1]] section ".text.startup" {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: call void @__cxx_global_var_init()
// CHECK1-NEXT: ret void
//
//
-// CHECK2-LABEL: define {{[^@]+}}@__cxx_global_var_init
-// CHECK2-SAME: () #[[ATTR0:[0-9]+]] section ".text.startup" {
+// CHECK2-LABEL: define internal void @__cxx_global_var_init(
+// CHECK2-SAME: ) #[[ATTR0:[0-9]+]] section ".text.startup" {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: call void @_ZN1SC1Ev(ptr noundef nonnull align 4 dereferenceable(4) @s)
// CHECK2-NEXT: ret void
//
//
-// CHECK2-LABEL: define {{[^@]+}}@_ZN1SC1Ev
-// CHECK2-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 {
+// CHECK2-LABEL: define linkonce_odr void @_ZN1SC1Ev(
+// CHECK2-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
// CHECK2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
@@ -1192,50 +1409,52 @@ extern "C" void foo8(int a) {
// CHECK2-NEXT: ret void
//
//
-// CHECK2-LABEL: define {{[^@]+}}@_ZN1SC2Ev
-// CHECK2-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK2-LABEL: define linkonce_odr void @_ZN1SC2Ev(
+// CHECK2-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK2-NEXT: [[I:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[I2:%.*]] = alloca ptr, align 8
// CHECK2-NEXT: [[DOTFLOOR_0_IV_I:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[DOTTILE_0_IV_I:%.*]] = alloca i32, align 4
// CHECK2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
// CHECK2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
-// CHECK2-NEXT: [[I2:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0
-// CHECK2-NEXT: store ptr [[I2]], ptr [[I]], align 8
+// CHECK2-NEXT: [[I:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK2-NEXT: store i32 7, ptr [[I]], align 4
+// CHECK2-NEXT: [[I3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[THIS1]], i32 0, i32 0
+// CHECK2-NEXT: store ptr [[I3]], ptr [[I2]], align 8
// CHECK2-NEXT: store i32 0, ptr [[DOTFLOOR_0_IV_I]], align 4
// CHECK2-NEXT: br label [[FOR_COND:%.*]]
// CHECK2: for.cond:
// CHECK2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 4
-// CHECK2-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END11:%.*]]
+// CHECK2-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END12:%.*]]
// CHECK2: for.body:
// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
// CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK2-NEXT: br label [[FOR_COND3:%.*]]
-// CHECK2: for.cond3:
+// CHECK2-NEXT: br label [[FOR_COND4:%.*]]
+// CHECK2: for.cond4:
// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 5
-// CHECK2-NEXT: [[CMP4:%.*]] = icmp slt i32 4, [[ADD]]
-// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK2-NEXT: [[CMP5:%.*]] = icmp slt i32 4, [[ADD]]
+// CHECK2-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK2: cond.true:
// CHECK2-NEXT: br label [[COND_END:%.*]]
// CHECK2: cond.false:
// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK2-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP4]], 5
+// CHECK2-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP4]], 5
// CHECK2-NEXT: br label [[COND_END]]
// CHECK2: cond.end:
-// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 4, [[COND_TRUE]] ], [ [[ADD5]], [[COND_FALSE]] ]
-// CHECK2-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP2]], [[COND]]
-// CHECK2-NEXT: br i1 [[CMP6]], label [[FOR_BODY7:%.*]], label [[FOR_END:%.*]]
-// CHECK2: for.body7:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 4, [[COND_TRUE]] ], [ [[ADD6]], [[COND_FALSE]] ]
+// CHECK2-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP2]], [[COND]]
+// CHECK2-NEXT: br i1 [[CMP7]], label [[FOR_BODY8:%.*]], label [[FOR_END:%.*]]
+// CHECK2: for.body8:
// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP5]], 3
-// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 7, [[MUL]]
-// CHECK2-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I]], align 8
-// CHECK2-NEXT: store i32 [[ADD8]], ptr [[TMP6]], align 4
-// CHECK2-NEXT: [[TMP7:%.*]] = load ptr, ptr [[I]], align 8
+// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 7, [[MUL]]
+// CHECK2-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I2]], align 8
+// CHECK2-NEXT: store i32 [[ADD9]], ptr [[TMP6]], align 4
+// CHECK2-NEXT: [[TMP7:%.*]] = load ptr, ptr [[I2]], align 8
// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP8]])
// CHECK2-NEXT: br label [[FOR_INC:%.*]]
@@ -1243,26 +1462,26 @@ extern "C" void foo8(int a) {
// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP9]], 1
// CHECK2-NEXT: store i32 [[INC]], ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK2-NEXT: br label [[FOR_COND3]], !llvm.loop [[LOOP3:![0-9]+]]
+// CHECK2-NEXT: br label [[FOR_COND4]], !llvm.loop [[LOOP3:![0-9]+]]
// CHECK2: for.end:
-// CHECK2-NEXT: br label [[FOR_INC9:%.*]]
-// CHECK2: for.inc9:
+// CHECK2-NEXT: br label [[FOR_INC10:%.*]]
+// CHECK2: for.inc10:
// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP10]], 5
-// CHECK2-NEXT: store i32 [[ADD10]], ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP10]], 5
+// CHECK2-NEXT: store i32 [[ADD11]], ptr [[DOTFLOOR_0_IV_I]], align 4
// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]]
-// CHECK2: for.end11:
+// CHECK2: for.end12:
// CHECK2-NEXT: ret void
//
//
-// CHECK2-LABEL: define {{[^@]+}}@body
-// CHECK2-SAME: (...) #[[ATTR1]] {
+// CHECK2-LABEL: define dso_local void @body(
+// CHECK2-SAME: ...) #[[ATTR1]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: ret void
//
//
-// CHECK2-LABEL: define {{[^@]+}}@foo1
-// CHECK2-SAME: (i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR1]] {
+// CHECK2-LABEL: define dso_local void @foo1(
+// CHECK2-SAME: i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR1]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[START_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[END_ADDR:%.*]] = alloca i32, align 4
@@ -1278,81 +1497,183 @@ extern "C" void foo8(int a) {
// CHECK2-NEXT: store i32 [[END]], ptr [[END_ADDR]], align 4
// CHECK2-NEXT: store i32 [[STEP]], ptr [[STEP_ADDR]], align 4
// CHECK2-NEXT: [[TMP0:%.*]] = load i32, ptr [[START_ADDR]], align 4
-// CHECK2-NEXT: store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
-// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[END_ADDR]], align 4
-// CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
-// CHECK2-NEXT: store i32 [[TMP2]], ptr [[DOTNEW_STEP]], align 4
-// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
-// CHECK2-NEXT: [[SUB:%.*]] = sub i32 [[TMP3]], [[TMP4]]
+// CHECK2-NEXT: store i32 [[TMP0]], ptr [[I]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[END_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP3]], ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub i32 [[TMP4]], [[TMP5]]
// CHECK2-NEXT: [[SUB3:%.*]] = sub i32 [[SUB]], 1
-// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
-// CHECK2-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], [[TMP5]]
// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
-// CHECK2-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP6]]
+// CHECK2-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], [[TMP6]]
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP7]]
// CHECK2-NEXT: [[SUB4:%.*]] = sub i32 [[DIV]], 1
// CHECK2-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4
// CHECK2-NEXT: store i32 0, ptr [[DOTFLOOR_0_IV_I]], align 4
// CHECK2-NEXT: br label [[FOR_COND:%.*]]
// CHECK2: for.cond:
-// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK2-NEXT: [[ADD5:%.*]] = add i32 [[TMP8]], 1
-// CHECK2-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP7]], [[ADD5]]
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: [[ADD5:%.*]] = add i32 [[TMP9]], 1
+// CHECK2-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP8]], [[ADD5]]
// CHECK2-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END17:%.*]]
// CHECK2: for.body:
-// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK2-NEXT: store i32 [[TMP9]], ptr [[DOTTILE_0_IV_I]], align 4
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK2-NEXT: store i32 [[TMP10]], ptr [[DOTTILE_0_IV_I]], align 4
// CHECK2-NEXT: br label [[FOR_COND6:%.*]]
// CHECK2: for.cond6:
-// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK2-NEXT: [[ADD7:%.*]] = add i32 [[TMP11]], 1
-// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP12]], 5
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: [[ADD7:%.*]] = add i32 [[TMP12]], 1
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK2-NEXT: [[ADD8:%.*]] = add i32 [[TMP13]], 5
// CHECK2-NEXT: [[CMP9:%.*]] = icmp ult i32 [[ADD7]], [[ADD8]]
// CHECK2-NEXT: br i1 [[CMP9]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK2: cond.true:
-// CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK2-NEXT: [[ADD10:%.*]] = add i32 [[TMP13]], 1
+// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: [[ADD10:%.*]] = add i32 [[TMP14]], 1
// CHECK2-NEXT: br label [[COND_END:%.*]]
// CHECK2: cond.false:
-// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP14]], 5
+// CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK2-NEXT: [[ADD11:%.*]] = add i32 [[TMP15]], 5
// CHECK2-NEXT: br label [[COND_END]]
// CHECK2: cond.end:
// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[ADD10]], [[COND_TRUE]] ], [ [[ADD11]], [[COND_FALSE]] ]
-// CHECK2-NEXT: [[CMP12:%.*]] = icmp ult i32 [[TMP10]], [[COND]]
+// CHECK2-NEXT: [[CMP12:%.*]] = icmp ult i32 [[TMP11]], [[COND]]
// CHECK2-NEXT: br i1 [[CMP12]], label [[FOR_BODY13:%.*]], label [[FOR_END:%.*]]
// CHECK2: for.body13:
-// CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
-// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK2-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
-// CHECK2-NEXT: [[MUL:%.*]] = mul i32 [[TMP16]], [[TMP17]]
-// CHECK2-NEXT: [[ADD14:%.*]] = add i32 [[TMP15]], [[MUL]]
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
+// CHECK2-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT: [[MUL:%.*]] = mul i32 [[TMP17]], [[TMP18]]
+// CHECK2-NEXT: [[ADD14:%.*]] = add i32 [[TMP16]], [[MUL]]
// CHECK2-NEXT: store i32 [[ADD14]], ptr [[I]], align 4
-// CHECK2-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4
-// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP18]])
+// CHECK2-NEXT: [[TMP19:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP19]])
// CHECK2-NEXT: br label [[FOR_INC:%.*]]
// CHECK2: for.inc:
-// CHECK2-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP19]], 1
+// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
+// CHECK2-NEXT: [[INC:%.*]] = add i32 [[TMP20]], 1
// CHECK2-NEXT: store i32 [[INC]], ptr [[DOTTILE_0_IV_I]], align 4
// CHECK2-NEXT: br label [[FOR_COND6]], !llvm.loop [[LOOP6:![0-9]+]]
// CHECK2: for.end:
// CHECK2-NEXT: br label [[FOR_INC15:%.*]]
// CHECK2: for.inc15:
-// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK2-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP20]], 5
+// CHECK2-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK2-NEXT: [[ADD16:%.*]] = add i32 [[TMP21]], 5
// CHECK2-NEXT: store i32 [[ADD16]], ptr [[DOTFLOOR_0_IV_I]], align 4
// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]]
// CHECK2: for.end17:
// CHECK2-NEXT: ret void
//
//
-// CHECK2-LABEL: define {{[^@]+}}@foo2
-// CHECK2-SAME: (i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR1]] {
+// CHECK2-LABEL: define dso_local void @foo10(
+// CHECK2-SAME: ptr noundef byval([[STRUCT_DATA_T:%.*]]) align 8 [[DATA:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[C:%.*]] = alloca double, align 8
+// CHECK2-NEXT: [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__END2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_4:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTFLOOR_0_IV___BEGIN2:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTTILE_0_IV___BEGIN2:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[V:%.*]] = alloca double, align 8
+// CHECK2-NEXT: store double 4.200000e+01, ptr [[C]], align 8
+// CHECK2-NEXT: [[ARRAY:%.*]] = getelementptr inbounds [[STRUCT_DATA_T]], ptr [[DATA]], i32 0, i32 0
+// CHECK2-NEXT: store ptr [[ARRAY]], ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [12 x double], ptr [[TMP0]], i64 0, i64 0
+// CHECK2-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 12
+// CHECK2-NEXT: store ptr [[ADD_PTR]], ptr [[__END2]], align 8
+// CHECK2-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY1:%.*]] = getelementptr inbounds [12 x double], ptr [[TMP1]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY1]], ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY2:%.*]] = getelementptr inbounds [12 x double], ptr [[TMP2]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY2]], ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[__END2]], align 8
+// CHECK2-NEXT: store ptr [[TMP3]], ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK2-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK2-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK2-NEXT: [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP4]] to i64
+// CHECK2-NEXT: [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP5]] to i64
+// CHECK2-NEXT: [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+// CHECK2-NEXT: [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8
+// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i64 [[SUB]], 1
+// CHECK2-NEXT: [[DIV:%.*]] = sdiv i64 [[ADD]], 1
+// CHECK2-NEXT: [[SUB5:%.*]] = sub nsw i64 [[DIV]], 1
+// CHECK2-NEXT: store i64 [[SUB5]], ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK2-NEXT: store i64 0, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: br label [[FOR_COND:%.*]]
+// CHECK2: for.cond:
+// CHECK2-NEXT: [[TMP6:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP7:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK2-NEXT: [[ADD6:%.*]] = add nsw i64 [[TMP7]], 1
+// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i64 [[TMP6]], [[ADD6]]
+// CHECK2-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END18:%.*]]
+// CHECK2: for.body:
+// CHECK2-NEXT: [[TMP8:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: store i64 [[TMP8]], ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: br label [[FOR_COND7:%.*]]
+// CHECK2: for.cond7:
+// CHECK2-NEXT: [[TMP9:%.*]] = load i64, ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP10:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i64 [[TMP10]], 1
+// CHECK2-NEXT: [[TMP11:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i64 [[TMP11]], 5
+// CHECK2-NEXT: [[CMP10:%.*]] = icmp slt i64 [[ADD8]], [[ADD9]]
+// CHECK2-NEXT: br i1 [[CMP10]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK2: cond.true:
+// CHECK2-NEXT: [[TMP12:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i64 [[TMP12]], 1
+// CHECK2-NEXT: br label [[COND_END:%.*]]
+// CHECK2: cond.false:
+// CHECK2-NEXT: [[TMP13:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: [[ADD12:%.*]] = add nsw i64 [[TMP13]], 5
+// CHECK2-NEXT: br label [[COND_END]]
+// CHECK2: cond.end:
+// CHECK2-NEXT: [[COND:%.*]] = phi i64 [ [[ADD11]], [[COND_TRUE]] ], [ [[ADD12]], [[COND_FALSE]] ]
+// CHECK2-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP9]], [[COND]]
+// CHECK2-NEXT: br i1 [[CMP13]], label [[FOR_BODY14:%.*]], label [[FOR_END:%.*]]
+// CHECK2: for.body14:
+// CHECK2-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK2-NEXT: [[TMP15:%.*]] = load i64, ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i64 [[TMP15]], 1
+// CHECK2-NEXT: [[ADD_PTR15:%.*]] = getelementptr inbounds double, ptr [[TMP14]], i64 [[MUL]]
+// CHECK2-NEXT: store ptr [[ADD_PTR15]], ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP16:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP17:%.*]] = load double, ptr [[TMP16]], align 8
+// CHECK2-NEXT: store double [[TMP17]], ptr [[V]], align 8
+// CHECK2-NEXT: [[TMP18:%.*]] = load double, ptr [[C]], align 8
+// CHECK2-NEXT: [[TMP19:%.*]] = load double, ptr [[V]], align 8
+// CHECK2-NEXT: call void (...) @body(double noundef [[TMP18]], double noundef [[TMP19]])
+// CHECK2-NEXT: br label [[FOR_INC:%.*]]
+// CHECK2: for.inc:
+// CHECK2-NEXT: [[TMP20:%.*]] = load i64, ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: [[INC:%.*]] = add nsw i64 [[TMP20]], 1
+// CHECK2-NEXT: store i64 [[INC]], ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: br label [[FOR_COND7]], !llvm.loop [[LOOP8:![0-9]+]]
+// CHECK2: for.end:
+// CHECK2-NEXT: br label [[FOR_INC16:%.*]]
+// CHECK2: for.inc16:
+// CHECK2-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: [[ADD17:%.*]] = add nsw i64 [[TMP21]], 5
+// CHECK2-NEXT: store i64 [[ADD17]], ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP9:![0-9]+]]
+// CHECK2: for.end18:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define dso_local void @foo2(
+// CHECK2-SAME: i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR1]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[START_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[END_ADDR:%.*]] = alloca i32, align 4
@@ -1438,34 +1759,34 @@ extern "C" void foo8(int a) {
// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTTILE_1_IV_J]], align 4
// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP14]], 1
// CHECK2-NEXT: store i32 [[INC]], ptr [[DOTTILE_1_IV_J]], align 4
-// CHECK2-NEXT: br label [[FOR_COND10]], !llvm.loop [[LOOP8:![0-9]+]]
+// CHECK2-NEXT: br label [[FOR_COND10]], !llvm.loop [[LOOP10:![0-9]+]]
// CHECK2: for.end:
// CHECK2-NEXT: br label [[FOR_INC22:%.*]]
// CHECK2: for.inc22:
// CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
// CHECK2-NEXT: [[INC23:%.*]] = add nsw i32 [[TMP15]], 1
// CHECK2-NEXT: store i32 [[INC23]], ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK2-NEXT: br label [[FOR_COND4]], !llvm.loop [[LOOP9:![0-9]+]]
+// CHECK2-NEXT: br label [[FOR_COND4]], !llvm.loop [[LOOP11:![0-9]+]]
// CHECK2: for.end24:
// CHECK2-NEXT: br label [[FOR_INC25:%.*]]
// CHECK2: for.inc25:
// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTFLOOR_1_IV_J]], align 4
// CHECK2-NEXT: [[ADD26:%.*]] = add nsw i32 [[TMP16]], 5
// CHECK2-NEXT: store i32 [[ADD26]], ptr [[DOTFLOOR_1_IV_J]], align 4
-// CHECK2-NEXT: br label [[FOR_COND1]], !llvm.loop [[LOOP10:![0-9]+]]
+// CHECK2-NEXT: br label [[FOR_COND1]], !llvm.loop [[LOOP12:![0-9]+]]
// CHECK2: for.end27:
// CHECK2-NEXT: br label [[FOR_INC28:%.*]]
// CHECK2: for.inc28:
// CHECK2-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
// CHECK2-NEXT: [[ADD29:%.*]] = add nsw i32 [[TMP17]], 5
// CHECK2-NEXT: store i32 [[ADD29]], ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP11:![0-9]+]]
+// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]]
// CHECK2: for.end30:
// CHECK2-NEXT: ret void
//
//
-// CHECK2-LABEL: define {{[^@]+}}@foo3
-// CHECK2-SAME: () #[[ATTR1]] {
+// CHECK2-LABEL: define dso_local void @foo3(
+// CHECK2-SAME: ) #[[ATTR1]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
@@ -1574,21 +1895,21 @@ extern "C" void foo8(int a) {
// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTTILE_1_IV_J]], align 4
// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP20]], 1
// CHECK2-NEXT: store i32 [[INC]], ptr [[DOTTILE_1_IV_J]], align 4
-// CHECK2-NEXT: br label [[FOR_COND15]], !llvm.loop [[LOOP12:![0-9]+]]
+// CHECK2-NEXT: br label [[FOR_COND15]], !llvm.loop [[LOOP14:![0-9]+]]
// CHECK2: for.end:
// CHECK2-NEXT: br label [[FOR_INC27:%.*]]
// CHECK2: for.inc27:
// CHECK2-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
// CHECK2-NEXT: [[INC28:%.*]] = add nsw i32 [[TMP21]], 1
// CHECK2-NEXT: store i32 [[INC28]], ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK2-NEXT: br label [[FOR_COND3]], !llvm.loop [[LOOP13:![0-9]+]]
+// CHECK2-NEXT: br label [[FOR_COND3]], !llvm.loop [[LOOP15:![0-9]+]]
// CHECK2: for.end29:
// CHECK2-NEXT: br label [[FOR_INC30:%.*]]
// CHECK2: for.inc30:
// CHECK2-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTFLOOR_1_IV_J]], align 4
// CHECK2-NEXT: [[ADD31:%.*]] = add nsw i32 [[TMP22]], 5
// CHECK2-NEXT: store i32 [[ADD31]], ptr [[DOTFLOOR_1_IV_J]], align 4
-// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]]
+// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]]
// CHECK2: for.end32:
// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
// CHECK2: omp.body.continue:
@@ -1606,8 +1927,8 @@ extern "C" void foo8(int a) {
// CHECK2-NEXT: ret void
//
//
-// CHECK2-LABEL: define {{[^@]+}}@foo4
-// CHECK2-SAME: () #[[ATTR1]] {
+// CHECK2-LABEL: define dso_local void @foo4(
+// CHECK2-SAME: ) #[[ATTR1]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
@@ -1727,21 +2048,21 @@ extern "C" void foo8(int a) {
// CHECK2-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTTILE_1_IV_J]], align 4
// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP22]], 1
// CHECK2-NEXT: store i32 [[INC]], ptr [[DOTTILE_1_IV_J]], align 4
-// CHECK2-NEXT: br label [[FOR_COND20]], !llvm.loop [[LOOP15:![0-9]+]]
+// CHECK2-NEXT: br label [[FOR_COND20]], !llvm.loop [[LOOP17:![0-9]+]]
// CHECK2: for.end:
// CHECK2-NEXT: br label [[FOR_INC32:%.*]]
// CHECK2: for.inc32:
// CHECK2-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
// CHECK2-NEXT: [[INC33:%.*]] = add nsw i32 [[TMP23]], 1
// CHECK2-NEXT: store i32 [[INC33]], ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK2-NEXT: br label [[FOR_COND8]], !llvm.loop [[LOOP16:![0-9]+]]
+// CHECK2-NEXT: br label [[FOR_COND8]], !llvm.loop [[LOOP18:![0-9]+]]
// CHECK2: for.end34:
// CHECK2-NEXT: br label [[FOR_INC35:%.*]]
// CHECK2: for.inc35:
// CHECK2-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTFLOOR_1_IV_J]], align 4
// CHECK2-NEXT: [[ADD36:%.*]] = add nsw i32 [[TMP24]], 5
// CHECK2-NEXT: store i32 [[ADD36]], ptr [[DOTFLOOR_1_IV_J]], align 4
-// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]]
+// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]]
// CHECK2: for.end37:
// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
// CHECK2: omp.body.continue:
@@ -1759,8 +2080,8 @@ extern "C" void foo8(int a) {
// CHECK2-NEXT: ret void
//
//
-// CHECK2-LABEL: define {{[^@]+}}@foo5
-// CHECK2-SAME: () #[[ATTR1]] {
+// CHECK2-LABEL: define dso_local void @foo5(
+// CHECK2-SAME: ) #[[ATTR1]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
@@ -1968,15 +2289,15 @@ extern "C" void foo8(int a) {
// CHECK2-NEXT: ret void
//
//
-// CHECK2-LABEL: define {{[^@]+}}@foo6
-// CHECK2-SAME: () #[[ATTR1]] {
+// CHECK2-LABEL: define dso_local void @foo6(
+// CHECK2-SAME: ) #[[ATTR1]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 0, ptr @foo6.omp_outlined)
// CHECK2-NEXT: ret void
//
//
-// CHECK2-LABEL: define {{[^@]+}}@foo6.omp_outlined
-// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4:[0-9]+]] {
+// CHECK2-LABEL: define internal void @foo6.omp_outlined(
+// CHECK2-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4:[0-9]+]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -2054,7 +2375,7 @@ extern "C" void foo8(int a) {
// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP14]], 1
// CHECK2-NEXT: store i32 [[INC]], ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]]
+// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP20:![0-9]+]]
// CHECK2: for.end:
// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
// CHECK2: omp.body.continue:
@@ -2071,8 +2392,8 @@ extern "C" void foo8(int a) {
// CHECK2-NEXT: ret void
//
//
-// CHECK2-LABEL: define {{[^@]+}}@foo8
-// CHECK2-SAME: (i32 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK2-LABEL: define dso_local void @foo8(
+// CHECK2-SAME: i32 noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
@@ -2138,7 +2459,7 @@ extern "C" void foo8(int a) {
// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP11]], 1
// CHECK2-NEXT: store i32 [[INC]], ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK2-NEXT: br label [[FOR_COND1]], !llvm.loop [[LOOP21:![0-9]+]]
+// CHECK2-NEXT: br label [[FOR_COND1]], !llvm.loop [[LOOP23:![0-9]+]]
// CHECK2: for.end:
// CHECK2-NEXT: br label [[FOR_INC17:%.*]]
// CHECK2: for.inc17:
@@ -2155,20 +2476,117 @@ extern "C" void foo8(int a) {
// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
// CHECK2-NEXT: [[ADD23:%.*]] = add nsw i32 [[TMP14]], [[COND22]]
// CHECK2-NEXT: store i32 [[ADD23]], ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]]
+// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP24:![0-9]+]]
// CHECK2: for.end24:
// CHECK2-NEXT: ret void
//
//
-// CHECK2-LABEL: define {{[^@]+}}@tfoo7
-// CHECK2-SAME: () #[[ATTR1]] {
+// CHECK2-LABEL: define dso_local void @foo9(
+// CHECK2-SAME: ptr noundef byval([[STRUCT_DATA_T:%.*]]) align 8 [[DATA:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__END2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_4:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTFLOOR_0_IV___BEGIN2:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTTILE_0_IV___BEGIN2:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[V:%.*]] = alloca double, align 8
+// CHECK2-NEXT: [[ARRAY:%.*]] = getelementptr inbounds [[STRUCT_DATA_T]], ptr [[DATA]], i32 0, i32 0
+// CHECK2-NEXT: store ptr [[ARRAY]], ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [12 x double], ptr [[TMP0]], i64 0, i64 0
+// CHECK2-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 12
+// CHECK2-NEXT: store ptr [[ADD_PTR]], ptr [[__END2]], align 8
+// CHECK2-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY1:%.*]] = getelementptr inbounds [12 x double], ptr [[TMP1]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY1]], ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY2:%.*]] = getelementptr inbounds [12 x double], ptr [[TMP2]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY2]], ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[__END2]], align 8
+// CHECK2-NEXT: store ptr [[TMP3]], ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK2-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK2-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK2-NEXT: [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP4]] to i64
+// CHECK2-NEXT: [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP5]] to i64
+// CHECK2-NEXT: [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+// CHECK2-NEXT: [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8
+// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i64 [[SUB]], 1
+// CHECK2-NEXT: [[DIV:%.*]] = sdiv i64 [[ADD]], 1
+// CHECK2-NEXT: [[SUB5:%.*]] = sub nsw i64 [[DIV]], 1
+// CHECK2-NEXT: store i64 [[SUB5]], ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK2-NEXT: store i64 0, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: br label [[FOR_COND:%.*]]
+// CHECK2: for.cond:
+// CHECK2-NEXT: [[TMP6:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP7:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK2-NEXT: [[ADD6:%.*]] = add nsw i64 [[TMP7]], 1
+// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i64 [[TMP6]], [[ADD6]]
+// CHECK2-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END18:%.*]]
+// CHECK2: for.body:
+// CHECK2-NEXT: [[TMP8:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: store i64 [[TMP8]], ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: br label [[FOR_COND7:%.*]]
+// CHECK2: for.cond7:
+// CHECK2-NEXT: [[TMP9:%.*]] = load i64, ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP10:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i64 [[TMP10]], 1
+// CHECK2-NEXT: [[TMP11:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i64 [[TMP11]], 5
+// CHECK2-NEXT: [[CMP10:%.*]] = icmp slt i64 [[ADD8]], [[ADD9]]
+// CHECK2-NEXT: br i1 [[CMP10]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK2: cond.true:
+// CHECK2-NEXT: [[TMP12:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i64 [[TMP12]], 1
+// CHECK2-NEXT: br label [[COND_END:%.*]]
+// CHECK2: cond.false:
+// CHECK2-NEXT: [[TMP13:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: [[ADD12:%.*]] = add nsw i64 [[TMP13]], 5
+// CHECK2-NEXT: br label [[COND_END]]
+// CHECK2: cond.end:
+// CHECK2-NEXT: [[COND:%.*]] = phi i64 [ [[ADD11]], [[COND_TRUE]] ], [ [[ADD12]], [[COND_FALSE]] ]
+// CHECK2-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP9]], [[COND]]
+// CHECK2-NEXT: br i1 [[CMP13]], label [[FOR_BODY14:%.*]], label [[FOR_END:%.*]]
+// CHECK2: for.body14:
+// CHECK2-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK2-NEXT: [[TMP15:%.*]] = load i64, ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i64 [[TMP15]], 1
+// CHECK2-NEXT: [[ADD_PTR15:%.*]] = getelementptr inbounds double, ptr [[TMP14]], i64 [[MUL]]
+// CHECK2-NEXT: store ptr [[ADD_PTR15]], ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP16:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP17:%.*]] = load double, ptr [[TMP16]], align 8
+// CHECK2-NEXT: store double [[TMP17]], ptr [[V]], align 8
+// CHECK2-NEXT: [[TMP18:%.*]] = load double, ptr [[V]], align 8
+// CHECK2-NEXT: call void (...) @body(double noundef [[TMP18]])
+// CHECK2-NEXT: br label [[FOR_INC:%.*]]
+// CHECK2: for.inc:
+// CHECK2-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: [[INC:%.*]] = add nsw i64 [[TMP19]], 1
+// CHECK2-NEXT: store i64 [[INC]], ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: br label [[FOR_COND7]], !llvm.loop [[LOOP25:![0-9]+]]
+// CHECK2: for.end:
+// CHECK2-NEXT: br label [[FOR_INC16:%.*]]
+// CHECK2: for.inc16:
+// CHECK2-NEXT: [[TMP20:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: [[ADD17:%.*]] = add nsw i64 [[TMP20]], 5
+// CHECK2-NEXT: store i64 [[ADD17]], ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP26:![0-9]+]]
+// CHECK2: for.end18:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define dso_local void @tfoo7(
+// CHECK2-SAME: ) #[[ATTR1]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: call void @_Z4foo7IiTnT_Li3ETnS0_Li5EEvS0_S0_(i32 noundef 0, i32 noundef 42)
// CHECK2-NEXT: ret void
//
//
-// CHECK2-LABEL: define {{[^@]+}}@_Z4foo7IiTnT_Li3ETnS0_Li5EEvS0_S0_
-// CHECK2-SAME: (i32 noundef [[START:%.*]], i32 noundef [[END:%.*]]) #[[ATTR1]] comdat {
+// CHECK2-LABEL: define linkonce_odr void @_Z4foo7IiTnT_Li3ETnS0_Li5EEvS0_S0_(
+// CHECK2-SAME: i32 noundef [[START:%.*]], i32 noundef [[END:%.*]]) #[[ATTR1]] comdat {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[START_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[END_ADDR:%.*]] = alloca i32, align 4
@@ -2211,7 +2629,7 @@ extern "C" void foo8(int a) {
// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
// CHECK2-NEXT: [[ADD7:%.*]] = add i32 [[TMP9]], 1
// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP10]], 5
+// CHECK2-NEXT: [[ADD8:%.*]] = add i32 [[TMP10]], 5
// CHECK2-NEXT: [[CMP9:%.*]] = icmp ult i32 [[ADD7]], [[ADD8]]
// CHECK2-NEXT: br i1 [[CMP9]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK2: cond.true:
@@ -2220,7 +2638,7 @@ extern "C" void foo8(int a) {
// CHECK2-NEXT: br label [[COND_END:%.*]]
// CHECK2: cond.false:
// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP12]], 5
+// CHECK2-NEXT: [[ADD11:%.*]] = add i32 [[TMP12]], 5
// CHECK2-NEXT: br label [[COND_END]]
// CHECK2: cond.end:
// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[ADD10]], [[COND_TRUE]] ], [ [[ADD11]], [[COND_FALSE]] ]
@@ -2237,23 +2655,74 @@ extern "C" void foo8(int a) {
// CHECK2-NEXT: br label [[FOR_INC:%.*]]
// CHECK2: for.inc:
// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP16]], 1
+// CHECK2-NEXT: [[INC:%.*]] = add i32 [[TMP16]], 1
// CHECK2-NEXT: store i32 [[INC]], ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK2-NEXT: br label [[FOR_COND6]], !llvm.loop [[LOOP23:![0-9]+]]
+// CHECK2-NEXT: br label [[FOR_COND6]], !llvm.loop [[LOOP27:![0-9]+]]
// CHECK2: for.end:
// CHECK2-NEXT: br label [[FOR_INC15:%.*]]
// CHECK2: for.inc15:
// CHECK2-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK2-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP17]], 5
+// CHECK2-NEXT: [[ADD16:%.*]] = add i32 [[TMP17]], 5
// CHECK2-NEXT: store i32 [[ADD16]], ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP24:![0-9]+]]
+// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]]
// CHECK2: for.end17:
// CHECK2-NEXT: ret void
//
//
-// CHECK2-LABEL: define {{[^@]+}}@_GLOBAL__sub_I_tile_codegen.cpp
-// CHECK2-SAME: () #[[ATTR0]] section ".text.startup" {
+// CHECK2-LABEL: define internal void @_GLOBAL__sub_I_tile_codegen.cpp(
+// CHECK2-SAME: ) #[[ATTR0]] section ".text.startup" {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: call void @__cxx_global_var_init()
// CHECK2-NEXT: ret void
//
+//.
+// CHECK1: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]}
+// CHECK1: [[META4]] = !{!"llvm.loop.mustprogress"}
+// CHECK1: [[LOOP5]] = distinct !{[[LOOP5]], [[META4]]}
+// CHECK1: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]]}
+// CHECK1: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]]}
+// CHECK1: [[LOOP8]] = distinct !{[[LOOP8]], [[META4]]}
+// CHECK1: [[LOOP9]] = distinct !{[[LOOP9]], [[META4]]}
+// CHECK1: [[LOOP10]] = distinct !{[[LOOP10]], [[META4]]}
+// CHECK1: [[LOOP11]] = distinct !{[[LOOP11]], [[META4]]}
+// CHECK1: [[LOOP12]] = distinct !{[[LOOP12]], [[META4]]}
+// CHECK1: [[LOOP13]] = distinct !{[[LOOP13]], [[META4]]}
+// CHECK1: [[LOOP14]] = distinct !{[[LOOP14]], [[META4]]}
+// CHECK1: [[LOOP15]] = distinct !{[[LOOP15]], [[META4]]}
+// CHECK1: [[LOOP16]] = distinct !{[[LOOP16]], [[META4]]}
+// CHECK1: [[LOOP17]] = distinct !{[[LOOP17]], [[META4]]}
+// CHECK1: [[LOOP18]] = distinct !{[[LOOP18]], [[META4]]}
+// CHECK1: [[LOOP21]] = distinct !{[[LOOP21]], [[META4]]}
+// CHECK1: [[LOOP22]] = distinct !{[[LOOP22]], [[META4]]}
+// CHECK1: [[LOOP23]] = distinct !{[[LOOP23]], [[META4]]}
+// CHECK1: [[LOOP24]] = distinct !{[[LOOP24]], [[META4]]}
+// CHECK1: [[LOOP25]] = distinct !{[[LOOP25]], [[META4]]}
+// CHECK1: [[LOOP26]] = distinct !{[[LOOP26]], [[META4]]}
+// CHECK1: [[LOOP27]] = distinct !{[[LOOP27]], [[META4]]}
+// CHECK1: [[LOOP28]] = distinct !{[[LOOP28]], [[META4]]}
+//.
+// CHECK2: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]}
+// CHECK2: [[META4]] = !{!"llvm.loop.mustprogress"}
+// CHECK2: [[LOOP5]] = distinct !{[[LOOP5]], [[META4]]}
+// CHECK2: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]]}
+// CHECK2: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]]}
+// CHECK2: [[LOOP8]] = distinct !{[[LOOP8]], [[META4]]}
+// CHECK2: [[LOOP9]] = distinct !{[[LOOP9]], [[META4]]}
+// CHECK2: [[LOOP10]] = distinct !{[[LOOP10]], [[META4]]}
+// CHECK2: [[LOOP11]] = distinct !{[[LOOP11]], [[META4]]}
+// CHECK2: [[LOOP12]] = distinct !{[[LOOP12]], [[META4]]}
+// CHECK2: [[LOOP13]] = distinct !{[[LOOP13]], [[META4]]}
+// CHECK2: [[LOOP14]] = distinct !{[[LOOP14]], [[META4]]}
+// CHECK2: [[LOOP15]] = distinct !{[[LOOP15]], [[META4]]}
+// CHECK2: [[LOOP16]] = distinct !{[[LOOP16]], [[META4]]}
+// CHECK2: [[LOOP17]] = distinct !{[[LOOP17]], [[META4]]}
+// CHECK2: [[LOOP18]] = distinct !{[[LOOP18]], [[META4]]}
+// CHECK2: [[LOOP19]] = distinct !{[[LOOP19]], [[META4]]}
+// CHECK2: [[LOOP20]] = distinct !{[[LOOP20]], [[META4]]}
+// CHECK2: [[LOOP23]] = distinct !{[[LOOP23]], [[META4]]}
+// CHECK2: [[LOOP24]] = distinct !{[[LOOP24]], [[META4]]}
+// CHECK2: [[LOOP25]] = distinct !{[[LOOP25]], [[META4]]}
+// CHECK2: [[LOOP26]] = distinct !{[[LOOP26]], [[META4]]}
+// CHECK2: [[LOOP27]] = distinct !{[[LOOP27]], [[META4]]}
+// CHECK2: [[LOOP28]] = distinct !{[[LOOP28]], [[META4]]}
+//.
diff --git a/clang/test/OpenMP/tile_codegen_for_dependent.cpp b/clang/test/OpenMP/tile_codegen_for_dependent.cpp
index 93c51c9..820d33d 100644
--- a/clang/test/OpenMP/tile_codegen_for_dependent.cpp
+++ b/clang/test/OpenMP/tile_codegen_for_dependent.cpp
@@ -17,7 +17,7 @@
extern "C" void body(...) {}
-// IR-LABEL: @func(
+// IR-LABEL: define {{.*}}@func(
// IR-NEXT: [[ENTRY:.*]]:
// IR-NEXT: %[[START_ADDR:.+]] = alloca i32, align 4
// IR-NEXT: %[[END_ADDR:.+]] = alloca i32, align 4
@@ -27,18 +27,18 @@ extern "C" void body(...) {}
// IR-NEXT: %[[I:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTCAPTURE_EXPR_:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTCAPTURE_EXPR_1:.+]] = alloca i32, align 4
+// IR-NEXT: %[[DOTNEW_STEP:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTCAPTURE_EXPR_2:.+]] = alloca i32, align 4
-// IR-NEXT: %[[DOTCAPTURE_EXPR_3:.+]] = alloca i32, align 4
-// IR-NEXT: %[[DOTCAPTURE_EXPR_6:.+]] = alloca i32, align 4
-// IR-NEXT: %[[DOTCAPTURE_EXPR_8:.+]] = alloca i32, align 4
+// IR-NEXT: %[[DOTCAPTURE_EXPR_5:.+]] = alloca i32, align 4
+// IR-NEXT: %[[DOTCAPTURE_EXPR_7:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTFLOOR_0_IV_I:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTOMP_LB:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTOMP_UB:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTOMP_STRIDE:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTOMP_IS_LAST:.+]] = alloca i32, align 4
-// IR-NEXT: %[[DOTFLOOR_0_IV_I12:.+]] = alloca i32, align 4
+// IR-NEXT: %[[DOTFLOOR_0_IV_I11:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTTILE_0_IV_I:.+]] = alloca i32, align 4
-// IR-NEXT: %[[TMP0:.+]] = call i32 @__kmpc_global_thread_num(ptr @2)
+// IR-NEXT: %[[TMP0:.+]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2:.+]])
// IR-NEXT: store i32 %[[START:.+]], ptr %[[START_ADDR]], align 4
// IR-NEXT: store i32 %[[END:.+]], ptr %[[END_ADDR]], align 4
// IR-NEXT: store i32 %[[STEP:.+]], ptr %[[STEP_ADDR]], align 4
@@ -49,44 +49,44 @@ extern "C" void body(...) {}
// IR-NEXT: %[[TMP3:.+]] = load i32, ptr %[[END_ADDR]], align 4
// IR-NEXT: store i32 %[[TMP3]], ptr %[[DOTCAPTURE_EXPR_1]], align 4
// IR-NEXT: %[[TMP4:.+]] = load i32, ptr %[[STEP_ADDR]], align 4
-// IR-NEXT: store i32 %[[TMP4]], ptr %[[DOTCAPTURE_EXPR_2]], align 4
+// IR-NEXT: store i32 %[[TMP4]], ptr %[[DOTNEW_STEP]], align 4
// IR-NEXT: %[[TMP5:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_1]], align 4
// IR-NEXT: %[[TMP6:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_]], align 4
// IR-NEXT: %[[SUB:.+]] = sub i32 %[[TMP5]], %[[TMP6]]
-// IR-NEXT: %[[SUB4:.+]] = sub i32 %[[SUB]], 1
-// IR-NEXT: %[[TMP7:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_2]], align 4
-// IR-NEXT: %[[ADD:.+]] = add i32 %[[SUB4]], %[[TMP7]]
-// IR-NEXT: %[[TMP8:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_2]], align 4
+// IR-NEXT: %[[SUB3:.+]] = sub i32 %[[SUB]], 1
+// IR-NEXT: %[[TMP7:.+]] = load i32, ptr %[[DOTNEW_STEP]], align 4
+// IR-NEXT: %[[ADD:.+]] = add i32 %[[SUB3]], %[[TMP7]]
+// IR-NEXT: %[[TMP8:.+]] = load i32, ptr %[[DOTNEW_STEP]], align 4
// IR-NEXT: %[[DIV:.+]] = udiv i32 %[[ADD]], %[[TMP8]]
-// IR-NEXT: %[[SUB5:.+]] = sub i32 %[[DIV]], 1
-// IR-NEXT: store i32 %[[SUB5]], ptr %[[DOTCAPTURE_EXPR_3]], align 4
-// IR-NEXT: %[[TMP9:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_3]], align 4
-// IR-NEXT: %[[ADD7:.+]] = add i32 %[[TMP9]], 1
-// IR-NEXT: store i32 %[[ADD7]], ptr %[[DOTCAPTURE_EXPR_6]], align 4
-// IR-NEXT: %[[TMP10:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_6]], align 4
-// IR-NEXT: %[[SUB9:.+]] = sub i32 %[[TMP10]], -3
-// IR-NEXT: %[[DIV10:.+]] = udiv i32 %[[SUB9]], 4
-// IR-NEXT: %[[SUB11:.+]] = sub i32 %[[DIV10]], 1
-// IR-NEXT: store i32 %[[SUB11]], ptr %[[DOTCAPTURE_EXPR_8]], align 4
+// IR-NEXT: %[[SUB4:.+]] = sub i32 %[[DIV]], 1
+// IR-NEXT: store i32 %[[SUB4]], ptr %[[DOTCAPTURE_EXPR_2]], align 4
+// IR-NEXT: %[[TMP9:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_2]], align 4
+// IR-NEXT: %[[ADD6:.+]] = add i32 %[[TMP9]], 1
+// IR-NEXT: store i32 %[[ADD6]], ptr %[[DOTCAPTURE_EXPR_5]], align 4
+// IR-NEXT: %[[TMP10:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_5]], align 4
+// IR-NEXT: %[[SUB8:.+]] = sub i32 %[[TMP10]], -3
+// IR-NEXT: %[[DIV9:.+]] = udiv i32 %[[SUB8]], 4
+// IR-NEXT: %[[SUB10:.+]] = sub i32 %[[DIV9]], 1
+// IR-NEXT: store i32 %[[SUB10]], ptr %[[DOTCAPTURE_EXPR_7]], align 4
// IR-NEXT: store i32 0, ptr %[[DOTFLOOR_0_IV_I]], align 4
-// IR-NEXT: %[[TMP11:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_6]], align 4
+// IR-NEXT: %[[TMP11:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_5]], align 4
// IR-NEXT: %[[CMP:.+]] = icmp ult i32 0, %[[TMP11]]
// IR-NEXT: br i1 %[[CMP]], label %[[OMP_PRECOND_THEN:.+]], label %[[OMP_PRECOND_END:.+]]
// IR-EMPTY:
// IR-NEXT: [[OMP_PRECOND_THEN]]:
// IR-NEXT: store i32 0, ptr %[[DOTOMP_LB]], align 4
-// IR-NEXT: %[[TMP12:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_8]], align 4
+// IR-NEXT: %[[TMP12:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_7]], align 4
// IR-NEXT: store i32 %[[TMP12]], ptr %[[DOTOMP_UB]], align 4
// IR-NEXT: store i32 1, ptr %[[DOTOMP_STRIDE]], align 4
// IR-NEXT: store i32 0, ptr %[[DOTOMP_IS_LAST]], align 4
-// IR-NEXT: call void @__kmpc_for_static_init_4u(ptr @1, i32 %[[TMP0]], i32 34, ptr %[[DOTOMP_IS_LAST]], ptr %[[DOTOMP_LB]], ptr %[[DOTOMP_UB]], ptr %[[DOTOMP_STRIDE]], i32 1, i32 1)
+// IR-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB1:.+]], i32 %[[TMP0]], i32 34, ptr %[[DOTOMP_IS_LAST]], ptr %[[DOTOMP_LB]], ptr %[[DOTOMP_UB]], ptr %[[DOTOMP_STRIDE]], i32 1, i32 1)
// IR-NEXT: %[[TMP13:.+]] = load i32, ptr %[[DOTOMP_UB]], align 4
-// IR-NEXT: %[[TMP14:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_8]], align 4
-// IR-NEXT: %[[CMP13:.+]] = icmp ugt i32 %[[TMP13]], %[[TMP14]]
-// IR-NEXT: br i1 %[[CMP13]], label %[[COND_TRUE:.+]], label %[[COND_FALSE:.+]]
+// IR-NEXT: %[[TMP14:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_7]], align 4
+// IR-NEXT: %[[CMP12:.+]] = icmp ugt i32 %[[TMP13]], %[[TMP14]]
+// IR-NEXT: br i1 %[[CMP12]], label %[[COND_TRUE:.+]], label %[[COND_FALSE:.+]]
// IR-EMPTY:
// IR-NEXT: [[COND_TRUE]]:
-// IR-NEXT: %[[TMP15:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_8]], align 4
+// IR-NEXT: %[[TMP15:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_7]], align 4
// IR-NEXT: br label %[[COND_END:.+]]
// IR-EMPTY:
// IR-NEXT: [[COND_FALSE]]:
@@ -103,50 +103,50 @@ extern "C" void body(...) {}
// IR-NEXT: [[OMP_INNER_FOR_COND]]:
// IR-NEXT: %[[TMP18:.+]] = load i32, ptr %[[DOTOMP_IV]], align 4
// IR-NEXT: %[[TMP19:.+]] = load i32, ptr %[[DOTOMP_UB]], align 4
-// IR-NEXT: %[[ADD14:.+]] = add i32 %[[TMP19]], 1
-// IR-NEXT: %[[CMP15:.+]] = icmp ult i32 %[[TMP18]], %[[ADD14]]
-// IR-NEXT: br i1 %[[CMP15]], label %[[OMP_INNER_FOR_BODY:.+]], label %[[OMP_INNER_FOR_END:.+]]
+// IR-NEXT: %[[ADD13:.+]] = add i32 %[[TMP19]], 1
+// IR-NEXT: %[[CMP14:.+]] = icmp ult i32 %[[TMP18]], %[[ADD13]]
+// IR-NEXT: br i1 %[[CMP14]], label %[[OMP_INNER_FOR_BODY:.+]], label %[[OMP_INNER_FOR_END:.+]]
// IR-EMPTY:
// IR-NEXT: [[OMP_INNER_FOR_BODY]]:
// IR-NEXT: %[[TMP20:.+]] = load i32, ptr %[[DOTOMP_IV]], align 4
// IR-NEXT: %[[MUL:.+]] = mul i32 %[[TMP20]], 4
-// IR-NEXT: %[[ADD16:.+]] = add i32 0, %[[MUL]]
-// IR-NEXT: store i32 %[[ADD16]], ptr %[[DOTFLOOR_0_IV_I12]], align 4
-// IR-NEXT: %[[TMP21:.+]] = load i32, ptr %[[DOTFLOOR_0_IV_I12]], align 4
+// IR-NEXT: %[[ADD15:.+]] = add i32 0, %[[MUL]]
+// IR-NEXT: store i32 %[[ADD15]], ptr %[[DOTFLOOR_0_IV_I11]], align 4
+// IR-NEXT: %[[TMP21:.+]] = load i32, ptr %[[DOTFLOOR_0_IV_I11]], align 4
// IR-NEXT: store i32 %[[TMP21]], ptr %[[DOTTILE_0_IV_I]], align 4
// IR-NEXT: br label %[[FOR_COND:.+]]
// IR-EMPTY:
// IR-NEXT: [[FOR_COND]]:
// IR-NEXT: %[[TMP22:.+]] = load i32, ptr %[[DOTTILE_0_IV_I]], align 4
-// IR-NEXT: %[[TMP23:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_3]], align 4
-// IR-NEXT: %[[ADD17:.+]] = add i32 %[[TMP23]], 1
-// IR-NEXT: %[[TMP24:.+]] = load i32, ptr %[[DOTFLOOR_0_IV_I12]], align 4
-// IR-NEXT: %[[ADD18:.+]] = add nsw i32 %[[TMP24]], 4
-// IR-NEXT: %[[CMP19:.+]] = icmp ult i32 %[[ADD17]], %[[ADD18]]
-// IR-NEXT: br i1 %[[CMP19]], label %[[COND_TRUE20:.+]], label %[[COND_FALSE22:.+]]
-// IR-EMPTY:
-// IR-NEXT: [[COND_TRUE20]]:
-// IR-NEXT: %[[TMP25:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_3]], align 4
-// IR-NEXT: %[[ADD21:.+]] = add i32 %[[TMP25]], 1
-// IR-NEXT: br label %[[COND_END24:.+]]
-// IR-EMPTY:
-// IR-NEXT: [[COND_FALSE22]]:
-// IR-NEXT: %[[TMP26:.+]] = load i32, ptr %[[DOTFLOOR_0_IV_I12]], align 4
-// IR-NEXT: %[[ADD23:.+]] = add nsw i32 %[[TMP26]], 4
-// IR-NEXT: br label %[[COND_END24]]
-// IR-EMPTY:
-// IR-NEXT: [[COND_END24]]:
-// IR-NEXT: %[[COND25:.+]] = phi i32 [ %[[ADD21]], %[[COND_TRUE20]] ], [ %[[ADD23]], %[[COND_FALSE22]] ]
-// IR-NEXT: %[[CMP26:.+]] = icmp ult i32 %[[TMP22]], %[[COND25]]
-// IR-NEXT: br i1 %[[CMP26]], label %[[FOR_BODY:.+]], label %[[FOR_END:.+]]
+// IR-NEXT: %[[TMP23:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_2]], align 4
+// IR-NEXT: %[[ADD16:.+]] = add i32 %[[TMP23]], 1
+// IR-NEXT: %[[TMP24:.+]] = load i32, ptr %[[DOTFLOOR_0_IV_I11]], align 4
+// IR-NEXT: %[[ADD17:.+]] = add i32 %[[TMP24]], 4
+// IR-NEXT: %[[CMP18:.+]] = icmp ult i32 %[[ADD16]], %[[ADD17]]
+// IR-NEXT: br i1 %[[CMP18]], label %[[COND_TRUE19:.+]], label %[[COND_FALSE21:.+]]
+// IR-EMPTY:
+// IR-NEXT: [[COND_TRUE19]]:
+// IR-NEXT: %[[TMP25:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_2]], align 4
+// IR-NEXT: %[[ADD20:.+]] = add i32 %[[TMP25]], 1
+// IR-NEXT: br label %[[COND_END23:.+]]
+// IR-EMPTY:
+// IR-NEXT: [[COND_FALSE21]]:
+// IR-NEXT: %[[TMP26:.+]] = load i32, ptr %[[DOTFLOOR_0_IV_I11]], align 4
+// IR-NEXT: %[[ADD22:.+]] = add i32 %[[TMP26]], 4
+// IR-NEXT: br label %[[COND_END23]]
+// IR-EMPTY:
+// IR-NEXT: [[COND_END23]]:
+// IR-NEXT: %[[COND24:.+]] = phi i32 [ %[[ADD20]], %[[COND_TRUE19]] ], [ %[[ADD22]], %[[COND_FALSE21]] ]
+// IR-NEXT: %[[CMP25:.+]] = icmp ult i32 %[[TMP22]], %[[COND24]]
+// IR-NEXT: br i1 %[[CMP25]], label %[[FOR_BODY:.+]], label %[[FOR_END:.+]]
// IR-EMPTY:
// IR-NEXT: [[FOR_BODY]]:
// IR-NEXT: %[[TMP27:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_]], align 4
// IR-NEXT: %[[TMP28:.+]] = load i32, ptr %[[DOTTILE_0_IV_I]], align 4
-// IR-NEXT: %[[TMP29:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_2]], align 4
-// IR-NEXT: %[[MUL27:.+]] = mul i32 %[[TMP28]], %[[TMP29]]
-// IR-NEXT: %[[ADD28:.+]] = add i32 %[[TMP27]], %[[MUL27]]
-// IR-NEXT: store i32 %[[ADD28]], ptr %[[I]], align 4
+// IR-NEXT: %[[TMP29:.+]] = load i32, ptr %[[DOTNEW_STEP]], align 4
+// IR-NEXT: %[[MUL26:.+]] = mul i32 %[[TMP28]], %[[TMP29]]
+// IR-NEXT: %[[ADD27:.+]] = add i32 %[[TMP27]], %[[MUL26]]
+// IR-NEXT: store i32 %[[ADD27]], ptr %[[I]], align 4
// IR-NEXT: %[[TMP30:.+]] = load i32, ptr %[[START_ADDR]], align 4
// IR-NEXT: %[[TMP31:.+]] = load i32, ptr %[[END_ADDR]], align 4
// IR-NEXT: %[[TMP32:.+]] = load i32, ptr %[[STEP_ADDR]], align 4
@@ -156,9 +156,9 @@ extern "C" void body(...) {}
// IR-EMPTY:
// IR-NEXT: [[FOR_INC]]:
// IR-NEXT: %[[TMP34:.+]] = load i32, ptr %[[DOTTILE_0_IV_I]], align 4
-// IR-NEXT: %[[INC:.+]] = add nsw i32 %[[TMP34]], 1
+// IR-NEXT: %[[INC:.+]] = add i32 %[[TMP34]], 1
// IR-NEXT: store i32 %[[INC]], ptr %[[DOTTILE_0_IV_I]], align 4
-// IR-NEXT: br label %[[FOR_COND]], !llvm.loop ![[LOOP2:[0-9]+]]
+// IR-NEXT: br label %[[FOR_COND]], !llvm.loop ![[LOOP3:[0-9]+]]
// IR-EMPTY:
// IR-NEXT: [[FOR_END]]:
// IR-NEXT: br label %[[OMP_BODY_CONTINUE:.+]]
@@ -168,19 +168,19 @@ extern "C" void body(...) {}
// IR-EMPTY:
// IR-NEXT: [[OMP_INNER_FOR_INC]]:
// IR-NEXT: %[[TMP35:.+]] = load i32, ptr %[[DOTOMP_IV]], align 4
-// IR-NEXT: %[[ADD29:.+]] = add i32 %[[TMP35]], 1
-// IR-NEXT: store i32 %[[ADD29]], ptr %[[DOTOMP_IV]], align 4
+// IR-NEXT: %[[ADD28:.+]] = add i32 %[[TMP35]], 1
+// IR-NEXT: store i32 %[[ADD28]], ptr %[[DOTOMP_IV]], align 4
// IR-NEXT: br label %[[OMP_INNER_FOR_COND]]
// IR-EMPTY:
// IR-NEXT: [[OMP_INNER_FOR_END]]:
// IR-NEXT: br label %[[OMP_LOOP_EXIT:.+]]
// IR-EMPTY:
// IR-NEXT: [[OMP_LOOP_EXIT]]:
-// IR-NEXT: call void @__kmpc_for_static_fini(ptr @1, i32 %[[TMP0]])
+// IR-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 %[[TMP0]])
// IR-NEXT: br label %[[OMP_PRECOND_END]]
// IR-EMPTY:
// IR-NEXT: [[OMP_PRECOND_END]]:
-// IR-NEXT: call void @__kmpc_barrier(ptr @3, i32 %[[TMP0]])
+// IR-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3:.+]], i32 %[[TMP0]])
// IR-NEXT: ret void
// IR-NEXT: }
extern "C" void func(int start, int end, int step) {
diff --git a/clang/test/OpenMP/tile_codegen_tile_for.cpp b/clang/test/OpenMP/tile_codegen_tile_for.cpp
index d0fb893..91536c4 100644
--- a/clang/test/OpenMP/tile_codegen_tile_for.cpp
+++ b/clang/test/OpenMP/tile_codegen_tile_for.cpp
@@ -16,7 +16,7 @@
extern "C" void body(...) {}
-// IR-LABEL: @func(
+// IR-LABEL: define {{.*}}@func(
// IR-NEXT: [[ENTRY:.*]]:
// IR-NEXT: %[[START_ADDR:.+]] = alloca i32, align 4
// IR-NEXT: %[[END_ADDR:.+]] = alloca i32, align 4
@@ -26,22 +26,22 @@ extern "C" void body(...) {}
// IR-NEXT: %[[I:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTCAPTURE_EXPR_:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTCAPTURE_EXPR_1:.+]] = alloca i32, align 4
+// IR-NEXT: %[[DOTNEW_STEP:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTCAPTURE_EXPR_2:.+]] = alloca i32, align 4
-// IR-NEXT: %[[DOTCAPTURE_EXPR_3:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTFLOOR_0_IV_I:.+]] = alloca i32, align 4
-// IR-NEXT: %[[DOTCAPTURE_EXPR_6:.+]] = alloca i32, align 4
-// IR-NEXT: %[[DOTCAPTURE_EXPR_8:.+]] = alloca i32, align 4
-// IR-NEXT: %[[DOTCAPTURE_EXPR_12:.+]] = alloca i32, align 4
-// IR-NEXT: %[[DOTCAPTURE_EXPR_14:.+]] = alloca i32, align 4
+// IR-NEXT: %[[DOTCAPTURE_EXPR_5:.+]] = alloca i32, align 4
+// IR-NEXT: %[[DOTCAPTURE_EXPR_7:.+]] = alloca i32, align 4
+// IR-NEXT: %[[DOTCAPTURE_EXPR_11:.+]] = alloca i32, align 4
+// IR-NEXT: %[[DOTCAPTURE_EXPR_13:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTFLOOR_0_IV__FLOOR_0_IV_I:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTOMP_LB:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTOMP_UB:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTOMP_STRIDE:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTOMP_IS_LAST:.+]] = alloca i32, align 4
-// IR-NEXT: %[[DOTFLOOR_0_IV__FLOOR_0_IV_I18:.+]] = alloca i32, align 4
+// IR-NEXT: %[[DOTFLOOR_0_IV__FLOOR_0_IV_I17:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTTILE_0_IV__FLOOR_0_IV_I:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTTILE_0_IV_I:.+]] = alloca i32, align 4
-// IR-NEXT: %[[TMP0:.+]] = call i32 @__kmpc_global_thread_num(ptr @2)
+// IR-NEXT: %[[TMP0:.+]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2:.+]])
// IR-NEXT: store i32 %[[START:.+]], ptr %[[START_ADDR]], align 4
// IR-NEXT: store i32 %[[END:.+]], ptr %[[END_ADDR]], align 4
// IR-NEXT: store i32 %[[STEP:.+]], ptr %[[STEP_ADDR]], align 4
@@ -52,53 +52,53 @@ extern "C" void body(...) {}
// IR-NEXT: %[[TMP3:.+]] = load i32, ptr %[[END_ADDR]], align 4
// IR-NEXT: store i32 %[[TMP3]], ptr %[[DOTCAPTURE_EXPR_1]], align 4
// IR-NEXT: %[[TMP4:.+]] = load i32, ptr %[[STEP_ADDR]], align 4
-// IR-NEXT: store i32 %[[TMP4]], ptr %[[DOTCAPTURE_EXPR_2]], align 4
+// IR-NEXT: store i32 %[[TMP4]], ptr %[[DOTNEW_STEP]], align 4
// IR-NEXT: %[[TMP5:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_1]], align 4
// IR-NEXT: %[[TMP6:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_]], align 4
// IR-NEXT: %[[SUB:.+]] = sub i32 %[[TMP5]], %[[TMP6]]
-// IR-NEXT: %[[SUB4:.+]] = sub i32 %[[SUB]], 1
-// IR-NEXT: %[[TMP7:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_2]], align 4
-// IR-NEXT: %[[ADD:.+]] = add i32 %[[SUB4]], %[[TMP7]]
-// IR-NEXT: %[[TMP8:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_2]], align 4
+// IR-NEXT: %[[SUB3:.+]] = sub i32 %[[SUB]], 1
+// IR-NEXT: %[[TMP7:.+]] = load i32, ptr %[[DOTNEW_STEP]], align 4
+// IR-NEXT: %[[ADD:.+]] = add i32 %[[SUB3]], %[[TMP7]]
+// IR-NEXT: %[[TMP8:.+]] = load i32, ptr %[[DOTNEW_STEP]], align 4
// IR-NEXT: %[[DIV:.+]] = udiv i32 %[[ADD]], %[[TMP8]]
-// IR-NEXT: %[[SUB5:.+]] = sub i32 %[[DIV]], 1
-// IR-NEXT: store i32 %[[SUB5]], ptr %[[DOTCAPTURE_EXPR_3]], align 4
+// IR-NEXT: %[[SUB4:.+]] = sub i32 %[[DIV]], 1
+// IR-NEXT: store i32 %[[SUB4]], ptr %[[DOTCAPTURE_EXPR_2]], align 4
// IR-NEXT: store i32 0, ptr %[[DOTFLOOR_0_IV_I]], align 4
-// IR-NEXT: %[[TMP9:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_3]], align 4
-// IR-NEXT: %[[ADD7:.+]] = add i32 %[[TMP9]], 1
-// IR-NEXT: store i32 %[[ADD7]], ptr %[[DOTCAPTURE_EXPR_6]], align 4
-// IR-NEXT: %[[TMP10:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_6]], align 4
-// IR-NEXT: %[[SUB9:.+]] = sub i32 %[[TMP10]], -3
-// IR-NEXT: %[[DIV10:.+]] = udiv i32 %[[SUB9]], 4
-// IR-NEXT: %[[SUB11:.+]] = sub i32 %[[DIV10]], 1
-// IR-NEXT: store i32 %[[SUB11]], ptr %[[DOTCAPTURE_EXPR_8]], align 4
-// IR-NEXT: %[[TMP11:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_8]], align 4
-// IR-NEXT: %[[ADD13:.+]] = add i32 %[[TMP11]], 1
-// IR-NEXT: store i32 %[[ADD13]], ptr %[[DOTCAPTURE_EXPR_12]], align 4
-// IR-NEXT: %[[TMP12:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_12]], align 4
-// IR-NEXT: %[[SUB15:.+]] = sub i32 %[[TMP12]], -2
-// IR-NEXT: %[[DIV16:.+]] = udiv i32 %[[SUB15]], 3
-// IR-NEXT: %[[SUB17:.+]] = sub i32 %[[DIV16]], 1
-// IR-NEXT: store i32 %[[SUB17]], ptr %[[DOTCAPTURE_EXPR_14]], align 4
+// IR-NEXT: %[[TMP9:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_2]], align 4
+// IR-NEXT: %[[ADD6:.+]] = add i32 %[[TMP9]], 1
+// IR-NEXT: store i32 %[[ADD6]], ptr %[[DOTCAPTURE_EXPR_5]], align 4
+// IR-NEXT: %[[TMP10:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_5]], align 4
+// IR-NEXT: %[[SUB8:.+]] = sub i32 %[[TMP10]], -3
+// IR-NEXT: %[[DIV9:.+]] = udiv i32 %[[SUB8]], 4
+// IR-NEXT: %[[SUB10:.+]] = sub i32 %[[DIV9]], 1
+// IR-NEXT: store i32 %[[SUB10]], ptr %[[DOTCAPTURE_EXPR_7]], align 4
+// IR-NEXT: %[[TMP11:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_7]], align 4
+// IR-NEXT: %[[ADD12:.+]] = add i32 %[[TMP11]], 1
+// IR-NEXT: store i32 %[[ADD12]], ptr %[[DOTCAPTURE_EXPR_11]], align 4
+// IR-NEXT: %[[TMP12:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_11]], align 4
+// IR-NEXT: %[[SUB14:.+]] = sub i32 %[[TMP12]], -2
+// IR-NEXT: %[[DIV15:.+]] = udiv i32 %[[SUB14]], 3
+// IR-NEXT: %[[SUB16:.+]] = sub i32 %[[DIV15]], 1
+// IR-NEXT: store i32 %[[SUB16]], ptr %[[DOTCAPTURE_EXPR_13]], align 4
// IR-NEXT: store i32 0, ptr %[[DOTFLOOR_0_IV__FLOOR_0_IV_I]], align 4
-// IR-NEXT: %[[TMP13:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_12]], align 4
+// IR-NEXT: %[[TMP13:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_11]], align 4
// IR-NEXT: %[[CMP:.+]] = icmp ult i32 0, %[[TMP13]]
// IR-NEXT: br i1 %[[CMP]], label %[[OMP_PRECOND_THEN:.+]], label %[[OMP_PRECOND_END:.+]]
// IR-EMPTY:
// IR-NEXT: [[OMP_PRECOND_THEN]]:
// IR-NEXT: store i32 0, ptr %[[DOTOMP_LB]], align 4
-// IR-NEXT: %[[TMP14:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_14]], align 4
+// IR-NEXT: %[[TMP14:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_13]], align 4
// IR-NEXT: store i32 %[[TMP14]], ptr %[[DOTOMP_UB]], align 4
// IR-NEXT: store i32 1, ptr %[[DOTOMP_STRIDE]], align 4
// IR-NEXT: store i32 0, ptr %[[DOTOMP_IS_LAST]], align 4
-// IR-NEXT: call void @__kmpc_for_static_init_4u(ptr @1, i32 %[[TMP0]], i32 34, ptr %[[DOTOMP_IS_LAST]], ptr %[[DOTOMP_LB]], ptr %[[DOTOMP_UB]], ptr %[[DOTOMP_STRIDE]], i32 1, i32 1)
+// IR-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB1:.+]], i32 %[[TMP0]], i32 34, ptr %[[DOTOMP_IS_LAST]], ptr %[[DOTOMP_LB]], ptr %[[DOTOMP_UB]], ptr %[[DOTOMP_STRIDE]], i32 1, i32 1)
// IR-NEXT: %[[TMP15:.+]] = load i32, ptr %[[DOTOMP_UB]], align 4
-// IR-NEXT: %[[TMP16:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_14]], align 4
-// IR-NEXT: %[[CMP19:.+]] = icmp ugt i32 %[[TMP15]], %[[TMP16]]
-// IR-NEXT: br i1 %[[CMP19]], label %[[COND_TRUE:.+]], label %[[COND_FALSE:.+]]
+// IR-NEXT: %[[TMP16:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_13]], align 4
+// IR-NEXT: %[[CMP18:.+]] = icmp ugt i32 %[[TMP15]], %[[TMP16]]
+// IR-NEXT: br i1 %[[CMP18]], label %[[COND_TRUE:.+]], label %[[COND_FALSE:.+]]
// IR-EMPTY:
// IR-NEXT: [[COND_TRUE]]:
-// IR-NEXT: %[[TMP17:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_14]], align 4
+// IR-NEXT: %[[TMP17:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_13]], align 4
// IR-NEXT: br label %[[COND_END:.+]]
// IR-EMPTY:
// IR-NEXT: [[COND_FALSE]]:
@@ -115,83 +115,83 @@ extern "C" void body(...) {}
// IR-NEXT: [[OMP_INNER_FOR_COND]]:
// IR-NEXT: %[[TMP20:.+]] = load i32, ptr %[[DOTOMP_IV]], align 4
// IR-NEXT: %[[TMP21:.+]] = load i32, ptr %[[DOTOMP_UB]], align 4
-// IR-NEXT: %[[ADD20:.+]] = add i32 %[[TMP21]], 1
-// IR-NEXT: %[[CMP21:.+]] = icmp ult i32 %[[TMP20]], %[[ADD20]]
-// IR-NEXT: br i1 %[[CMP21]], label %[[OMP_INNER_FOR_BODY:.+]], label %[[OMP_INNER_FOR_END:.+]]
+// IR-NEXT: %[[ADD19:.+]] = add i32 %[[TMP21]], 1
+// IR-NEXT: %[[CMP20:.+]] = icmp ult i32 %[[TMP20]], %[[ADD19]]
+// IR-NEXT: br i1 %[[CMP20]], label %[[OMP_INNER_FOR_BODY:.+]], label %[[OMP_INNER_FOR_END:.+]]
// IR-EMPTY:
// IR-NEXT: [[OMP_INNER_FOR_BODY]]:
// IR-NEXT: %[[TMP22:.+]] = load i32, ptr %[[DOTOMP_IV]], align 4
// IR-NEXT: %[[MUL:.+]] = mul i32 %[[TMP22]], 3
-// IR-NEXT: %[[ADD22:.+]] = add i32 0, %[[MUL]]
-// IR-NEXT: store i32 %[[ADD22]], ptr %[[DOTFLOOR_0_IV__FLOOR_0_IV_I18]], align 4
-// IR-NEXT: %[[TMP23:.+]] = load i32, ptr %[[DOTFLOOR_0_IV__FLOOR_0_IV_I18]], align 4
+// IR-NEXT: %[[ADD21:.+]] = add i32 0, %[[MUL]]
+// IR-NEXT: store i32 %[[ADD21]], ptr %[[DOTFLOOR_0_IV__FLOOR_0_IV_I17]], align 4
+// IR-NEXT: %[[TMP23:.+]] = load i32, ptr %[[DOTFLOOR_0_IV__FLOOR_0_IV_I17]], align 4
// IR-NEXT: store i32 %[[TMP23]], ptr %[[DOTTILE_0_IV__FLOOR_0_IV_I]], align 4
// IR-NEXT: br label %[[FOR_COND:.+]]
// IR-EMPTY:
// IR-NEXT: [[FOR_COND]]:
// IR-NEXT: %[[TMP24:.+]] = load i32, ptr %[[DOTTILE_0_IV__FLOOR_0_IV_I]], align 4
-// IR-NEXT: %[[TMP25:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_8]], align 4
-// IR-NEXT: %[[ADD23:.+]] = add i32 %[[TMP25]], 1
-// IR-NEXT: %[[TMP26:.+]] = load i32, ptr %[[DOTFLOOR_0_IV__FLOOR_0_IV_I18]], align 4
-// IR-NEXT: %[[ADD24:.+]] = add i32 %[[TMP26]], 3
-// IR-NEXT: %[[CMP25:.+]] = icmp ult i32 %[[ADD23]], %[[ADD24]]
-// IR-NEXT: br i1 %[[CMP25]], label %[[COND_TRUE26:.+]], label %[[COND_FALSE28:.+]]
-// IR-EMPTY:
-// IR-NEXT: [[COND_TRUE26]]:
-// IR-NEXT: %[[TMP27:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_8]], align 4
-// IR-NEXT: %[[ADD27:.+]] = add i32 %[[TMP27]], 1
-// IR-NEXT: br label %[[COND_END30:.+]]
-// IR-EMPTY:
-// IR-NEXT: [[COND_FALSE28]]:
-// IR-NEXT: %[[TMP28:.+]] = load i32, ptr %[[DOTFLOOR_0_IV__FLOOR_0_IV_I18]], align 4
-// IR-NEXT: %[[ADD29:.+]] = add i32 %[[TMP28]], 3
-// IR-NEXT: br label %[[COND_END30]]
-// IR-EMPTY:
-// IR-NEXT: [[COND_END30]]:
-// IR-NEXT: %[[COND31:.+]] = phi i32 [ %[[ADD27]], %[[COND_TRUE26]] ], [ %[[ADD29]], %[[COND_FALSE28]] ]
-// IR-NEXT: %[[CMP32:.+]] = icmp ult i32 %[[TMP24]], %[[COND31]]
-// IR-NEXT: br i1 %[[CMP32]], label %[[FOR_BODY:.+]], label %[[FOR_END51:.+]]
+// IR-NEXT: %[[TMP25:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_7]], align 4
+// IR-NEXT: %[[ADD22:.+]] = add i32 %[[TMP25]], 1
+// IR-NEXT: %[[TMP26:.+]] = load i32, ptr %[[DOTFLOOR_0_IV__FLOOR_0_IV_I17]], align 4
+// IR-NEXT: %[[ADD23:.+]] = add i32 %[[TMP26]], 3
+// IR-NEXT: %[[CMP24:.+]] = icmp ult i32 %[[ADD22]], %[[ADD23]]
+// IR-NEXT: br i1 %[[CMP24]], label %[[COND_TRUE25:.+]], label %[[COND_FALSE27:.+]]
+// IR-EMPTY:
+// IR-NEXT: [[COND_TRUE25]]:
+// IR-NEXT: %[[TMP27:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_7]], align 4
+// IR-NEXT: %[[ADD26:.+]] = add i32 %[[TMP27]], 1
+// IR-NEXT: br label %[[COND_END29:.+]]
+// IR-EMPTY:
+// IR-NEXT: [[COND_FALSE27]]:
+// IR-NEXT: %[[TMP28:.+]] = load i32, ptr %[[DOTFLOOR_0_IV__FLOOR_0_IV_I17]], align 4
+// IR-NEXT: %[[ADD28:.+]] = add i32 %[[TMP28]], 3
+// IR-NEXT: br label %[[COND_END29]]
+// IR-EMPTY:
+// IR-NEXT: [[COND_END29]]:
+// IR-NEXT: %[[COND30:.+]] = phi i32 [ %[[ADD26]], %[[COND_TRUE25]] ], [ %[[ADD28]], %[[COND_FALSE27]] ]
+// IR-NEXT: %[[CMP31:.+]] = icmp ult i32 %[[TMP24]], %[[COND30]]
+// IR-NEXT: br i1 %[[CMP31]], label %[[FOR_BODY:.+]], label %[[FOR_END50:.+]]
// IR-EMPTY:
// IR-NEXT: [[FOR_BODY]]:
// IR-NEXT: %[[TMP29:.+]] = load i32, ptr %[[DOTTILE_0_IV__FLOOR_0_IV_I]], align 4
-// IR-NEXT: %[[MUL33:.+]] = mul i32 %[[TMP29]], 4
-// IR-NEXT: %[[ADD34:.+]] = add i32 0, %[[MUL33]]
-// IR-NEXT: store i32 %[[ADD34]], ptr %[[DOTFLOOR_0_IV_I]], align 4
+// IR-NEXT: %[[MUL32:.+]] = mul i32 %[[TMP29]], 4
+// IR-NEXT: %[[ADD33:.+]] = add i32 0, %[[MUL32]]
+// IR-NEXT: store i32 %[[ADD33]], ptr %[[DOTFLOOR_0_IV_I]], align 4
// IR-NEXT: %[[TMP30:.+]] = load i32, ptr %[[DOTFLOOR_0_IV_I]], align 4
// IR-NEXT: store i32 %[[TMP30]], ptr %[[DOTTILE_0_IV_I]], align 4
-// IR-NEXT: br label %[[FOR_COND35:.+]]
+// IR-NEXT: br label %[[FOR_COND34:.+]]
// IR-EMPTY:
-// IR-NEXT: [[FOR_COND35]]:
+// IR-NEXT: [[FOR_COND34]]:
// IR-NEXT: %[[TMP31:.+]] = load i32, ptr %[[DOTTILE_0_IV_I]], align 4
-// IR-NEXT: %[[TMP32:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_3]], align 4
-// IR-NEXT: %[[ADD36:.+]] = add i32 %[[TMP32]], 1
+// IR-NEXT: %[[TMP32:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_2]], align 4
+// IR-NEXT: %[[ADD35:.+]] = add i32 %[[TMP32]], 1
// IR-NEXT: %[[TMP33:.+]] = load i32, ptr %[[DOTFLOOR_0_IV_I]], align 4
-// IR-NEXT: %[[ADD37:.+]] = add nsw i32 %[[TMP33]], 4
-// IR-NEXT: %[[CMP38:.+]] = icmp ult i32 %[[ADD36]], %[[ADD37]]
-// IR-NEXT: br i1 %[[CMP38]], label %[[COND_TRUE39:.+]], label %[[COND_FALSE41:.+]]
+// IR-NEXT: %[[ADD36:.+]] = add i32 %[[TMP33]], 4
+// IR-NEXT: %[[CMP37:.+]] = icmp ult i32 %[[ADD35]], %[[ADD36]]
+// IR-NEXT: br i1 %[[CMP37]], label %[[COND_TRUE38:.+]], label %[[COND_FALSE40:.+]]
// IR-EMPTY:
-// IR-NEXT: [[COND_TRUE39]]:
-// IR-NEXT: %[[TMP34:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_3]], align 4
-// IR-NEXT: %[[ADD40:.+]] = add i32 %[[TMP34]], 1
-// IR-NEXT: br label %[[COND_END43:.+]]
+// IR-NEXT: [[COND_TRUE38]]:
+// IR-NEXT: %[[TMP34:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_2]], align 4
+// IR-NEXT: %[[ADD39:.+]] = add i32 %[[TMP34]], 1
+// IR-NEXT: br label %[[COND_END42:.+]]
// IR-EMPTY:
-// IR-NEXT: [[COND_FALSE41]]:
+// IR-NEXT: [[COND_FALSE40]]:
// IR-NEXT: %[[TMP35:.+]] = load i32, ptr %[[DOTFLOOR_0_IV_I]], align 4
-// IR-NEXT: %[[ADD42:.+]] = add nsw i32 %[[TMP35]], 4
-// IR-NEXT: br label %[[COND_END43]]
+// IR-NEXT: %[[ADD41:.+]] = add i32 %[[TMP35]], 4
+// IR-NEXT: br label %[[COND_END42]]
// IR-EMPTY:
-// IR-NEXT: [[COND_END43]]:
-// IR-NEXT: %[[COND44:.+]] = phi i32 [ %[[ADD40]], %[[COND_TRUE39]] ], [ %[[ADD42]], %[[COND_FALSE41]] ]
-// IR-NEXT: %[[CMP45:.+]] = icmp ult i32 %[[TMP31]], %[[COND44]]
-// IR-NEXT: br i1 %[[CMP45]], label %[[FOR_BODY46:.+]], label %[[FOR_END:.+]]
+// IR-NEXT: [[COND_END42]]:
+// IR-NEXT: %[[COND43:.+]] = phi i32 [ %[[ADD39]], %[[COND_TRUE38]] ], [ %[[ADD41]], %[[COND_FALSE40]] ]
+// IR-NEXT: %[[CMP44:.+]] = icmp ult i32 %[[TMP31]], %[[COND43]]
+// IR-NEXT: br i1 %[[CMP44]], label %[[FOR_BODY45:.+]], label %[[FOR_END:.+]]
// IR-EMPTY:
-// IR-NEXT: [[FOR_BODY46]]:
+// IR-NEXT: [[FOR_BODY45]]:
// IR-NEXT: %[[TMP36:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_]], align 4
// IR-NEXT: %[[TMP37:.+]] = load i32, ptr %[[DOTTILE_0_IV_I]], align 4
-// IR-NEXT: %[[TMP38:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_2]], align 4
-// IR-NEXT: %[[MUL47:.+]] = mul i32 %[[TMP37]], %[[TMP38]]
-// IR-NEXT: %[[ADD48:.+]] = add i32 %[[TMP36]], %[[MUL47]]
-// IR-NEXT: store i32 %[[ADD48]], ptr %[[I]], align 4
+// IR-NEXT: %[[TMP38:.+]] = load i32, ptr %[[DOTNEW_STEP]], align 4
+// IR-NEXT: %[[MUL46:.+]] = mul i32 %[[TMP37]], %[[TMP38]]
+// IR-NEXT: %[[ADD47:.+]] = add i32 %[[TMP36]], %[[MUL46]]
+// IR-NEXT: store i32 %[[ADD47]], ptr %[[I]], align 4
// IR-NEXT: %[[TMP39:.+]] = load i32, ptr %[[START_ADDR]], align 4
// IR-NEXT: %[[TMP40:.+]] = load i32, ptr %[[END_ADDR]], align 4
// IR-NEXT: %[[TMP41:.+]] = load i32, ptr %[[STEP_ADDR]], align 4
@@ -201,20 +201,20 @@ extern "C" void body(...) {}
// IR-EMPTY:
// IR-NEXT: [[FOR_INC]]:
// IR-NEXT: %[[TMP43:.+]] = load i32, ptr %[[DOTTILE_0_IV_I]], align 4
-// IR-NEXT: %[[INC:.+]] = add nsw i32 %[[TMP43]], 1
+// IR-NEXT: %[[INC:.+]] = add i32 %[[TMP43]], 1
// IR-NEXT: store i32 %[[INC]], ptr %[[DOTTILE_0_IV_I]], align 4
-// IR-NEXT: br label %[[FOR_COND35]], !llvm.loop ![[LOOP2:[0-9]+]]
+// IR-NEXT: br label %[[FOR_COND34]], !llvm.loop ![[LOOP3:[0-9]+]]
// IR-EMPTY:
// IR-NEXT: [[FOR_END]]:
-// IR-NEXT: br label %[[FOR_INC49:.+]]
+// IR-NEXT: br label %[[FOR_INC48:.+]]
// IR-EMPTY:
-// IR-NEXT: [[FOR_INC49]]:
+// IR-NEXT: [[FOR_INC48]]:
// IR-NEXT: %[[TMP44:.+]] = load i32, ptr %[[DOTTILE_0_IV__FLOOR_0_IV_I]], align 4
-// IR-NEXT: %[[INC50:.+]] = add i32 %[[TMP44]], 1
-// IR-NEXT: store i32 %[[INC50]], ptr %[[DOTTILE_0_IV__FLOOR_0_IV_I]], align 4
-// IR-NEXT: br label %[[FOR_COND]], !llvm.loop ![[LOOP4:[0-9]+]]
+// IR-NEXT: %[[INC49:.+]] = add i32 %[[TMP44]], 1
+// IR-NEXT: store i32 %[[INC49]], ptr %[[DOTTILE_0_IV__FLOOR_0_IV_I]], align 4
+// IR-NEXT: br label %[[FOR_COND]], !llvm.loop ![[LOOP5:[0-9]+]]
// IR-EMPTY:
-// IR-NEXT: [[FOR_END51]]:
+// IR-NEXT: [[FOR_END50]]:
// IR-NEXT: br label %[[OMP_BODY_CONTINUE:.+]]
// IR-EMPTY:
// IR-NEXT: [[OMP_BODY_CONTINUE]]:
@@ -222,21 +222,23 @@ extern "C" void body(...) {}
// IR-EMPTY:
// IR-NEXT: [[OMP_INNER_FOR_INC]]:
// IR-NEXT: %[[TMP45:.+]] = load i32, ptr %[[DOTOMP_IV]], align 4
-// IR-NEXT: %[[ADD52:.+]] = add i32 %[[TMP45]], 1
-// IR-NEXT: store i32 %[[ADD52]], ptr %[[DOTOMP_IV]], align 4
+// IR-NEXT: %[[ADD51:.+]] = add i32 %[[TMP45]], 1
+// IR-NEXT: store i32 %[[ADD51]], ptr %[[DOTOMP_IV]], align 4
// IR-NEXT: br label %[[OMP_INNER_FOR_COND]]
// IR-EMPTY:
// IR-NEXT: [[OMP_INNER_FOR_END]]:
// IR-NEXT: br label %[[OMP_LOOP_EXIT:.+]]
// IR-EMPTY:
// IR-NEXT: [[OMP_LOOP_EXIT]]:
-// IR-NEXT: call void @__kmpc_for_static_fini(ptr @1, i32 %[[TMP0]])
+// IR-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 %[[TMP0]])
// IR-NEXT: br label %[[OMP_PRECOND_END]]
// IR-EMPTY:
// IR-NEXT: [[OMP_PRECOND_END]]:
-// IR-NEXT: call void @__kmpc_barrier(ptr @3, i32 %[[TMP0]])
+// IR-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3:.+]], i32 %[[TMP0]])
// IR-NEXT: ret void
// IR-NEXT: }
+
+
extern "C" void func(int start, int end, int step) {
#pragma omp for
#pragma omp tile sizes(3)
@@ -246,8 +248,10 @@ extern "C" void func(int start, int end, int step) {
}
#endif /* HEADER */
+
// IR: ![[META0:[0-9]+]] = !{i32 1, !"wchar_size", i32 4}
-// IR: ![[META1:[0-9]+]] = !{!"{{[^"]*}}"}
-// IR: ![[LOOP2]] = distinct !{![[LOOP2]], ![[LOOPPROP3:[0-9]+]]}
-// IR: ![[LOOPPROP3]] = !{!"llvm.loop.mustprogress"}
-// IR: ![[LOOP4]] = distinct !{![[LOOP4]], ![[LOOPPROP3]]}
+// IR: ![[META1:[0-9]+]] = !{i32 7, !"openmp", i32 51}
+// IR: ![[META2:[0-9]+]] =
+// IR: ![[LOOP3]] = distinct !{![[LOOP3]], ![[LOOPPROP4:[0-9]+]]}
+// IR: ![[LOOPPROP4]] = !{!"llvm.loop.mustprogress"}
+// IR: ![[LOOP5]] = distinct !{![[LOOP5]], ![[LOOPPROP4]]}
diff --git a/clang/test/PCH/cxx1z-aligned-alloc.cpp b/clang/test/PCH/cxx1z-aligned-alloc.cpp
index c1becbd..cccd628 100644
--- a/clang/test/PCH/cxx1z-aligned-alloc.cpp
+++ b/clang/test/PCH/cxx1z-aligned-alloc.cpp
@@ -1,12 +1,12 @@
// No PCH:
-// RUN: %clang_cc1 -pedantic -fsized-deallocation -std=c++1z -include %s -verify %s
+// RUN: %clang_cc1 -pedantic -std=c++1z -include %s -verify %s
//
// With PCH:
-// RUN: %clang_cc1 -pedantic -fsized-deallocation -std=c++1z -emit-pch %s -o %t
-// RUN: %clang_cc1 -pedantic -fsized-deallocation -std=c++1z -include-pch %t -verify %s
+// RUN: %clang_cc1 -pedantic -std=c++1z -emit-pch %s -o %t
+// RUN: %clang_cc1 -pedantic -std=c++1z -include-pch %t -verify %s
-// RUN: %clang_cc1 -pedantic -fsized-deallocation -std=c++1z -emit-pch -fpch-instantiate-templates %s -o %t
-// RUN: %clang_cc1 -pedantic -fsized-deallocation -std=c++1z -include-pch %t -verify %s
+// RUN: %clang_cc1 -pedantic -std=c++1z -emit-pch -fpch-instantiate-templates %s -o %t
+// RUN: %clang_cc1 -pedantic -std=c++1z -include-pch %t -verify %s
// expected-no-diagnostics
diff --git a/clang/test/Parser/altivec.c b/clang/test/Parser/altivec.c
index 445369f..9291b9b 100644
--- a/clang/test/Parser/altivec.c
+++ b/clang/test/Parser/altivec.c
@@ -56,40 +56,40 @@ void f_a2(int b, vector int a);
vector int v = (vector int)(-1);
// These should have errors on AIX and warnings otherwise.
-__vector long vv_l; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+__vector long vv_l; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-__vector signed long vv_sl; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+__vector signed long vv_sl; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-__vector unsigned long vv_ul; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+__vector unsigned long vv_ul; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-__vector long int vv_li; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+__vector long int vv_li; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-__vector signed long int vv_sli; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+__vector signed long int vv_sli; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-__vector unsigned long int vv_uli; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+__vector unsigned long int vv_uli; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-vector long v_l; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+vector long v_l; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-vector signed long v_sl; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+vector signed long v_sl; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-vector unsigned long v_ul; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+vector unsigned long v_ul; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-vector long int v_li; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+vector long int v_li; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-vector signed long int v_sli; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+vector signed long int v_sli; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-vector unsigned long int v_uli; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+vector unsigned long int v_uli; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
diff --git a/clang/test/Parser/cxx-altivec.cpp b/clang/test/Parser/cxx-altivec.cpp
index 5cb760d..15a6bf6 100644
--- a/clang/test/Parser/cxx-altivec.cpp
+++ b/clang/test/Parser/cxx-altivec.cpp
@@ -59,40 +59,40 @@ void f_a2(int b, vector int a);
vector int v = (vector int)(-1);
// These should have errors on AIX and warnings otherwise.
-__vector long vv_l; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+__vector long vv_l; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-__vector signed long vv_sl; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+__vector signed long vv_sl; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-__vector unsigned long vv_ul; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+__vector unsigned long vv_ul; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-__vector long int vv_li; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+__vector long int vv_li; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-__vector signed long int vv_sli; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+__vector signed long int vv_sli; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-__vector unsigned long int vv_uli; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+__vector unsigned long int vv_uli; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-vector long v_l; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+vector long v_l; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-vector signed long v_sl; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+vector signed long v_sl; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-vector unsigned long v_ul; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+vector unsigned long v_ul; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-vector long int v_li; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+vector long int v_li; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-vector signed long int v_sli; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+vector signed long int v_sli; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-vector unsigned long int v_uli; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+vector unsigned long int v_uli; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
diff --git a/clang/test/Parser/lax-conv.cpp b/clang/test/Parser/lax-conv.cpp
index f784e3f..0cb2503 100644
--- a/clang/test/Parser/lax-conv.cpp
+++ b/clang/test/Parser/lax-conv.cpp
@@ -21,10 +21,10 @@ template <typename VEC> VEC __attribute__((noinline)) test(vector unsigned char
return (VEC)(a * b);
}
vector unsigned int test1(vector unsigned char RetImplicitConv) {
- return RetImplicitConv; // expected-warning {{Implicit conversion between vector types (''__vector unsigned char' (vector of 16 'unsigned char' values)' and ''__vector unsigned int' (vector of 4 'unsigned int' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
+ return RetImplicitConv; // expected-warning {{implicit conversion between vector types (''__vector unsigned char' (vector of 16 'unsigned char' values)' and ''__vector unsigned int' (vector of 4 'unsigned int' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
}
vector unsigned int test2(vector unsigned char RetImplicitConvAddConst) {
- return RetImplicitConvAddConst + 5; // expected-warning {{Implicit conversion between vector types (''__vector unsigned char' (vector of 16 'unsigned char' values)' and ''__vector unsigned int' (vector of 4 'unsigned int' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
+ return RetImplicitConvAddConst + 5; // expected-warning {{implicit conversion between vector types (''__vector unsigned char' (vector of 16 'unsigned char' values)' and ''__vector unsigned int' (vector of 4 'unsigned int' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
}
vector unsigned int test3(vector unsigned char RetExplicitConv) {
return (vector unsigned int)RetExplicitConv;
@@ -34,7 +34,7 @@ vector unsigned int test4(vector unsigned char RetExplicitConvAddConst) {
}
vector unsigned int test5(vector unsigned char RetImplicitConvAddSame1,
vector unsigned char RetImplicitConvAddSame2) {
- return RetImplicitConvAddSame1 + RetImplicitConvAddSame2; // expected-warning {{Implicit conversion between vector types (''__vector unsigned char' (vector of 16 'unsigned char' values)' and ''__vector unsigned int' (vector of 4 'unsigned int' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
+ return RetImplicitConvAddSame1 + RetImplicitConvAddSame2; // expected-warning {{implicit conversion between vector types (''__vector unsigned char' (vector of 16 'unsigned char' values)' and ''__vector unsigned int' (vector of 4 'unsigned int' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
}
vector unsigned int test6(vector unsigned char RetExplicitConvAddSame1,
vector unsigned char RetExplicitConvAddSame2) {
@@ -54,10 +54,10 @@ vector unsigned long long test9(vector unsigned char a, vector unsigned char b)
return test<vector unsigned long long>(a, b);
}
void test1a(vector unsigned char ArgImplicitConv) {
- return dummy(ArgImplicitConv); // expected-warning {{Implicit conversion between vector types (''__vector unsigned char' (vector of 16 'unsigned char' values)' and ''__vector unsigned int' (vector of 4 'unsigned int' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
+ return dummy(ArgImplicitConv); // expected-warning {{implicit conversion between vector types (''__vector unsigned char' (vector of 16 'unsigned char' values)' and ''__vector unsigned int' (vector of 4 'unsigned int' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
}
void test2a(vector unsigned char ArgImplicitConvAddConst) {
- return dummy(ArgImplicitConvAddConst + 5); // expected-warning {{Implicit conversion between vector types (''__vector unsigned char' (vector of 16 'unsigned char' values)' and ''__vector unsigned int' (vector of 4 'unsigned int' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
+ return dummy(ArgImplicitConvAddConst + 5); // expected-warning {{implicit conversion between vector types (''__vector unsigned char' (vector of 16 'unsigned char' values)' and ''__vector unsigned int' (vector of 4 'unsigned int' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
}
void test3a(vector unsigned char ArgExplicitConv) {
return dummy((vector unsigned int)ArgExplicitConv);
@@ -67,7 +67,7 @@ void test4a(vector unsigned char ArgExplicitConvAddConst) {
}
void test5a(vector unsigned char ArgImplicitConvAddSame1,
vector unsigned char ArgImplicitConvAddSame2) {
- return dummy(ArgImplicitConvAddSame1 + ArgImplicitConvAddSame2); // expected-warning {{Implicit conversion between vector types (''__vector unsigned char' (vector of 16 'unsigned char' values)' and ''__vector unsigned int' (vector of 4 'unsigned int' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
+ return dummy(ArgImplicitConvAddSame1 + ArgImplicitConvAddSame2); // expected-warning {{implicit conversion between vector types (''__vector unsigned char' (vector of 16 'unsigned char' values)' and ''__vector unsigned int' (vector of 4 'unsigned int' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
}
void test6a(vector unsigned char ArgExplicitConvAddSame1,
vector unsigned char ArgExplicitConvAddSame2) {
@@ -80,33 +80,33 @@ void test7a(vector unsigned char ArgExplicitConvAddSame1Full,
ArgExplicitConvAddSame2Full));
}
void test_bool_compat(void) {
- vbs = vss; // expected-warning {{Implicit conversion between vector types (''__vector short' (vector of 8 'short' values)' and ''__vector __bool unsigned short' (vector of 8 'unsigned short' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
- vbs = vus; // expected-warning {{Implicit conversion between vector types (''__vector unsigned short' (vector of 8 'unsigned short' values)' and ''__vector __bool unsigned short' (vector of 8 'unsigned short' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
+ vbs = vss; // expected-warning {{implicit conversion between vector types (''__vector short' (vector of 8 'short' values)' and ''__vector __bool unsigned short' (vector of 8 'unsigned short' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
+ vbs = vus; // expected-warning {{implicit conversion between vector types (''__vector unsigned short' (vector of 8 'unsigned short' values)' and ''__vector __bool unsigned short' (vector of 8 'unsigned short' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
- vbi = vsi; // expected-warning {{Implicit conversion between vector types (''__vector int' (vector of 4 'int' values)' and ''__vector __bool unsigned int' (vector of 4 'unsigned int' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
- vbi = vui; // expected-warning {{Implicit conversion between vector types (''__vector unsigned int' (vector of 4 'unsigned int' values)' and ''__vector __bool unsigned int' (vector of 4 'unsigned int' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
+ vbi = vsi; // expected-warning {{implicit conversion between vector types (''__vector int' (vector of 4 'int' values)' and ''__vector __bool unsigned int' (vector of 4 'unsigned int' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
+ vbi = vui; // expected-warning {{implicit conversion between vector types (''__vector unsigned int' (vector of 4 'unsigned int' values)' and ''__vector __bool unsigned int' (vector of 4 'unsigned int' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
- vbl = vsl; // expected-warning {{Implicit conversion between vector types (''__vector long long' (vector of 2 'long long' values)' and ''__vector __bool unsigned long long' (vector of 2 'unsigned long long' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
- vbl = vul; // expected-warning {{Implicit conversion between vector types (''__vector unsigned long long' (vector of 2 'unsigned long long' values)' and ''__vector __bool unsigned long long' (vector of 2 'unsigned long long' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
+ vbl = vsl; // expected-warning {{implicit conversion between vector types (''__vector long long' (vector of 2 'long long' values)' and ''__vector __bool unsigned long long' (vector of 2 'unsigned long long' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
+ vbl = vul; // expected-warning {{implicit conversion between vector types (''__vector unsigned long long' (vector of 2 'unsigned long long' values)' and ''__vector __bool unsigned long long' (vector of 2 'unsigned long long' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
- vbc = vsc; // expected-warning {{Implicit conversion between vector types (''__vector signed char' (vector of 16 'signed char' values)' and ''__vector __bool unsigned char' (vector of 16 'unsigned char' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
- vbc = vuc; // expected-warning {{Implicit conversion between vector types (''__vector unsigned char' (vector of 16 'unsigned char' values)' and ''__vector __bool unsigned char' (vector of 16 'unsigned char' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
+ vbc = vsc; // expected-warning {{implicit conversion between vector types (''__vector signed char' (vector of 16 'signed char' values)' and ''__vector __bool unsigned char' (vector of 16 'unsigned char' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
+ vbc = vuc; // expected-warning {{implicit conversion between vector types (''__vector unsigned char' (vector of 16 'unsigned char' values)' and ''__vector __bool unsigned char' (vector of 16 'unsigned char' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
}
void test_pixel_compat(void) {
- vp = vbs; // expected-warning {{Implicit conversion between vector types (''__vector __bool unsigned short' (vector of 8 'unsigned short' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
- vp = vss; // expected-warning {{Implicit conversion between vector types (''__vector short' (vector of 8 'short' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
- vp = vus; // expected-warning {{Implicit conversion between vector types (''__vector unsigned short' (vector of 8 'unsigned short' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
+ vp = vbs; // expected-warning {{implicit conversion between vector types (''__vector __bool unsigned short' (vector of 8 'unsigned short' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
+ vp = vss; // expected-warning {{implicit conversion between vector types (''__vector short' (vector of 8 'short' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
+ vp = vus; // expected-warning {{implicit conversion between vector types (''__vector unsigned short' (vector of 8 'unsigned short' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
- vp = vbi; // expected-warning {{Implicit conversion between vector types (''__vector __bool unsigned int' (vector of 4 'unsigned int' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
- vp = vsi; // expected-warning {{Implicit conversion between vector types (''__vector int' (vector of 4 'int' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
- vp = vui; // expected-warning {{Implicit conversion between vector types (''__vector unsigned int' (vector of 4 'unsigned int' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
+ vp = vbi; // expected-warning {{implicit conversion between vector types (''__vector __bool unsigned int' (vector of 4 'unsigned int' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
+ vp = vsi; // expected-warning {{implicit conversion between vector types (''__vector int' (vector of 4 'int' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
+ vp = vui; // expected-warning {{implicit conversion between vector types (''__vector unsigned int' (vector of 4 'unsigned int' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
- vp = vbl; // expected-warning {{Implicit conversion between vector types (''__vector __bool unsigned long long' (vector of 2 'unsigned long long' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
- vp = vsl; // expected-warning {{Implicit conversion between vector types (''__vector long long' (vector of 2 'long long' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
- vp = vul; // expected-warning {{Implicit conversion between vector types (''__vector unsigned long long' (vector of 2 'unsigned long long' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
+ vp = vbl; // expected-warning {{implicit conversion between vector types (''__vector __bool unsigned long long' (vector of 2 'unsigned long long' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
+ vp = vsl; // expected-warning {{implicit conversion between vector types (''__vector long long' (vector of 2 'long long' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
+ vp = vul; // expected-warning {{implicit conversion between vector types (''__vector unsigned long long' (vector of 2 'unsigned long long' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
- vp = vbc; // expected-warning {{Implicit conversion between vector types (''__vector __bool unsigned char' (vector of 16 'unsigned char' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
- vp = vsc; // expected-warning {{Implicit conversion between vector types (''__vector signed char' (vector of 16 'signed char' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
- vp = vuc; // expected-warning {{Implicit conversion between vector types (''__vector unsigned char' (vector of 16 'unsigned char' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
+ vp = vbc; // expected-warning {{implicit conversion between vector types (''__vector __bool unsigned char' (vector of 16 'unsigned char' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
+ vp = vsc; // expected-warning {{implicit conversion between vector types (''__vector signed char' (vector of 16 'signed char' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
+ vp = vuc; // expected-warning {{implicit conversion between vector types (''__vector unsigned char' (vector of 16 'unsigned char' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
}
diff --git a/clang/test/Parser/objcbridge-related-attribute.m b/clang/test/Parser/objcbridge-related-attribute.m
index 246afeef..e76d5e3 100644
--- a/clang/test/Parser/objcbridge-related-attribute.m
+++ b/clang/test/Parser/objcbridge-related-attribute.m
@@ -5,10 +5,10 @@ typedef struct __attribute__((objc_bridge_related(NSColor,,CGColor))) CGColor *C
typedef struct __attribute__((objc_bridge_related(NSColor,,))) CGColor *CGColorRef2Ok;
typedef struct __attribute__((objc_bridge_related(NSColor,colorWithCGColor:,))) CGColor *CGColorRef3Ok;
-typedef struct __attribute__((objc_bridge_related(,colorWithCGColor:,CGColor))) CGColor *CGColorRef1NotOk; // expected-error {{expected a related ObjectiveC class name, e.g., 'NSColor'}}
+typedef struct __attribute__((objc_bridge_related(,colorWithCGColor:,CGColor))) CGColor *CGColorRef1NotOk; // expected-error {{expected a related Objective-C class name, e.g., 'NSColor'}}
typedef struct __attribute__((objc_bridge_related(NSColor,colorWithCGColor,CGColor))) CGColor *CGColorRef2NotOk; // expected-error {{expected a class method selector with single argument, e.g., 'colorWithCGColor:'}}
typedef struct __attribute__((objc_bridge_related(NSColor,colorWithCGColor::,CGColor))) CGColor *CGColorRef3NotOk; // expected-error {{expected a class method selector with single argument, e.g., 'colorWithCGColor:'}}
-typedef struct __attribute__((objc_bridge_related(12,colorWithCGColor:,CGColor))) CGColor *CGColorRef4NotOk; // expected-error {{expected a related ObjectiveC class name, e.g., 'NSColor'}}
+typedef struct __attribute__((objc_bridge_related(12,colorWithCGColor:,CGColor))) CGColor *CGColorRef4NotOk; // expected-error {{expected a related Objective-C class name, e.g., 'NSColor'}}
typedef struct __attribute__((objc_bridge_related(NSColor,+:,CGColor))) CGColor *CGColorRef5NotOk; // expected-error {{expected ','}}
typedef struct __attribute__((objc_bridge_related(NSColor,colorWithCGColor:,+))) CGColor *CGColorRef6NotOk; // expected-error {{expected ')'}}
diff --git a/clang/test/Parser/pragma-attribute.cpp b/clang/test/Parser/pragma-attribute.cpp
index bc8e7b9..6377fc7 100644
--- a/clang/test/Parser/pragma-attribute.cpp
+++ b/clang/test/Parser/pragma-attribute.cpp
@@ -127,7 +127,7 @@ void function();
// expected-error@-1 {{attribute 'objc_bridge_related' can't be applied to 'function'}}
#pragma clang attribute pop
-#pragma clang attribute push (__attribute__((objc_bridge_related(1))), apply_to=function) // expected-error {{expected a related ObjectiveC class name, e.g., 'NSColor'}}
+#pragma clang attribute push (__attribute__((objc_bridge_related(1))), apply_to=function) // expected-error {{expected a related Objective-C class name, e.g., 'NSColor'}}
#pragma clang attribute push (__attribute__((used)), apply_to=function) // expected-error {{attribute 'used' is not supported by '#pragma clang attribute'}}
diff --git a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c
index ca51f2f..f0a2ef8 100644
--- a/clang/test/Preprocessor/predefined-arch-macros.c
+++ b/clang/test/Preprocessor/predefined-arch-macros.c
@@ -793,9 +793,7 @@
// CHECK_KNL_M32: #define __AES__ 1
// CHECK_KNL_M32: #define __AVX2__ 1
// CHECK_KNL_M32: #define __AVX512CD__ 1
-// CHECK_KNL_M32: #define __AVX512ER__ 1
// CHECK_KNL_M32: #define __AVX512F__ 1
-// CHECK_KNL_M32: #define __AVX512PF__ 1
// CHECK_KNL_M32: #define __AVX__ 1
// CHECK_KNL_M32: #define __BMI2__ 1
// CHECK_KNL_M32: #define __BMI__ 1
@@ -808,7 +806,6 @@
// CHECK_KNL_M32: #define __MOVBE__ 1
// CHECK_KNL_M32: #define __PCLMUL__ 1
// CHECK_KNL_M32: #define __POPCNT__ 1
-// CHECK_KNL_M32: #define __PREFETCHWT1__ 1
// CHECK_KNL_M32: #define __PRFCHW__ 1
// CHECK_KNL_M32: #define __RDRND__ 1
// CHECK_KNL_M32: #define __SSE2__ 1
@@ -832,9 +829,7 @@
// CHECK_KNL_M64: #define __AES__ 1
// CHECK_KNL_M64: #define __AVX2__ 1
// CHECK_KNL_M64: #define __AVX512CD__ 1
-// CHECK_KNL_M64: #define __AVX512ER__ 1
// CHECK_KNL_M64: #define __AVX512F__ 1
-// CHECK_KNL_M64: #define __AVX512PF__ 1
// CHECK_KNL_M64: #define __AVX__ 1
// CHECK_KNL_M64: #define __BMI2__ 1
// CHECK_KNL_M64: #define __BMI__ 1
@@ -847,7 +842,6 @@
// CHECK_KNL_M64: #define __MOVBE__ 1
// CHECK_KNL_M64: #define __PCLMUL__ 1
// CHECK_KNL_M64: #define __POPCNT__ 1
-// CHECK_KNL_M64: #define __PREFETCHWT1__ 1
// CHECK_KNL_M64: #define __PRFCHW__ 1
// CHECK_KNL_M64: #define __RDRND__ 1
// CHECK_KNL_M64: #define __SSE2_MATH__ 1
@@ -874,9 +868,7 @@
// CHECK_KNM_M32: #define __AES__ 1
// CHECK_KNM_M32: #define __AVX2__ 1
// CHECK_KNM_M32: #define __AVX512CD__ 1
-// CHECK_KNM_M32: #define __AVX512ER__ 1
// CHECK_KNM_M32: #define __AVX512F__ 1
-// CHECK_KNM_M32: #define __AVX512PF__ 1
// CHECK_KNM_M32: #define __AVX512VPOPCNTDQ__ 1
// CHECK_KNM_M32: #define __AVX__ 1
// CHECK_KNM_M32: #define __BMI2__ 1
@@ -890,7 +882,6 @@
// CHECK_KNM_M32: #define __MOVBE__ 1
// CHECK_KNM_M32: #define __PCLMUL__ 1
// CHECK_KNM_M32: #define __POPCNT__ 1
-// CHECK_KNM_M32: #define __PREFETCHWT1__ 1
// CHECK_KNM_M32: #define __PRFCHW__ 1
// CHECK_KNM_M32: #define __RDRND__ 1
// CHECK_KNM_M32: #define __SSE2__ 1
@@ -911,9 +902,7 @@
// CHECK_KNM_M64: #define __AES__ 1
// CHECK_KNM_M64: #define __AVX2__ 1
// CHECK_KNM_M64: #define __AVX512CD__ 1
-// CHECK_KNM_M64: #define __AVX512ER__ 1
// CHECK_KNM_M64: #define __AVX512F__ 1
-// CHECK_KNM_M64: #define __AVX512PF__ 1
// CHECK_KNM_M64: #define __AVX512VPOPCNTDQ__ 1
// CHECK_KNM_M64: #define __AVX__ 1
// CHECK_KNM_M64: #define __BMI2__ 1
@@ -927,7 +916,6 @@
// CHECK_KNM_M64: #define __MOVBE__ 1
// CHECK_KNM_M64: #define __PCLMUL__ 1
// CHECK_KNM_M64: #define __POPCNT__ 1
-// CHECK_KNM_M64: #define __PREFETCHWT1__ 1
// CHECK_KNM_M64: #define __PRFCHW__ 1
// CHECK_KNM_M64: #define __RDRND__ 1
// CHECK_KNM_M64: #define __SSE2_MATH__ 1
diff --git a/clang/test/Preprocessor/stdc-ms-extension.cpp b/clang/test/Preprocessor/stdc-ms-extension.cpp
new file mode 100644
index 0000000..6e9fa60
--- /dev/null
+++ b/clang/test/Preprocessor/stdc-ms-extension.cpp
@@ -0,0 +1,9 @@
+// RUN: %clang_cl /TC /dev/null /E -Xclang -dM 2> /dev/null | FileCheck -match-full-lines %s --check-prefix=NOSTDC
+// RUN: %clang_cl /TC /dev/null /E -Xclang -dM /Zc:__STDC__ 2> /dev/null | FileCheck -match-full-lines %s --check-prefix=YESSTDC
+// __STDC__ should never be defined in C++ mode with fms-compatibility.
+// RUN: %clang_cl /dev/null /E -Xclang -dM 2>&1 | FileCheck %s --check-prefix=NOSTDC
+// RUN: %clang_cl /dev/null /E -Xclang -dM /Zc:__STDC__ 2>&1 | FileCheck %s --check-prefix=ZCSTDCIGNORED
+// YESSTDC: #define __STDC__ 1
+// NOSTDC-NOT: #define __STDC__ 1
+// ZCSTDCIGNORED-NOT: #define __STDC__ 1
+// ZCSTDCIGNORED: argument unused during compilation
diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c
index 57104c9..7567267b 100644
--- a/clang/test/Preprocessor/x86_target_features.c
+++ b/clang/test/Preprocessor/x86_target_features.c
@@ -90,38 +90,6 @@
// AVX512CD: #define __SSE__ 1
// AVX512CD: #define __SSSE3__ 1
-// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512er -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512ER %s
-
-// AVX512ER: #define __AVX2__ 1
-// AVX512ER: #define __AVX512ER__ 1
-// AVX512ER: #define __AVX512F__ 1
-// AVX512ER: #define __AVX__ 1
-// AVX512ER: #define __EVEX512__ 1
-// AVX512ER: #define __SSE2_MATH__ 1
-// AVX512ER: #define __SSE2__ 1
-// AVX512ER: #define __SSE3__ 1
-// AVX512ER: #define __SSE4_1__ 1
-// AVX512ER: #define __SSE4_2__ 1
-// AVX512ER: #define __SSE_MATH__ 1
-// AVX512ER: #define __SSE__ 1
-// AVX512ER: #define __SSSE3__ 1
-
-// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512pf -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512PF %s
-
-// AVX512PF: #define __AVX2__ 1
-// AVX512PF: #define __AVX512F__ 1
-// AVX512PF: #define __AVX512PF__ 1
-// AVX512PF: #define __AVX__ 1
-// AVX512PF: #define __EVEX512__ 1
-// AVX512PF: #define __SSE2_MATH__ 1
-// AVX512PF: #define __SSE2__ 1
-// AVX512PF: #define __SSE3__ 1
-// AVX512PF: #define __SSE4_1__ 1
-// AVX512PF: #define __SSE4_2__ 1
-// AVX512PF: #define __SSE_MATH__ 1
-// AVX512PF: #define __SSE__ 1
-// AVX512PF: #define __SSSE3__ 1
-
// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512dq -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512DQ %s
// AVX512DQ: #define __AVX2__ 1
@@ -171,22 +139,6 @@
// AVX512VL: #define __SSE__ 1
// AVX512VL: #define __SSSE3__ 1
-// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512pf -mno-avx512f -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512F2 %s
-
-// AVX512F2: #define __AVX2__ 1
-// AVX512F2-NOT: #define __AVX512F__ 1
-// AVX512F2-NOT: #define __AVX512PF__ 1
-// AVX512F2-NOT: #define __EVEX512__ 1
-// AVX512F2: #define __AVX__ 1
-// AVX512F2: #define __SSE2_MATH__ 1
-// AVX512F2: #define __SSE2__ 1
-// AVX512F2: #define __SSE3__ 1
-// AVX512F2: #define __SSE4_1__ 1
-// AVX512F2: #define __SSE4_2__ 1
-// AVX512F2: #define __SSE_MATH__ 1
-// AVX512F2: #define __SSE__ 1
-// AVX512F2: #define __SSSE3__ 1
-
// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512ifma -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512IFMA %s
// AVX512IFMA: #define __AVX2__ 1
@@ -640,14 +592,12 @@
// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512f -mno-avx512f -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOEVEX512 %s
// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512cd -mno-avx512f -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOEVEX512 %s
-// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512er -mno-avx512f -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOEVEX512 %s
// NOEVEX512-NOT: #define __AVX512F__ 1
// NOEVEX512-NOT: #define __EVEX256__ 1
// NOEVEX512-NOT: #define __EVEX512__ 1
// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512f -mno-evex512 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512NOEVEX512 %s
// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512cd -mno-evex512 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512NOEVEX512 %s
-// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512er -mno-evex512 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512NOEVEX512 %s
// AVX512NOEVEX512: #define __AVX512F__ 1
// AVX512NOEVEX512-NOT: #define __EVEX256__ 1
// AVX512NOEVEX512-NOT: #define __EVEX512__ 1
diff --git a/clang/test/Profile/misexpect-branch.c b/clang/test/Profile/misexpect-branch.c
index ce46b46..5c43944 100644
--- a/clang/test/Profile/misexpect-branch.c
+++ b/clang/test/Profile/misexpect-branch.c
@@ -26,10 +26,10 @@ int buzz();
const int inner_loop = 100;
const int outer_loop = 2000;
-int bar() { // imprecise-warning-re {{Potential performance regression from use of __builtin_expect(): Annotation was correct on {{.+}}% ({{[0-9]+ / [0-9]+}}) of profiled executions.}}
+int bar() { // imprecise-warning-re {{potential performance regression from use of __builtin_expect(): annotation was correct on {{.+}}% ({{[0-9]+ / [0-9]+}}) of profiled executions}}
int rando = buzz();
int x = 0;
- if (likely(rando % (outer_loop * inner_loop) == 0)) { // exact-warning-re {{Potential performance regression from use of __builtin_expect(): Annotation was correct on {{.+}}% ({{[0-9]+ / [0-9]+}}) of profiled executions.}}
+ if (likely(rando % (outer_loop * inner_loop) == 0)) { // exact-warning-re {{potential performance regression from use of __builtin_expect(): annotation was correct on {{.+}}% ({{[0-9]+ / [0-9]+}}) of profiled executions}}
x = baz(rando);
} else {
x = foo(50);
@@ -37,10 +37,10 @@ int bar() { // imprecise-warning-re {{Potential performance regression from use
return x;
}
-int fizz() { // imprecise-warning-re {{Potential performance regression from use of __builtin_expect(): Annotation was correct on {{.+}}% ({{[0-9]+ / [0-9]+}}) of profiled executions.}}
+int fizz() { // imprecise-warning-re {{potential performance regression from use of __builtin_expect(): annotation was correct on {{.+}}% ({{[0-9]+ / [0-9]+}}) of profiled executions}}
int rando = buzz();
int x = 0;
- if (unlikely(rando % (outer_loop * inner_loop) == 0)) { // exact-warning-re {{Potential performance regression from use of __builtin_expect(): Annotation was correct on {{.+}}% ({{[0-9]+ / [0-9]+}}) of profiled executions.}}
+ if (unlikely(rando % (outer_loop * inner_loop) == 0)) { // exact-warning-re {{potential performance regression from use of __builtin_expect(): annotation was correct on {{.+}}% ({{[0-9]+ / [0-9]+}}) of profiled executions}}
x = baz(rando);
} else {
x = foo(50);
diff --git a/clang/test/Profile/misexpect-switch-default.c b/clang/test/Profile/misexpect-switch-default.c
index 033490e..cd337b9 100644
--- a/clang/test/Profile/misexpect-switch-default.c
+++ b/clang/test/Profile/misexpect-switch-default.c
@@ -20,7 +20,7 @@ int main() {
int j;
for (j = 0; j < outer_loop * inner_loop; ++j) {
unsigned condition = rand() % 5;
- switch (__builtin_expect(condition, 6)) { // expected-warning-re {{Potential performance regression from use of __builtin_expect(): Annotation was correct on {{.+}}% ({{[0-9]+ / [0-9]+}}) of profiled executions.}}
+ switch (__builtin_expect(condition, 6)) { // expected-warning-re {{potential performance regression from use of __builtin_expect(): annotation was correct on {{.+}}% ({{[0-9]+ / [0-9]+}}) of profiled executions}}
case 0:
val += sum(arry, arry_size);
break;
diff --git a/clang/test/Profile/misexpect-switch.c b/clang/test/Profile/misexpect-switch.c
index 8ca8a15..84a7174 100644
--- a/clang/test/Profile/misexpect-switch.c
+++ b/clang/test/Profile/misexpect-switch.c
@@ -20,7 +20,7 @@ int main() {
for (j = 0; j < outer_loop; ++j) {
for (k = 0; k < inner_loop; ++k) {
unsigned condition = rand() % 10000;
- switch (__builtin_expect(condition, 0)) { // expected-warning-re {{Potential performance regression from use of __builtin_expect(): Annotation was correct on {{.+}}% ({{[0-9]+ / [0-9]+}}) of profiled executions.}}
+ switch (__builtin_expect(condition, 0)) { // expected-warning-re {{potential performance regression from use of __builtin_expect(): annotation was correct on {{.+}}% ({{[0-9]+ / [0-9]+}}) of profiled executions}}
case 0:
val += sum(arry, arry_size);
break;
diff --git a/clang/test/Sema/attr-assume.c b/clang/test/Sema/attr-assume.c
deleted file mode 100644
index 98deffa..0000000
--- a/clang/test/Sema/attr-assume.c
+++ /dev/null
@@ -1,14 +0,0 @@
-// RUN: %clang_cc1 -triple i386-apple-darwin9 -fsyntax-only -verify %s
-
-void f1(void) __attribute__((assume(3))); // expected-error {{expected string literal as argument of 'assume' attribute}}
-void f2(void) __attribute__((assume(int))); // expected-error {{expected string literal as argument of 'assume' attribute}}
-void f3(void) __attribute__((assume(for))); // expected-error {{expected string literal as argument of 'assume' attribute}}
-void f4(void) __attribute__((assume("QQQQ"))); // expected-warning {{unknown assumption string 'QQQQ'; attribute is potentially ignored}}
-void f5(void) __attribute__((assume("omp_no_openmp")));
-void f6(void) __attribute__((assume("omp_noopenmp"))); // expected-warning {{unknown assumption string 'omp_noopenmp' may be misspelled; attribute is potentially ignored, did you mean 'omp_no_openmp'?}}
-void f7(void) __attribute__((assume("omp_no_openmp_routine"))); // expected-warning {{unknown assumption string 'omp_no_openmp_routine' may be misspelled; attribute is potentially ignored, did you mean 'omp_no_openmp_routines'?}}
-void f8(void) __attribute__((assume("omp_no_openmp1"))); // expected-warning {{unknown assumption string 'omp_no_openmp1' may be misspelled; attribute is potentially ignored, did you mean 'omp_no_openmp'?}}
-void f9(void) __attribute__((assume("omp_no_openmp", "omp_no_openmp"))); // expected-error {{'assume' attribute takes one argument}}
-
-int g1 __attribute__((assume(0))); // expected-error {{expected string literal as argument of 'assume' attribute}}
-int g2 __attribute__((assume("omp_no_openmp"))); // expected-warning {{'assume' attribute only applies to functions and Objective-C methods}}
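
Editorial sketch (not part of this patch): the removed attr-assume.c exercised the string ("assumption") form of the attribute. Per that deleted test, a diagnostic-free use was spelled as below; whether this spelling remains accepted after this change is not shown in the hunk.

void compute(void) __attribute__((assume("omp_no_openmp")));

void compute(void) {
  /* the attribute asserts that no OpenMP runtime calls occur here */
}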
diff --git a/clang/test/Sema/attr-counted-by-late-parsed-off.c b/clang/test/Sema/attr-counted-by-late-parsed-off.c
new file mode 100644
index 0000000..34f51d1
--- /dev/null
+++ b/clang/test/Sema/attr-counted-by-late-parsed-off.c
@@ -0,0 +1,26 @@
+// RUN: %clang_cc1 -DNEEDS_LATE_PARSING -fno-experimental-late-parse-attributes -fsyntax-only -verify %s
+// RUN: %clang_cc1 -DNEEDS_LATE_PARSING -fsyntax-only -verify %s
+
+// RUN: %clang_cc1 -UNEEDS_LATE_PARSING -fno-experimental-late-parse-attributes -fsyntax-only -verify=ok %s
+// RUN: %clang_cc1 -UNEEDS_LATE_PARSING -fsyntax-only -verify=ok %s
+
+#define __counted_by(f) __attribute__((counted_by(f)))
+
+struct size_known { int dummy; };
+
+#ifdef NEEDS_LATE_PARSING
+struct on_decl {
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ struct size_known *buf __counted_by(count);
+ int count;
+};
+
+#else
+
+// ok-no-diagnostics
+struct on_decl {
+ int count;
+ struct size_known *buf __counted_by(count);
+};
+
+#endif
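
Editorial sketch (not part of this patch): the new test above shows that late parsing lets counted_by refer to a field declared later in the same struct. Assuming -fexperimental-late-parse-attributes, both orderings below are accepted; without it, the forward reference is rejected as in the NEEDS_LATE_PARSING case.

#define __counted_by(f) __attribute__((counted_by(f)))

struct elem { int payload; };

struct forward_count {                     /* needs late parsing */
  struct elem *buf __counted_by(count);
  int count;
};

struct backward_count {                    /* accepted either way */
  int count;
  struct elem *buf __counted_by(count);
};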
diff --git a/clang/test/Sema/attr-counted-by-late-parsed-struct-ptrs.c b/clang/test/Sema/attr-counted-by-late-parsed-struct-ptrs.c
new file mode 100644
index 0000000..9ff3b080
--- /dev/null
+++ b/clang/test/Sema/attr-counted-by-late-parsed-struct-ptrs.c
@@ -0,0 +1,254 @@
+// RUN: %clang_cc1 -fexperimental-late-parse-attributes -fsyntax-only -verify %s
+
+#define __counted_by(f) __attribute__((counted_by(f)))
+
+struct size_unknown;
+struct size_known {
+ int field;
+};
+
+typedef void(*fn_ptr_ty)(void);
+
+//==============================================================================
+// __counted_by on struct member pointer in decl attribute position
+//==============================================================================
+
+struct on_member_pointer_complete_ty {
+ struct size_known * buf __counted_by(count);
+ int count;
+};
+
+struct on_member_pointer_incomplete_ty {
+ struct size_unknown * buf __counted_by(count); // expected-error{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct size_unknown' is an incomplete type}}
+ int count;
+};
+
+struct on_member_pointer_const_incomplete_ty {
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'const struct size_unknown' is an incomplete type}}
+ const struct size_unknown * buf __counted_by(count);
+ int count;
+};
+
+struct on_member_pointer_void_ty {
+ void* buf __counted_by(count); // expected-error{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void' is an incomplete type}}
+ int count;
+};
+
+struct on_member_pointer_fn_ptr_ty {
+ // buffer of `count` function pointers is allowed
+ void (**fn_ptr)(void) __counted_by(count);
+ int count;
+};
+
+
+struct on_member_pointer_fn_ptr_ty_ptr_ty {
+ // buffer of `count` function pointers is allowed
+ fn_ptr_ty* fn_ptr __counted_by(count);
+ int count;
+};
+
+struct on_member_pointer_fn_ty {
+ // buffer of `count` functions is not allowed
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void (void)' is a function type}}
+ void (*fn_ptr)(void) __counted_by(count);
+ int count;
+};
+
+struct on_member_pointer_fn_ptr_ty_ty {
+ // buffer of `count` functions is not allowed
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void (void)' is a function type}}
+ fn_ptr_ty fn_ptr __counted_by(count);
+ int count;
+};
+
+struct has_unannotated_vla {
+ int count;
+ int buffer[];
+};
+
+struct on_member_pointer_struct_with_vla {
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct has_unannotated_vla' is a struct type with a flexible array member}}
+ struct has_unannotated_vla* objects __counted_by(count);
+ int count;
+};
+
+struct has_annotated_vla {
+ int count;
+ int buffer[] __counted_by(count);
+};
+
+// Currently prevented because computing the size of `objects` at runtime would
+// require an O(N) walk of `objects` to take into account the length of the VLA
+// in each struct instance.
+struct on_member_pointer_struct_with_annotated_vla {
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct has_annotated_vla' is a struct type with a flexible array member}}
+ struct has_annotated_vla* objects __counted_by(count);
+ int count;
+};
+
+struct on_pointer_anon_buf {
+ // TODO: Support referring to parent scope
+ struct {
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ struct size_known *buf __counted_by(count);
+ };
+ int count;
+};
+
+struct on_pointer_anon_count {
+ struct size_known *buf __counted_by(count);
+ struct {
+ int count;
+ };
+};
+
+//==============================================================================
+// __counted_by on struct member pointer in type attribute position
+//==============================================================================
+// TODO: Correctly parse counted_by as a type attribute. Currently it is parsed
+// as a declaration attribute and is **not** late parsed resulting in the `count`
+// field being unavailable.
+
+struct on_member_pointer_complete_ty_ty_pos {
+ // TODO: Allow this
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ struct size_known *__counted_by(count) buf;
+ int count;
+};
+
+struct on_member_pointer_incomplete_ty_ty_pos {
+ // TODO: Allow this
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ struct size_unknown * __counted_by(count) buf;
+ int count;
+};
+
+struct on_member_pointer_const_incomplete_ty_ty_pos {
+ // TODO: Allow this
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ const struct size_unknown * __counted_by(count) buf;
+ int count;
+};
+
+struct on_member_pointer_void_ty_ty_pos {
+ // TODO: This should fail because the attribute is
+ // on a pointer with the pointee being an incomplete type.
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ void *__counted_by(count) buf;
+ int count;
+};
+
+// -
+
+struct on_member_pointer_fn_ptr_ty_pos {
+ // TODO: buffer of `count` function pointers should be allowed
+ // but fails because this isn't late parsed.
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ void (** __counted_by(count) fn_ptr)(void);
+ int count;
+};
+
+struct on_member_pointer_fn_ptr_ty_ptr_ty_pos {
+ // TODO: buffer of `count` function pointers should be allowed
+ // but fails because this isn't late parsed.
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ fn_ptr_ty* __counted_by(count) fn_ptr;
+ int count;
+};
+
+struct on_member_pointer_fn_ty_ty_pos {
+ // TODO: This should fail because the attribute is
+ // on a pointer with the pointee being a function type.
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ void (* __counted_by(count) fn_ptr)(void);
+ int count;
+};
+
+struct on_member_pointer_fn_ptr_ty_ty_pos {
+ // TODO: buffer of `count` function pointers should be allowed
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ void (** __counted_by(count) fn_ptr)(void);
+ int count;
+};
+
+struct on_member_pointer_fn_ptr_ty_typedef_ty_pos {
+ // TODO: This should fail because the attribute is
+ // on a pointer with the pointee being a function type.
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ fn_ptr_ty __counted_by(count) fn_ptr;
+ int count;
+};
+
+struct on_member_pointer_fn_ptr_ty_ty_pos_inner {
+ // TODO: This should fail because the attribute is
+ // on a pointer with the pointee being a function type.
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ void (* __counted_by(count) * fn_ptr)(void);
+ int count;
+};
+
+struct on_member_pointer_struct_with_vla_ty_pos {
+ // TODO: This should fail because the attribute is
+ // on a pointer with the pointee being a struct type with a VLA.
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ struct has_unannotated_vla *__counted_by(count) objects;
+ int count;
+};
+
+struct on_member_pointer_struct_with_annotated_vla_ty_pos {
+ // TODO: This should fail because the attribute is
+ // on a pointer with the pointee being a struct type with a VLA.
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ struct has_annotated_vla* __counted_by(count) objects;
+ int count;
+};
+
+struct on_nested_pointer_inner {
+ // TODO: This should be disallowed because in the `-fbounds-safety` model
+ // `__counted_by` can only be nested when used in function parameters.
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ struct size_known *__counted_by(count) *buf;
+ int count;
+};
+
+struct on_nested_pointer_outer {
+ // TODO: Allow this
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ struct size_known **__counted_by(count) buf;
+ int count;
+};
+
+struct on_pointer_anon_buf_ty_pos {
+ struct {
+ // TODO: Support referring to parent scope
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ struct size_known * __counted_by(count) buf;
+ };
+ int count;
+};
+
+struct on_pointer_anon_count_ty_pos {
+ // TODO: Allow this
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ struct size_known *__counted_by(count) buf;
+ struct {
+ int count;
+ };
+};
+
+//==============================================================================
+// __counted_by on struct non-pointer members
+//==============================================================================
+
+struct on_pod_ty {
+ // expected-error@+1{{'counted_by' only applies to pointers or C99 flexible array members}}
+ int wrong_ty __counted_by(count);
+ int count;
+};
+
+struct on_void_ty {
+ // expected-error@+2{{'counted_by' only applies to pointers or C99 flexible array members}}
+ // expected-error@+1{{field has incomplete type 'void'}}
+ void wrong_ty __counted_by(count);
+ int count;
+};
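
Editorial sketch (not part of this patch): the test above enforces that the pointee of a __counted_by pointer must have a known size, so that the bound (count * sizeof(*buf)) is computable; incomplete types, function types, and structs with flexible array members are rejected as pointees.

#define __counted_by(f) __attribute__((counted_by(f)))

struct item { int value; };        /* complete type: allowed as pointee */
struct opaque;                     /* incomplete type: rejected as pointee */

struct ring {
  int count;
  struct item *slots __counted_by(count);   /* bound is count * sizeof(struct item) */
};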
diff --git a/clang/test/Sema/attr-counted-by-struct-ptrs-sizeless-types.c b/clang/test/Sema/attr-counted-by-struct-ptrs-sizeless-types.c
new file mode 100644
index 0000000..9b0f2ea
--- /dev/null
+++ b/clang/test/Sema/attr-counted-by-struct-ptrs-sizeless-types.c
@@ -0,0 +1,17 @@
+// __SVInt8_t is specific to ARM64 so specify that in the target triple
+// RUN: %clang_cc1 -triple arm64-apple-darwin -fsyntax-only -verify %s
+
+#define __counted_by(f) __attribute__((counted_by(f)))
+
+struct on_sizeless_pointee_ty {
+ int count;
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because '__SVInt8_t' is a sizeless type}}
+ __SVInt8_t* member __counted_by(count);
+};
+
+struct on_sizeless_ty {
+ int count;
+ // expected-error@+2{{'counted_by' only applies to pointers or C99 flexible array members}}
+ // expected-error@+1{{field has sizeless type '__SVInt8_t'}}
+ __SVInt8_t member __counted_by(count);
+};
diff --git a/clang/test/Sema/attr-counted-by-struct-ptrs.c b/clang/test/Sema/attr-counted-by-struct-ptrs.c
new file mode 100644
index 0000000..cd2bfe3
--- /dev/null
+++ b/clang/test/Sema/attr-counted-by-struct-ptrs.c
@@ -0,0 +1,224 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+#define __counted_by(f) __attribute__((counted_by(f)))
+
+struct size_unknown;
+struct size_known {
+ int field;
+};
+
+typedef void(*fn_ptr_ty)(void);
+
+//==============================================================================
+// __counted_by on struct member pointer in decl attribute position
+//==============================================================================
+
+struct on_member_pointer_complete_ty {
+ int count;
+ struct size_known * buf __counted_by(count);
+};
+
+struct on_member_pointer_incomplete_ty {
+ int count;
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct size_unknown' is an incomplete type}}
+ struct size_unknown * buf __counted_by(count);
+};
+
+struct on_member_pointer_const_incomplete_ty {
+ int count;
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'const struct size_unknown' is an incomplete type}}
+ const struct size_unknown * buf __counted_by(count);
+};
+
+struct on_member_pointer_void_ty {
+ int count;
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void' is an incomplete type}}
+ void* buf __counted_by(count);
+};
+
+struct on_member_pointer_fn_ptr_ty {
+ int count;
+ // buffer of `count` function pointers is allowed
+ void (**fn_ptr)(void) __counted_by(count);
+};
+
+struct on_member_pointer_fn_ptr_ty_ptr_ty {
+ int count;
+ // buffer of `count` function pointers is allowed
+ fn_ptr_ty* fn_ptr __counted_by(count);
+};
+
+struct on_member_pointer_fn_ty {
+ int count;
+ // buffer of `count` functions is not allowed
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void (void)' is a function type}}
+ void (*fn_ptr)(void) __counted_by(count);
+};
+
+struct on_member_pointer_fn_ptr_ty_ty {
+ int count;
+ // buffer of `count` functions is not allowed
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void (void)' is a function type}}
+ fn_ptr_ty fn_ptr __counted_by(count);
+};
+
+struct has_unannotated_vla {
+ int count;
+ int buffer[];
+};
+
+struct on_member_pointer_struct_with_vla {
+ int count;
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct has_unannotated_vla' is a struct type with a flexible array member}}
+ struct has_unannotated_vla* objects __counted_by(count);
+};
+
+struct has_annotated_vla {
+ int count;
+ int buffer[] __counted_by(count);
+};
+
+// Currently prevented because computing the size of `objects` at runtime would
+// require an O(N) walk of `objects` to take into account the length of the VLA
+// in each struct instance.
+struct on_member_pointer_struct_with_annotated_vla {
+ int count;
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct has_annotated_vla' is a struct type with a flexible array member}}
+ struct has_annotated_vla* objects __counted_by(count);
+};
+
+struct on_pointer_anon_buf {
+ int count;
+ struct {
+ struct size_known *buf __counted_by(count);
+ };
+};
+
+struct on_pointer_anon_count {
+ struct {
+ int count;
+ };
+ struct size_known *buf __counted_by(count);
+};
+
+//==============================================================================
+// __counted_by on struct member pointer in type attribute position
+//==============================================================================
+// TODO: Correctly parse counted_by as a type attribute. Currently it is parsed
+// as a declaration attribute
+
+struct on_member_pointer_complete_ty_ty_pos {
+ int count;
+ struct size_known *__counted_by(count) buf;
+};
+
+struct on_member_pointer_incomplete_ty_ty_pos {
+ int count;
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct size_unknown' is an incomplete type}}
+ struct size_unknown * __counted_by(count) buf;
+};
+
+struct on_member_pointer_const_incomplete_ty_ty_pos {
+ int count;
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'const struct size_unknown' is an incomplete type}}
+ const struct size_unknown * __counted_by(count) buf;
+};
+
+struct on_member_pointer_void_ty_ty_pos {
+ int count;
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void' is an incomplete type}}
+ void *__counted_by(count) buf;
+};
+
+// -
+
+struct on_member_pointer_fn_ptr_ty_pos {
+ int count;
+ // buffer of `count` function pointers is allowed
+ void (** __counted_by(count) fn_ptr)(void);
+};
+
+struct on_member_pointer_fn_ptr_ty_ptr_ty_pos {
+ int count;
+ // buffer of `count` function pointers is allowed
+ fn_ptr_ty* __counted_by(count) fn_ptr;
+};
+
+struct on_member_pointer_fn_ty_ty_pos {
+ int count;
+ // buffer of `count` functions is not allowed
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void (void)' is a function type}}
+ void (* __counted_by(count) fn_ptr)(void);
+};
+
+struct on_member_pointer_fn_ptr_ty_ty_pos {
+ int count;
+ // buffer of `count` functions is not allowed
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void (void)' is a function type}}
+ fn_ptr_ty __counted_by(count) fn_ptr;
+};
+
+// TODO: This should be forbidden but isn't due to counted_by being treated
+// as a declaration attribute.
+struct on_member_pointer_fn_ptr_ty_ty_pos_inner {
+ int count;
+ void (* __counted_by(count) * fn_ptr)(void);
+};
+
+struct on_member_pointer_struct_with_vla_ty_pos {
+ int count;
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct has_unannotated_vla' is a struct type with a flexible array member}}
+ struct has_unannotated_vla *__counted_by(count) objects;
+};
+
+// Currently prevented because computing the size of `objects` at runtime would
+// require an O(N) walk of `objects` to take into account the length of the VLA
+// in each struct instance.
+struct on_member_pointer_struct_with_annotated_vla_ty_pos {
+ int count;
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct has_annotated_vla' is a struct type with a flexible array member}}
+ struct has_annotated_vla* __counted_by(count) objects;
+};
+
+struct on_nested_pointer_inner {
+ // TODO: This should be disallowed because in the `-fbounds-safety` model
+ // `__counted_by` can only be nested when used in function parameters.
+ int count;
+ struct size_known *__counted_by(count) *buf;
+};
+
+struct on_nested_pointer_outer {
+ int count;
+ struct size_known **__counted_by(count) buf;
+};
+
+struct on_pointer_anon_buf_ty_pos {
+ int count;
+ struct {
+ struct size_known * __counted_by(count) buf;
+ };
+};
+
+struct on_pointer_anon_count_ty_pos {
+ struct {
+ int count;
+ };
+ struct size_known *__counted_by(count) buf;
+};
+
+//==============================================================================
+// __counted_by on struct non-pointer members
+//==============================================================================
+
+struct on_pod_ty {
+ int count;
+ // expected-error@+1{{'counted_by' only applies to pointers or C99 flexible array members}}
+ int wrong_ty __counted_by(count);
+};
+
+struct on_void_ty {
+ int count;
+ // expected-error@+2{{'counted_by' only applies to pointers or C99 flexible array members}}
+ // expected-error@+1{{field has incomplete type 'void'}}
+ void wrong_ty __counted_by(count);
+};
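
Editorial sketch (not part of this patch): the test above checks both spellings of the annotation on struct members, after the declarator (declaration attribute position) and between the '*' and the name (type attribute position).

#define __counted_by(f) __attribute__((counted_by(f)))

struct node { int data; };

struct holder {
  int count;
  struct node *a __counted_by(count);   /* declaration-attribute position */
  struct node *__counted_by(count) b;   /* type-attribute position */
};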
diff --git a/clang/test/Sema/attr-counted-by-vla-sizeless-types.c b/clang/test/Sema/attr-counted-by-vla-sizeless-types.c
new file mode 100644
index 0000000..31c0007
--- /dev/null
+++ b/clang/test/Sema/attr-counted-by-vla-sizeless-types.c
@@ -0,0 +1,11 @@
+// __SVInt8_t is specific to ARM64 so specify that in the target triple
+// RUN: %clang_cc1 -triple arm64-apple-darwin -fsyntax-only -verify %s
+
+#define __counted_by(f) __attribute__((counted_by(f)))
+
+struct on_sizeless_elt_ty {
+ int count;
+ // expected-error@+2{{'counted_by' only applies to pointers or C99 flexible array members}}
+ // expected-error@+1{{array has sizeless element type '__SVInt8_t'}}
+ __SVInt8_t arr[] __counted_by(count);
+};
diff --git a/clang/test/Sema/attr-counted-by-vla.c b/clang/test/Sema/attr-counted-by-vla.c
new file mode 100644
index 0000000..b25f719
--- /dev/null
+++ b/clang/test/Sema/attr-counted-by-vla.c
@@ -0,0 +1,196 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+#define __counted_by(f) __attribute__((counted_by(f)))
+
+struct bar;
+
+struct not_found {
+ int count;
+ struct bar *fam[] __counted_by(bork); // expected-error {{use of undeclared identifier 'bork'}}
+};
+
+struct no_found_count_not_in_substruct {
+ unsigned long flags;
+ unsigned char count; // expected-note {{'count' declared here}}
+ struct A {
+ int dummy;
+ int array[] __counted_by(count); // expected-error {{'counted_by' field 'count' isn't within the same struct as the flexible array}}
+ } a;
+};
+
+struct not_found_count_not_in_unnamed_substruct {
+ unsigned char count; // expected-note {{'count' declared here}}
+ struct {
+ int dummy;
+ int array[] __counted_by(count); // expected-error {{'counted_by' field 'count' isn't within the same struct as the flexible array}}
+ } a;
+};
+
+struct not_found_count_not_in_unnamed_substruct_2 {
+ struct {
+ unsigned char count; // expected-note {{'count' declared here}}
+ };
+ struct {
+ int dummy;
+ int array[] __counted_by(count); // expected-error {{'counted_by' field 'count' isn't within the same struct as the flexible array}}
+ } a;
+};
+
+struct not_found_count_in_other_unnamed_substruct {
+ struct {
+ unsigned char count;
+ } a1;
+
+ struct {
+ int dummy;
+ int array[] __counted_by(count); // expected-error {{use of undeclared identifier 'count'}}
+ };
+};
+
+struct not_found_count_in_other_substruct {
+ struct _a1 {
+ unsigned char count;
+ } a1;
+
+ struct {
+ int dummy;
+ int array[] __counted_by(count); // expected-error {{use of undeclared identifier 'count'}}
+ };
+};
+
+struct not_found_count_in_other_substruct_2 {
+ struct _a2 {
+ unsigned char count;
+ } a2;
+
+ int array[] __counted_by(count); // expected-error {{use of undeclared identifier 'count'}}
+};
+
+struct not_found_suggest {
+ int bork;
+ struct bar *fam[] __counted_by(blork); // expected-error {{use of undeclared identifier 'blork'}}
+};
+
+int global; // expected-note {{'global' declared here}}
+
+struct found_outside_of_struct {
+ int bork;
+ struct bar *fam[] __counted_by(global); // expected-error {{field 'global' in 'counted_by' not inside structure}}
+};
+
+struct self_referrential {
+ int bork;
+ struct bar *self[] __counted_by(self); // expected-error {{use of undeclared identifier 'self'}}
+};
+
+struct non_int_count {
+ double dbl_count;
+ struct bar *fam[] __counted_by(dbl_count); // expected-error {{'counted_by' requires a non-boolean integer type argument}}
+};
+
+struct array_of_ints_count {
+ int integers[2];
+ struct bar *fam[] __counted_by(integers); // expected-error {{'counted_by' requires a non-boolean integer type argument}}
+};
+
+struct not_a_fam {
+ int count;
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct bar' is an incomplete type}}
+ struct bar *non_fam __counted_by(count);
+};
+
+struct not_a_c99_fam {
+ int count;
+ struct bar *non_c99_fam[0] __counted_by(count); // expected-error {{'counted_by' on arrays only applies to C99 flexible array members}}
+};
+
+struct annotated_with_anon_struct {
+ unsigned long flags;
+ struct {
+ unsigned char count;
+ int array[] __counted_by(crount); // expected-error {{use of undeclared identifier 'crount'}}
+ };
+};
+
+//==============================================================================
+// __counted_by on a struct VLA with element type that has unknown size
+//==============================================================================
+
+struct size_unknown; // expected-note 2{{forward declaration of 'struct size_unknown'}}
+struct on_member_arr_incomplete_ty_ty_pos {
+ int count;
+ // expected-error@+2{{'counted_by' only applies to pointers or C99 flexible array members}}
+ // expected-error@+1{{array has incomplete element type 'struct size_unknown'}}
+ struct size_unknown buf[] __counted_by(count);
+};
+
+struct on_member_arr_incomplete_const_ty_ty_pos {
+ int count;
+ // expected-error@+2{{'counted_by' only applies to pointers or C99 flexible array members}}
+ // expected-error@+1{{array has incomplete element type 'const struct size_unknown'}}
+ const struct size_unknown buf[] __counted_by(count);
+};
+
+struct on_member_arr_void_ty_ty_pos {
+ int count;
+ // expected-error@+2{{'counted_by' only applies to pointers or C99 flexible array members}}
+ // expected-error@+1{{array has incomplete element type 'void'}}
+ void buf[] __counted_by(count);
+};
+
+typedef void(fn_ty)(int);
+
+struct on_member_arr_fn_ptr_ty {
+ int count;
+ // An Array of function pointers is allowed
+ fn_ty* buf[] __counted_by(count);
+};
+
+struct on_member_arr_fn_ty {
+ int count;
+ // An array of functions is not allowed.
+ // expected-error@+2{{'counted_by' only applies to pointers or C99 flexible array members}}
+ // expected-error@+1{{'buf' declared as array of functions of type 'fn_ty' (aka 'void (int)')}}
+ fn_ty buf[] __counted_by(count);
+};
+
+
+// `buffer_of_structs_with_unnannotated_vla`,
+// `buffer_of_structs_with_annotated_vla`, and
+// `buffer_of_const_structs_with_annotated_vla` are currently prevented because
+// computing the size of `Arr` at runtime would require an O(N) walk of `Arr`
+// elements to take into account the length of the VLA in each struct instance.
+
+struct has_unannotated_VLA {
+ int count;
+ char buffer[];
+};
+
+struct has_annotated_VLA {
+ int count;
+ char buffer[] __counted_by(count);
+};
+
+struct buffer_of_structs_with_unnannotated_vla {
+ int count;
+ // Treating this as a warning is a temporary fix for existing attribute adopters. It **SHOULD BE AN ERROR**.
+ // expected-warning@+1{{'counted_by' should not be applied to an array with element of unknown size because 'struct has_unannotated_VLA' is a struct type with a flexible array member. This will be an error in a future compiler version}}
+ struct has_unannotated_VLA Arr[] __counted_by(count);
+};
+
+
+struct buffer_of_structs_with_annotated_vla {
+ int count;
+ // Treating this as a warning is a temporary fix for existing attribute adopters. It **SHOULD BE AN ERROR**.
+ // expected-warning@+1{{'counted_by' should not be applied to an array with element of unknown size because 'struct has_annotated_VLA' is a struct type with a flexible array member. This will be an error in a future compiler version}}
+ struct has_annotated_VLA Arr[] __counted_by(count);
+};
+
+struct buffer_of_const_structs_with_annotated_vla {
+ int count;
+ // Treating this as a warning is a temporary fix for existing attribute adopters. It **SHOULD BE AN ERROR**.
+ // Make sure the `const` qualifier is printed when printing the element type.
+ // expected-warning@+1{{'counted_by' should not be applied to an array with element of unknown size because 'const struct has_annotated_VLA' is a struct type with a flexible array member. This will be an error in a future compiler version}}
+ const struct has_annotated_VLA Arr[] __counted_by(count);
+};
+
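Editorial sketch (not part of this patch): the VLA test above covers the original counted_by use case, a C99 flexible array member whose element count lives in a non-boolean integer field of the same struct.

#define __counted_by(f) __attribute__((counted_by(f)))

struct packet {
  unsigned length;                                /* integer count field, same struct */
  unsigned char payload[] __counted_by(length);   /* flexible array member */
};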
diff --git a/clang/test/Sema/attr-counted-by.c b/clang/test/Sema/attr-counted-by.c
deleted file mode 100644
index d5d4ebf..0000000
--- a/clang/test/Sema/attr-counted-by.c
+++ /dev/null
@@ -1,112 +0,0 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s
-
-#define __counted_by(f) __attribute__((counted_by(f)))
-
-struct bar;
-
-struct not_found {
- int count;
- struct bar *fam[] __counted_by(bork); // expected-error {{use of undeclared identifier 'bork'}}
-};
-
-struct no_found_count_not_in_substruct {
- unsigned long flags;
- unsigned char count; // expected-note {{'count' declared here}}
- struct A {
- int dummy;
- int array[] __counted_by(count); // expected-error {{'counted_by' field 'count' isn't within the same struct as the flexible array}}
- } a;
-};
-
-struct not_found_count_not_in_unnamed_substruct {
- unsigned char count; // expected-note {{'count' declared here}}
- struct {
- int dummy;
- int array[] __counted_by(count); // expected-error {{'counted_by' field 'count' isn't within the same struct as the flexible array}}
- } a;
-};
-
-struct not_found_count_not_in_unnamed_substruct_2 {
- struct {
- unsigned char count; // expected-note {{'count' declared here}}
- };
- struct {
- int dummy;
- int array[] __counted_by(count); // expected-error {{'counted_by' field 'count' isn't within the same struct as the flexible array}}
- } a;
-};
-
-struct not_found_count_in_other_unnamed_substruct {
- struct {
- unsigned char count;
- } a1;
-
- struct {
- int dummy;
- int array[] __counted_by(count); // expected-error {{use of undeclared identifier 'count'}}
- };
-};
-
-struct not_found_count_in_other_substruct {
- struct _a1 {
- unsigned char count;
- } a1;
-
- struct {
- int dummy;
- int array[] __counted_by(count); // expected-error {{use of undeclared identifier 'count'}}
- };
-};
-
-struct not_found_count_in_other_substruct_2 {
- struct _a2 {
- unsigned char count;
- } a2;
-
- int array[] __counted_by(count); // expected-error {{use of undeclared identifier 'count'}}
-};
-
-struct not_found_suggest {
- int bork;
- struct bar *fam[] __counted_by(blork); // expected-error {{use of undeclared identifier 'blork'}}
-};
-
-int global; // expected-note {{'global' declared here}}
-
-struct found_outside_of_struct {
- int bork;
- struct bar *fam[] __counted_by(global); // expected-error {{field 'global' in 'counted_by' not inside structure}}
-};
-
-struct self_referrential {
- int bork;
- struct bar *self[] __counted_by(self); // expected-error {{use of undeclared identifier 'self'}}
-};
-
-struct non_int_count {
- double dbl_count;
- struct bar *fam[] __counted_by(dbl_count); // expected-error {{'counted_by' requires a non-boolean integer type argument}}
-};
-
-struct array_of_ints_count {
- int integers[2];
- struct bar *fam[] __counted_by(integers); // expected-error {{'counted_by' requires a non-boolean integer type argument}}
-};
-
-struct not_a_fam {
- int count;
- struct bar *non_fam __counted_by(count); // expected-error {{'counted_by' only applies to C99 flexible array members}}
-};
-
-struct not_a_c99_fam {
- int count;
- struct bar *non_c99_fam[0] __counted_by(count); // expected-error {{'counted_by' only applies to C99 flexible array members}}
-};
-
-struct annotated_with_anon_struct {
- unsigned long flags;
- struct {
- unsigned char count;
- int array[] __counted_by(crount); // expected-error {{use of undeclared identifier 'crount'}}
- };
-};
diff --git a/clang/test/Sema/attr-objc-bridge-related.m b/clang/test/Sema/attr-objc-bridge-related.m
index 7b2e3e5..6c7fb25 100644
--- a/clang/test/Sema/attr-objc-bridge-related.m
+++ b/clang/test/Sema/attr-objc-bridge-related.m
@@ -3,5 +3,5 @@
struct [[clang::objc_bridge_related(NSParagraphStyle,,)]] TestBridgedRef;
struct [[clang::objc_bridge_related(NSColor,colorWithCGColor:,CGColor)]] CGColorRefOk;
-struct [[clang::objc_bridge_related(,colorWithCGColor:,CGColor)]] CGColorRef1NotOk; // expected-error {{expected a related ObjectiveC class name, e.g., 'NSColor'}}
+struct [[clang::objc_bridge_related(,colorWithCGColor:,CGColor)]] CGColorRef1NotOk; // expected-error {{expected a related Objective-C class name, e.g., 'NSColor'}}
struct [[clang::objc_bridge_related(NSColor,colorWithCGColor::,CGColor)]] CGColorRef3NotOk; // expected-error {{expected a class method selector with single argument, e.g., 'colorWithCGColor:'}}
diff --git a/clang/test/Sema/builtins-x86.c b/clang/test/Sema/builtins-x86.c
index cbaf7bc..7d9cdce 100644
--- a/clang/test/Sema/builtins-x86.c
+++ b/clang/test/Sema/builtins-x86.c
@@ -106,14 +106,6 @@ __m128i test_mm_mask_i32gather_epi32(__m128i a, int const *b, __m128i c, __m128i
return __builtin_ia32_gatherd_d(a, b, c, mask, 5); // expected-error {{scale argument must be 1, 2, 4, or 8}}
}
-void _mm512_mask_prefetch_i32gather_ps(__m512i index, __mmask16 mask, int const *addr) {
- __builtin_ia32_gatherpfdps(mask, index, addr, 5, 1); // expected-error {{scale argument must be 1, 2, 4, or 8}}
-}
-
-void _mm512_mask_prefetch_i32gather_ps_2(__m512i index, __mmask16 mask, int const *addr) {
- __builtin_ia32_gatherpfdps(mask, index, addr, 1, 1); // expected-error {{argument value 1 is outside the valid range [2, 3]}}
-}
-
__m512i test_mm512_shldi_epi64(__m512i __A, __m512i __B) {
return __builtin_ia32_vpshldq512(__A, __B, 1024); // expected-error {{argument value 1024 is outside the valid range [0, 255]}}
}
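
Editorial sketch (not part of this patch): the hunk above removes tests for prefetch-gather builtins and keeps the check that the gather scale immediate must be 1, 2, 4, or 8. Assuming AVX2 is enabled (e.g. -mavx2), a valid use of the corresponding intrinsic wrapper looks like this.

#include <immintrin.h>

__m128i gather_four(const int *base, __m128i idx, __m128i src, __m128i mask) {
  return _mm_mask_i32gather_epi32(src, base, idx, mask, 4); /* scale 4 is in {1,2,4,8} */
}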
diff --git a/clang/test/Sema/builtins.c b/clang/test/Sema/builtins.c
index 3bee314..4f843ae 100644
--- a/clang/test/Sema/builtins.c
+++ b/clang/test/Sema/builtins.c
@@ -277,9 +277,9 @@ void test21(const int *ptr) {
}
void test_ei_i42i(_BitInt(42) *ptr, int value) {
- __sync_fetch_and_add(ptr, value); // expected-error {{Atomic memory operand must have a power-of-two size}}
+ __sync_fetch_and_add(ptr, value); // expected-error {{atomic memory operand must have a power-of-two size}}
// expected-warning@+1 {{the semantics of this intrinsic changed with GCC version 4.4 - the newer semantics are provided here}}
- __sync_nand_and_fetch(ptr, value); // expected-error {{Atomic memory operand must have a power-of-two size}}
+ __sync_nand_and_fetch(ptr, value); // expected-error {{atomic memory operand must have a power-of-two size}}
__atomic_fetch_add(ptr, 1, 0); // expected-error {{argument to atomic builtin of type '_BitInt' is not supported}}
}
@@ -305,9 +305,9 @@ void test_ei_ii64(int *ptr, _BitInt(64) value) {
}
void test_ei_i42i42(_BitInt(42) *ptr, _BitInt(42) value) {
- __sync_fetch_and_add(ptr, value); // expected-error {{Atomic memory operand must have a power-of-two size}}
+ __sync_fetch_and_add(ptr, value); // expected-error {{atomic memory operand must have a power-of-two size}}
// expected-warning@+1 {{the semantics of this intrinsic changed with GCC version 4.4 - the newer semantics are provided here}}
- __sync_nand_and_fetch(ptr, value); // expected-error {{Atomic memory operand must have a power-of-two size}}
+ __sync_nand_and_fetch(ptr, value); // expected-error {{atomic memory operand must have a power-of-two size}}
}
void test_ei_i64i64(_BitInt(64) *ptr, _BitInt(64) value) {
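
Editorial sketch (not part of this patch): the re-cased diagnostic above is emitted when a __sync builtin operand does not have a power-of-two size. Per these checks, a 42-bit _BitInt is rejected, while a 64-bit _BitInt has a power-of-two size and the operand-size diagnostic does not apply.

void add_ok(_BitInt(64) *p, _BitInt(64) v) {
  __sync_fetch_and_add(p, v);   /* 64-bit operand: power-of-two size */
}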
diff --git a/clang/test/Sema/constant_builtins_vector.cpp b/clang/test/Sema/constant_builtins_vector.cpp
index ddb7869..c6b1b37 100644
--- a/clang/test/Sema/constant_builtins_vector.cpp
+++ b/clang/test/Sema/constant_builtins_vector.cpp
@@ -719,7 +719,7 @@ constexpr vector4char
vectorShuffleFail1 = // expected-error {{constexpr variable 'vectorShuffleFail1'\
must be initialized by a constant expression}}
__builtin_shufflevector( // expected-error {{index for __builtin_shufflevector \
-not within the bounds of the input vectors; index of -1 found at position 0 not \
-permitted in a constexpr context.}}
+not within the bounds of the input vectors; index of -1 found at position 0 is not \
+permitted in a constexpr context}}
vector4charConst1,
vector4charConst2, -1, -1, -1, -1);
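
Editorial sketch (not part of this patch): the test above checks the reworded constexpr diagnostic for a -1 shuffle index. With Clang vector extensions, an in-range use of __builtin_shufflevector looks like this; indices must be constants in the range of the two concatenated inputs, and -1 ("don't care") is only meaningful outside constexpr evaluation.

typedef char vec4 __attribute__((ext_vector_type(4)));

vec4 reverse(vec4 a, vec4 b) {
  /* indices 0..7 select elements from the concatenation of a and b */
  return __builtin_shufflevector(a, b, 3, 2, 1, 0);
}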
diff --git a/clang/test/Sema/fmv-namespace.cpp b/clang/test/Sema/fmv-namespace.cpp
new file mode 100644
index 0000000..1c12fd6
--- /dev/null
+++ b/clang/test/Sema/fmv-namespace.cpp
@@ -0,0 +1,12 @@
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -fsyntax-only -verify %s
+// expected-no-diagnostics
+
+namespace Name {
+int __attribute((target_version("default"))) foo() { return 0; }
+}
+
+namespace Name {
+int __attribute((target_version("sve"))) foo() { return 1; }
+}
+
+int bar() { return Name::foo(); }
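
Editorial sketch (not part of this patch): the new test above checks that target_version function multi-versioning works when the default and "sve" versions are declared in separate redeclarations of the same namespace. Assuming an AArch64 target with FMV support, the basic mechanism is:

__attribute__((target_version("default"))) int scale(int x) { return x; }
__attribute__((target_version("sve")))     int scale(int x) { return 2 * x; }

int use(int x) { return scale(x); }   /* resolved to the best available version at runtime */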
diff --git a/clang/test/Sema/x86-eval-method.c b/clang/test/Sema/x86-eval-method.c
index f475b0d..e540a59 100644
--- a/clang/test/Sema/x86-eval-method.c
+++ b/clang/test/Sema/x86-eval-method.c
@@ -10,9 +10,9 @@
float add1(float a, float b, float c) {
return a + b + c;
-} // warn-warning{{Setting the floating point evaluation method to `source` on a target without SSE is not supported.}}
+} // warn-warning{{setting the floating point evaluation method to `source` on a target without SSE is not supported}}
float add2(float a, float b, float c) {
#pragma clang fp eval_method(source)
return a + b + c;
-} // warn-warning{{Setting the floating point evaluation method to `source` on a target without SSE is not supported.}}
+} // warn-warning{{setting the floating point evaluation method to `source` on a target without SSE is not supported}}
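
Editorial sketch (not part of this patch): the eval-method tests above check the re-cased warning issued when the `source` evaluation method is requested on an x86 target without SSE, where float arithmetic falls back to x87 extended precision. The pragma is used per function like this:

float sum3(float a, float b, float c) {
#pragma clang fp eval_method(source)
  return a + b + c;   /* evaluate in the source type (float) */
}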
diff --git a/clang/test/Sema/x86_64-eval-method.c b/clang/test/Sema/x86_64-eval-method.c
index dbdc1f8..fe4368a 100644
--- a/clang/test/Sema/x86_64-eval-method.c
+++ b/clang/test/Sema/x86_64-eval-method.c
@@ -10,4 +10,4 @@
float add2(float a, float b, float c) {
#pragma clang fp eval_method(source)
return a + b + c;
-} // warn-warning{{Setting the floating point evaluation method to `source` on a target without SSE is not supported.}}
+} // warn-warning{{setting the floating point evaluation method to `source` on a target without SSE is not supported}}
diff --git a/clang/test/SemaCUDA/device-var-init.cu b/clang/test/SemaCUDA/device-var-init.cu
index ee7a9e2..1555d15 100644
--- a/clang/test/SemaCUDA/device-var-init.cu
+++ b/clang/test/SemaCUDA/device-var-init.cu
@@ -13,17 +13,17 @@
#include "Inputs/cuda-initializers.h"
__shared__ int s_v_i = 1;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__device__ int d_v_f = f();
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ int s_v_f = f();
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ int c_v_f = f();
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ T s_t_i = {2};
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__device__ T d_t_i = {2};
__constant__ T c_t_i = {2};
@@ -40,175 +40,175 @@ __shared__ CGTC s_cgtc;
__constant__ CGTC c_cgtc;
__device__ EC d_ec_i(3);
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ EC s_ec_i(3);
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ EC c_ec_i(3);
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ EC d_ec_i2 = {3};
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ EC s_ec_i2 = {3};
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ EC c_ec_i2 = {3};
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ ETC d_etc_i(3);
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ ETC s_etc_i(3);
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ ETC c_etc_i(3);
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ ETC d_etc_i2 = {3};
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ ETC s_etc_i2 = {3};
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ ETC c_etc_i2 = {3};
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ UC d_uc;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ UC s_uc;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ UC c_uc;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ UD d_ud;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ UD s_ud;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ UD c_ud;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ ECI d_eci;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ ECI s_eci;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ ECI c_eci;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ NEC d_nec;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ NEC s_nec;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ NEC c_nec;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ NED d_ned;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ NED s_ned;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ NED c_ned;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ NCV d_ncv;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ NCV s_ncv;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ NCV c_ncv;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ VD d_vd;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ VD s_vd;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ VD c_vd;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ NCF d_ncf;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ NCF s_ncf;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ NCF c_ncf;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ NCFS s_ncfs;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__device__ UTC d_utc;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ UTC s_utc;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ UTC c_utc;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ UTC d_utc_i(3);
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ UTC s_utc_i(3);
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ UTC c_utc_i(3);
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ NETC d_netc;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ NETC s_netc;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ NETC c_netc;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ NETC d_netc_i(3);
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ NETC s_netc_i(3);
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ NETC c_netc_i(3);
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ EC_I_EC1 d_ec_i_ec1;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ EC_I_EC1 s_ec_i_ec1;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ EC_I_EC1 c_ec_i_ec1;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ T_V_T d_t_v_t;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ T_V_T s_t_v_t;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ T_V_T c_t_v_t;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ T_B_NEC d_t_b_nec;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ T_B_NEC s_t_b_nec;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ T_B_NEC c_t_b_nec;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ T_F_NEC d_t_f_nec;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ T_F_NEC s_t_f_nec;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ T_F_NEC c_t_f_nec;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ T_FA_NEC d_t_fa_nec;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ T_FA_NEC s_t_fa_nec;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ T_FA_NEC c_t_fa_nec;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ T_B_NED d_t_b_ned;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ T_B_NED s_t_b_ned;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ T_B_NED c_t_b_ned;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ T_F_NED d_t_f_ned;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ T_F_NED s_t_f_ned;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ T_F_NED c_t_f_ned;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ T_FA_NED d_t_fa_ned;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ T_FA_NED s_t_fa_ned;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ T_FA_NED c_t_fa_ned;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
// Verify that local variables may be static on device
// side and that they conform to the initialization constraints.
@@ -244,14 +244,14 @@ __device__ void df_sema() {
// Same test cases as for the globals above.
static __device__ int d_v_f = f();
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ int s_v_f = f();
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ int c_v_f = f();
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ T s_t_i = {2};
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __device__ T d_t_i = {2};
static __constant__ T c_t_i = {2};
@@ -260,175 +260,175 @@ __device__ void df_sema() {
static __constant__ ECD c_ecd_i;
static __device__ EC d_ec_i(3);
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ EC s_ec_i(3);
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ EC c_ec_i(3);
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ EC d_ec_i2 = {3};
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ EC s_ec_i2 = {3};
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ EC c_ec_i2 = {3};
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ ETC d_etc_i(3);
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ ETC s_etc_i(3);
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ ETC c_etc_i(3);
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ ETC d_etc_i2 = {3};
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ ETC s_etc_i2 = {3};
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ ETC c_etc_i2 = {3};
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ UC d_uc;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ UC s_uc;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ UC c_uc;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ UD d_ud;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ UD s_ud;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ UD c_ud;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ ECI d_eci;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ ECI s_eci;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ ECI c_eci;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ NEC d_nec;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ NEC s_nec;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ NEC c_nec;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ NED d_ned;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ NED s_ned;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ NED c_ned;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ NCV d_ncv;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ NCV s_ncv;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ NCV c_ncv;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ VD d_vd;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ VD s_vd;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ VD c_vd;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ NCF d_ncf;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ NCF s_ncf;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ NCF c_ncf;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ NCFS s_ncfs;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __device__ UTC d_utc;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ UTC s_utc;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ UTC c_utc;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ UTC d_utc_i(3);
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ UTC s_utc_i(3);
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ UTC c_utc_i(3);
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ NETC d_netc;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ NETC s_netc;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ NETC c_netc;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ NETC d_netc_i(3);
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ NETC s_netc_i(3);
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ NETC c_netc_i(3);
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ EC_I_EC1 d_ec_i_ec1;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ EC_I_EC1 s_ec_i_ec1;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ EC_I_EC1 c_ec_i_ec1;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ T_V_T d_t_v_t;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ T_V_T s_t_v_t;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ T_V_T c_t_v_t;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ T_B_NEC d_t_b_nec;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ T_B_NEC s_t_b_nec;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ T_B_NEC c_t_b_nec;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ T_F_NEC d_t_f_nec;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ T_F_NEC s_t_f_nec;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ T_F_NEC c_t_f_nec;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ T_FA_NEC d_t_fa_nec;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ T_FA_NEC s_t_fa_nec;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ T_FA_NEC c_t_fa_nec;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ T_B_NED d_t_b_ned;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ T_B_NED s_t_b_ned;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ T_B_NED c_t_b_ned;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ T_F_NED d_t_f_ned;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ T_F_NED s_t_f_ned;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ T_F_NED c_t_f_ned;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ T_FA_NED d_t_fa_ned;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ T_FA_NED s_t_fa_ned;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ T_FA_NED c_t_fa_ned;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
}
__host__ __device__ void hd_sema() {
@@ -449,7 +449,7 @@ struct NontrivialInitializer {
template <typename T>
__global__ void bar() {
__shared__ T bad;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
for (int i = 0; i < 10; i++) {
static __device__ CEEC sd_ceec;
static __shared__ CEEC ss_ceec;
@@ -467,7 +467,7 @@ __global__ void bar() {
template <>
__global__ void bar<int>() {
__shared__ NontrivialInitializer bad;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
for (int i = 0; i < 10; i++) {
static __device__ CEEC sd_ceec;
static __shared__ CEEC ss_ceec;
diff --git a/clang/test/SemaCUDA/function-overload.cu b/clang/test/SemaCUDA/function-overload.cu
index 163648c..4710c81 100644
--- a/clang/test/SemaCUDA/function-overload.cu
+++ b/clang/test/SemaCUDA/function-overload.cu
@@ -469,7 +469,7 @@ int test_constexpr_overload(C2 &x, C2 &y) {
// Verify no ambiguity for new operator.
void *a = new int;
__device__ void *b = new int;
-// expected-error@-1{{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1{{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
// Verify no ambiguity for new operator.
template<typename _Tp> _Tp&& f();
diff --git a/clang/test/SemaCUDA/union-init.cu b/clang/test/SemaCUDA/union-init.cu
index 9e4d14a..dd4b129 100644
--- a/clang/test/SemaCUDA/union-init.cu
+++ b/clang/test/SemaCUDA/union-init.cu
@@ -31,14 +31,14 @@ union D {
__device__ B b;
__device__ C c;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ D d;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ void foo() {
__shared__ B b;
__shared__ C c;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
__shared__ D d;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
}
diff --git a/clang/test/SemaCXX/MicrosoftExtensions.cpp b/clang/test/SemaCXX/MicrosoftExtensions.cpp
index 7286217..98c1997 100644
--- a/clang/test/SemaCXX/MicrosoftExtensions.cpp
+++ b/clang/test/SemaCXX/MicrosoftExtensions.cpp
@@ -571,11 +571,17 @@ class PR34109_class {
virtual ~PR34109_class() {}
};
+#if !defined(__cpp_sized_deallocation)
void operator delete(void *) throw();
// expected-note@-1 {{previous declaration is here}}
__declspec(dllexport) void operator delete(void *) throw();
// expected-error@-1 {{redeclaration of 'operator delete' cannot add 'dllexport' attribute}}
-
+#else
+void operator delete(void *, unsigned int) throw();
+// expected-note@-1 {{previous declaration is here}}
+__declspec(dllexport) void operator delete(void *, unsigned int) throw();
+// expected-error@-1 {{redeclaration of 'operator delete' cannot add 'dllexport' attribute}}
+#endif
void PR34109(int* a) {
delete a;
}
diff --git a/clang/test/SemaCXX/addr-label-in-coroutines.cpp b/clang/test/SemaCXX/addr-label-in-coroutines.cpp
index e37ee64..65d7863 100644
--- a/clang/test/SemaCXX/addr-label-in-coroutines.cpp
+++ b/clang/test/SemaCXX/addr-label-in-coroutines.cpp
@@ -13,9 +13,9 @@ struct resumable {
};
resumable f1(int &out, int *inst) {
- static void* dispatch_table[] = {&&inc, // expected-error {{the GNU address of label extension is not allowed in coroutines.}}
- &&suspend, // expected-error {{the GNU address of label extension is not allowed in coroutines.}}
- &&stop}; // expected-error {{the GNU address of label extension is not allowed in coroutines.}}
+ static void* dispatch_table[] = {&&inc, // expected-error {{the GNU address of label extension is not allowed in coroutines}}
+ &&suspend, // expected-error {{the GNU address of label extension is not allowed in coroutines}}
+ &&stop}; // expected-error {{the GNU address of label extension is not allowed in coroutines}}
#define DISPATCH() goto *dispatch_table[*inst++]
inc:
out++;
@@ -31,9 +31,9 @@ stop:
resumable f2(int &out, int *inst) {
void* dispatch_table[] = {nullptr, nullptr, nullptr};
- dispatch_table[0] = &&inc; // expected-error {{the GNU address of label extension is not allowed in coroutines.}}
- dispatch_table[1] = &&suspend; // expected-error {{the GNU address of label extension is not allowed in coroutines.}}
- dispatch_table[2] = &&stop; // expected-error {{the GNU address of label extension is not allowed in coroutines.}}
+ dispatch_table[0] = &&inc; // expected-error {{the GNU address of label extension is not allowed in coroutines}}
+ dispatch_table[1] = &&suspend; // expected-error {{the GNU address of label extension is not allowed in coroutines}}
+ dispatch_table[2] = &&stop; // expected-error {{the GNU address of label extension is not allowed in coroutines}}
#define DISPATCH() goto *dispatch_table[*inst++]
inc:
out++;
@@ -50,9 +50,9 @@ stop:
resumable f3(int &out, int *inst) {
void* dispatch_table[] = {nullptr, nullptr, nullptr};
[&]() -> resumable {
- dispatch_table[0] = &&inc; // expected-error {{the GNU address of label extension is not allowed in coroutines.}}
- dispatch_table[1] = &&suspend; // expected-error {{the GNU address of label extension is not allowed in coroutines.}}
- dispatch_table[2] = &&stop; // expected-error {{the GNU address of label extension is not allowed in coroutines.}}
+ dispatch_table[0] = &&inc; // expected-error {{the GNU address of label extension is not allowed in coroutines}}
+ dispatch_table[1] = &&suspend; // expected-error {{the GNU address of label extension is not allowed in coroutines}}
+ dispatch_table[2] = &&stop; // expected-error {{the GNU address of label extension is not allowed in coroutines}}
#define DISPATCH() goto *dispatch_table[*inst++]
inc:
out++;
diff --git a/clang/test/SemaCXX/builtin-operator-new-delete.cpp b/clang/test/SemaCXX/builtin-operator-new-delete.cpp
index 6fcff92..db15616 100644
--- a/clang/test/SemaCXX/builtin-operator-new-delete.cpp
+++ b/clang/test/SemaCXX/builtin-operator-new-delete.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++1z -fsyntax-only -verify %s
+// RUN: %clang_cc1 -std=c++1z -fno-sized-deallocation -fsyntax-only -verify %s
// RUN: %clang_cc1 -std=c++03 -fsyntax-only -verify %s
// RUN: %clang_cc1 -std=c++03 -faligned-allocation -fsyntax-only -verify %s
// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s
diff --git a/clang/test/SemaCXX/constexpr-default-arg.cpp b/clang/test/SemaCXX/constexpr-default-arg.cpp
index ec9b292..901123b 100644
--- a/clang/test/SemaCXX/constexpr-default-arg.cpp
+++ b/clang/test/SemaCXX/constexpr-default-arg.cpp
@@ -32,8 +32,8 @@ void test_default_arg2() {
}
// Check that multiple CXXDefaultInitExprs don't cause an assertion failure.
-struct A { int &&r = 0; }; // expected-note 2{{default member initializer}}
+struct A { int &&r = 0; };
struct B { A x, y; };
-B b = {}; // expected-warning 2{{lifetime extension of temporary created by aggregate initialization using a default member initializer is not yet supported}}
+B b = {}; // expected-no-diagnostics
}
diff --git a/clang/test/SemaCXX/cxx11-default-member-initializers.cpp b/clang/test/SemaCXX/cxx11-default-member-initializers.cpp
index dd8e9c6..1ea8b98 100644
--- a/clang/test/SemaCXX/cxx11-default-member-initializers.cpp
+++ b/clang/test/SemaCXX/cxx11-default-member-initializers.cpp
@@ -27,6 +27,80 @@ class MemInit {
C m = s;
};
+namespace std {
+typedef decltype(sizeof(int)) size_t;
+
+// libc++'s implementation
+template <class _E> class initializer_list {
+ const _E *__begin_;
+ size_t __size_;
+
+ initializer_list(const _E *__b, size_t __s) : __begin_(__b), __size_(__s) {}
+
+public:
+ typedef _E value_type;
+ typedef const _E &reference;
+ typedef const _E &const_reference;
+ typedef size_t size_type;
+
+ typedef const _E *iterator;
+ typedef const _E *const_iterator;
+
+ initializer_list() : __begin_(nullptr), __size_(0) {}
+
+ size_t size() const { return __size_; }
+ const _E *begin() const { return __begin_; }
+ const _E *end() const { return __begin_ + __size_; }
+};
+} // namespace std
+
+#if __cplusplus >= 201703L
+namespace test_rebuild {
+template <typename T, int> class C {
+public:
+ C(std::initializer_list<T>);
+};
+
+template <typename T> using Ptr = __remove_pointer(T) *;
+template <typename T> C(T) -> C<Ptr<T>, sizeof(T)>;
+
+class A {
+public:
+ template <typename T1, typename T2> T1 *some_func(T2 &&);
+};
+
+struct B : A {
+ // Test CXXDefaultInitExpr rebuild issue in
+ // https://github.com/llvm/llvm-project/pull/87933
+ int *ar = some_func<int>(C{some_func<int>(0)});
+ B() {}
+};
+
+int TestBody_got;
+template <int> class Vector {
+public:
+ Vector(std::initializer_list<int>);
+};
+template <typename... Ts> Vector(Ts...) -> Vector<sizeof...(Ts)>;
+class ProgramBuilder {
+public:
+ template <typename T, typename ARGS> int *create(ARGS);
+};
+
+struct TypeTest : ProgramBuilder {
+ int *str_f16 = create<int>(Vector{0});
+ TypeTest() {}
+};
+class TypeTest_Element_Test : TypeTest {
+ void TestBody();
+};
+void TypeTest_Element_Test::TestBody() {
+ int *expect = str_f16;
+ &TestBody_got != expect; // expected-warning {{inequality comparison result unused}}
+}
+} // namespace test_rebuild
+#endif // __cplusplus >= 201703L
+
#if __cplusplus >= 202002L
// This test ensures cleanup expressions are correctly produced
// in the presence of default member initializers.
diff --git a/clang/test/SemaCXX/cxx1y-sized-deallocation.cpp b/clang/test/SemaCXX/cxx1y-sized-deallocation.cpp
index 3ec65a6..462f172 100644
--- a/clang/test/SemaCXX/cxx1y-sized-deallocation.cpp
+++ b/clang/test/SemaCXX/cxx1y-sized-deallocation.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++1y -verify %s -fsized-deallocation -fexceptions -fcxx-exceptions
+// RUN: %clang_cc1 -std=c++1y -verify %s -fexceptions -fcxx-exceptions
using size_t = decltype(sizeof(0));
void operator delete(void *, size_t) noexcept; // expected-note {{'operator delete' declared here}}
diff --git a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp
index 4c6ef5a..b71dfc6 100644
--- a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp
+++ b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp
@@ -284,7 +284,7 @@ class Foo {};
// Verify that template template type parameter TTP is referenced/used in the
// template arguments of the RHS.
template <template<typename> typename TTP>
-using Bar = Foo<K<TTP>>; // expected-note {{candidate template ignored: could not match 'Foo<K<>>' against 'int'}}
+using Bar = Foo<K<TTP>>; // expected-note {{candidate template ignored: could not match 'Foo<K<template-parameter-0-0>>' against 'int'}}
template <class T>
class Container {};
diff --git a/clang/test/SemaCXX/cxx23-assume.cpp b/clang/test/SemaCXX/cxx23-assume.cpp
index e67d72a..ea71e7b2 100644
--- a/clang/test/SemaCXX/cxx23-assume.cpp
+++ b/clang/test/SemaCXX/cxx23-assume.cpp
@@ -58,6 +58,11 @@ void g(int x) {
[[assume(true)]] while (false) {} // expected-error {{only applies to empty statements}}
[[assume(true)]] label:; // expected-error {{cannot be applied to a declaration}}
[[assume(true)]] goto label; // expected-error {{only applies to empty statements}}
+
+ // Also check variant spellings.
+ __attribute__((__assume__(true))); // Should not issue a warning because it doesn't use the [[]] spelling.
+ __attribute__((assume(true))) {}; // expected-error {{only applies to empty statements}}
+ [[clang::assume(true)]] {}; // expected-error {{only applies to empty statements}}
}
// Check that 'x' is ODR-used here.
@@ -143,3 +148,13 @@ template <bool ...val>
void f() {
[[assume(val)]]; // expected-error {{expression contains unexpanded parameter pack}}
}
+
+namespace gh71858 {
+int
+foo (int x, int y)
+{
+ __attribute__((assume(x == 42)));
+ __attribute__((assume(++y == 43))); // expected-warning {{has side effects that will be discarded}}
+ return x + y;
+}
+}
diff --git a/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp b/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp
index 07937de..b70c022 100644
--- a/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp
+++ b/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp
@@ -446,3 +446,11 @@ int h(int x) {
}
#endif
+
+
+namespace GH91308 {
+ constexpr void f(auto) {
+ static_assert(false);
+ }
+ using R1 = decltype(&f<int>);
+}
diff --git a/clang/test/SemaCXX/eval-crashes.cpp b/clang/test/SemaCXX/eval-crashes.cpp
index 017df97..a06f60f 100644
--- a/clang/test/SemaCXX/eval-crashes.cpp
+++ b/clang/test/SemaCXX/eval-crashes.cpp
@@ -25,11 +25,9 @@ namespace pr33140_0b {
}
namespace pr33140_2 {
- // FIXME: The declaration of 'b' below should lifetime-extend two int
- // temporaries.
- struct A { int &&r = 0; }; // expected-note 2{{initializing field 'r' with default member initializer}}
+ struct A { int &&r = 0; };
struct B { A x, y; };
- B b = {}; // expected-warning 2{{lifetime extension of temporary created by aggregate initialization using a default member initializer is not yet supported}}
+ B b = {};
}
namespace pr33140_3 {
diff --git a/clang/test/SemaCXX/unavailable_aligned_allocation.cpp b/clang/test/SemaCXX/unavailable_aligned_allocation.cpp
index be593ea..45fdec6 100644
--- a/clang/test/SemaCXX/unavailable_aligned_allocation.cpp
+++ b/clang/test/SemaCXX/unavailable_aligned_allocation.cpp
@@ -75,7 +75,7 @@ void testOveraligned() {
// expected-error-re@-22 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}}
// expected-note@-23 {{if you supply your own aligned allocation functions}}
-// expected-error-re@-24 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}}
+// expected-error-re@-24 {{aligned deallocation function of type 'void (void *, std::size_t, std::align_val_t) noexcept' is {{only|not}} available on}}
// expected-note@-25 {{if you supply your own aligned allocation functions}}
// expected-error-re@-26 {{aligned allocation function of type 'void *(std::size_t, std::align_val_t, const std::nothrow_t &) noexcept' is {{only|not}} available on}}
@@ -143,19 +143,19 @@ OveralignedS2::~OveralignedS2() {}
// expected-no-diagnostics
#else
#if defined(IOS)
-// expected-error@-6 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on iOS 11 or newer}}}
+// expected-error@-6 {{aligned deallocation function of type 'void (void *, std::size_t, std::align_val_t) noexcept' is only available on iOS 11 or newer}}}
// expected-note@-7 {{if you supply your own aligned allocation functions}}
#elif defined(TVOS)
-// expected-error@-9 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on tvOS 11 or newer}}}
+// expected-error@-9 {{aligned deallocation function of type 'void (void *, std::size_t, std::align_val_t) noexcept' is only available on tvOS 11 or newer}}}
// expected-note@-10 {{if you supply your own aligned allocation functions}}
#elif defined(WATCHOS)
-// expected-error@-12 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on watchOS 4 or newer}}}
+// expected-error@-12 {{aligned deallocation function of type 'void (void *, std::size_t, std::align_val_t) noexcept' is only available on watchOS 4 or newer}}}
// expected-note@-13 {{if you supply your own aligned allocation functions}}
#elif defined(MACOS)
-// expected-error@-15 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on macOS 10.13 or newer}}}
+// expected-error@-15 {{aligned deallocation function of type 'void (void *, std::size_t, std::align_val_t) noexcept' is only available on macOS 10.13 or newer}}}
// expected-note@-16 {{if you supply your own aligned allocation functions}}
#elif defined(ZOS)
-// expected-error@-18 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is not available on z/OS}}}
+// expected-error@-18 {{aligned deallocation function of type 'void (void *, std::size_t, std::align_val_t) noexcept' is not available on z/OS}}}
// expected-note@-19 {{if you supply your own aligned allocation functions}}
#endif
#endif
@@ -209,6 +209,9 @@ void *operator new(std::size_t __sz, std::align_val_t) {
void operator delete(void *p, std::align_val_t) {
}
+void operator delete(void *p, std::size_t __sz, std::align_val_t) {
+}
+
void testOveraligned2() {
auto p = new ((std::align_val_t)8) OveralignedS;
delete p;
diff --git a/clang/test/SemaCXX/warn-thread-safety-analysis.cpp b/clang/test/SemaCXX/warn-thread-safety-analysis.cpp
index 749d9e1..73cc946 100644
--- a/clang/test/SemaCXX/warn-thread-safety-analysis.cpp
+++ b/clang/test/SemaCXX/warn-thread-safety-analysis.cpp
@@ -5838,12 +5838,12 @@ class Foo5 {
class Foo6 {
- Mutex mu1 ACQUIRED_AFTER(mu3); // expected-warning {{Cycle in acquired_before/after dependencies, starting with 'mu1'}}
- Mutex mu2 ACQUIRED_AFTER(mu1); // expected-warning {{Cycle in acquired_before/after dependencies, starting with 'mu2'}}
- Mutex mu3 ACQUIRED_AFTER(mu2); // expected-warning {{Cycle in acquired_before/after dependencies, starting with 'mu3'}}
+ Mutex mu1 ACQUIRED_AFTER(mu3); // expected-warning {{cycle in acquired_before/after dependencies, starting with 'mu1'}}
+ Mutex mu2 ACQUIRED_AFTER(mu1); // expected-warning {{cycle in acquired_before/after dependencies, starting with 'mu2'}}
+ Mutex mu3 ACQUIRED_AFTER(mu2); // expected-warning {{cycle in acquired_before/after dependencies, starting with 'mu3'}}
- Mutex mu_b ACQUIRED_BEFORE(mu_b); // expected-warning {{Cycle in acquired_before/after dependencies, starting with 'mu_b'}}
- Mutex mu_a ACQUIRED_AFTER(mu_a); // expected-warning {{Cycle in acquired_before/after dependencies, starting with 'mu_a'}}
+ Mutex mu_b ACQUIRED_BEFORE(mu_b); // expected-warning {{cycle in acquired_before/after dependencies, starting with 'mu_b'}}
+ Mutex mu_a ACQUIRED_AFTER(mu_a); // expected-warning {{cycle in acquired_before/after dependencies, starting with 'mu_a'}}
void test0() {
mu_a.Lock();
diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-pragma-misuse.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-pragma-misuse.cpp
index 126257e..1066614 100644
--- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-pragma-misuse.cpp
+++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-pragma-misuse.cpp
@@ -18,8 +18,8 @@ void endUnopened(int *x) {
}
void wrongOption() {
-#pragma clang unsafe_buffer_usage start // expected-error{{Expected 'begin' or 'end'}}
-#pragma clang unsafe_buffer_usage close // expected-error{{Expected 'begin' or 'end'}}
+#pragma clang unsafe_buffer_usage start // expected-error{{expected 'begin' or 'end'}}
+#pragma clang unsafe_buffer_usage close // expected-error{{expected 'begin' or 'end'}}
}
void unclosed(int * p1) {
diff --git a/clang/test/SemaObjC/unguarded-availability.m b/clang/test/SemaObjC/unguarded-availability.m
index d0e23ea..ecd9199 100644
--- a/clang/test/SemaObjC/unguarded-availability.m
+++ b/clang/test/SemaObjC/unguarded-availability.m
@@ -177,16 +177,28 @@ void justAtAvailable(void) {
#ifdef OBJCPP
-int f(char) AVAILABLE_10_12;
+int f(char) AVAILABLE_10_12; // #f_char_def
int f(int);
template <class T> int use_f() {
- // FIXME: We should warn here!
- return f(T());
+ if (@available(macos 10.12, *)) {
+ return f(T()); // no warning expected
+ } else {
+ // expected-warning@#f_call {{'f' is only available on macOS 10.12 or newer}}
+ // expected-note@#f_char_inst {{in instantiation of function template specialization 'use_f<char>' requested here}}
+ // expected-note@#f_char_def {{'f' has been marked as being introduced in macOS 10.12 here, but the deployment target is macOS 10.9}}
+ // expected-note@#f_call {{enclose 'f' in an @available check to silence this warning}}
+ return f(T()); // #f_call
+ }
}
int a = use_f<int>();
-int b = use_f<char>();
+int b = use_f<char>(); // #f_char_inst
+
+int use_f2() AVAILABLE_10_12 {
+ int c = use_f<int>();
+ int d = use_f<char>(); // no warning expected
+}
template <class> int use_at_available() {
if (@available(macos 10.12, *))
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-gfx940-err.cl b/clang/test/SemaOpenCL/builtins-amdgcn-gfx940-err.cl
new file mode 100644
index 0000000..487cc53
--- /dev/null
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-gfx940-err.cl
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx940 -S -verify -o - %s
+// REQUIRES: amdgpu-registered-target
+
+typedef unsigned int u32;
+
+void test_global_load_lds_unsupported_size(global u32* src, local u32 *dst, u32 size) {
+ __builtin_amdgcn_global_load_lds(src, dst, size, /*offset=*/0, /*aux=*/0); // expected-error{{expression is not an integer constant expression}}
+ __builtin_amdgcn_global_load_lds(src, dst, /*size=*/5, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} expected-note {{size must be 1, 2, or 4}}
+ __builtin_amdgcn_global_load_lds(src, dst, /*size=*/0, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} expected-note {{size must be 1, 2, or 4}}
+ __builtin_amdgcn_global_load_lds(src, dst, /*size=*/3, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} expected-note {{size must be 1, 2, or 4}}
+ __builtin_amdgcn_global_load_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} expected-note {{size must be 1, 2, or 4}}
+ __builtin_amdgcn_global_load_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} expected-note {{size must be 1, 2, or 4}}
+ __builtin_amdgcn_global_load_lds(src, dst, /*size=*/-1, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} expected-note {{size must be 1, 2, or 4}}
+}
diff --git a/clang/test/SemaOpenCL/vector_swizzle_length.cl b/clang/test/SemaOpenCL/vector_swizzle_length.cl
index f36ae20..b06cc12 100644
--- a/clang/test/SemaOpenCL/vector_swizzle_length.cl
+++ b/clang/test/SemaOpenCL/vector_swizzle_length.cl
@@ -5,6 +5,6 @@ typedef float float8 __attribute__((ext_vector_type(8)));
void foo(void) {
float8 f2 = (float8)(0, 0, 0, 0, 0, 0, 0, 0);
- f2.s01234; // expected-error {{vector component access has invalid length 5. Supported: 1,2,3,4,8,16}}
- f2.xyzxy; // expected-error {{vector component access has invalid length 5. Supported: 1,2,3,4,8,16}}
+ f2.s01234; // expected-error {{vector component access has invalid length 5; supported lengths are: 1,2,3,4,8,16}}
+ f2.xyzxy; // expected-error {{vector component access has invalid length 5; supported lengths are: 1,2,3,4,8,16}}
}
diff --git a/clang/test/SemaTemplate/deduction-guide.cpp b/clang/test/SemaTemplate/deduction-guide.cpp
index a91ab5e..c38b647 100644
--- a/clang/test/SemaTemplate/deduction-guide.cpp
+++ b/clang/test/SemaTemplate/deduction-guide.cpp
@@ -100,11 +100,11 @@ using CT = C<int>;
// CHECK: | `-NonTypeTemplateParmDecl {{.*}} 'X' depth 1 index 1
// CHECK: |-TemplateTypeParmDecl {{.*}} typename depth 0 index 2 U
// CHECK: |-NonTypeTemplateParmDecl {{.*}} 'type-parameter-0-2' depth 0 index 3 V
-// CHECK: | `-TemplateArgument expr
+// CHECK: | `-TemplateArgument {{.*}} expr
// CHECK: | `-IntegerLiteral {{.*}} 'int' 0
-// CHECK: |-CXXDeductionGuideDecl {{.*}} 'auto (A, Y<>, type-parameter-0-2) -> C<A>'
+// CHECK: |-CXXDeductionGuideDecl {{.*}} 'auto (A, Y<template-parameter-0-1>, type-parameter-0-2) -> C<A>'
// CHECK: | |-ParmVarDecl {{.*}} 'A'
-// CHECK: | |-ParmVarDecl {{.*}} 'Y<>'
+// CHECK: | |-ParmVarDecl {{.*}} 'Y<template-parameter-0-1>'
// CHECK: | `-ParmVarDecl {{.*}} 'type-parameter-0-2'
// CHECK: `-CXXDeductionGuideDecl {{.*}} 'auto (int, Y<B>, int) -> C<int>'
// CHECK: |-TemplateArgument type 'int'
@@ -114,12 +114,12 @@ using CT = C<int>;
// CHECK: |-ParmVarDecl {{.*}} 'int'
// CHECK: |-ParmVarDecl {{.*}} 'Y<B>'
// CHECK: `-ParmVarDecl {{.*}} 'int'
-// CHECK: FunctionProtoType {{.*}} 'auto (A, Y<>, type-parameter-0-2) -> C<A>' dependent trailing_return cdecl
+// CHECK: FunctionProtoType {{.*}} 'auto (A, Y<template-parameter-0-1>, type-parameter-0-2) -> C<A>' dependent trailing_return cdecl
// CHECK: |-InjectedClassNameType {{.*}} 'C<A>' dependent
// CHECK: |-TemplateTypeParmType {{.*}} 'A' dependent depth 0 index 0
// CHECK: | `-TemplateTypeParm {{.*}} 'A'
-// CHECK: |-ElaboratedType {{.*}} 'Y<>' sugar dependent
-// CHECK: | `-TemplateSpecializationType {{.*}} 'Y<>' dependent Y
+// CHECK: |-ElaboratedType {{.*}} 'Y<template-parameter-0-1>' sugar dependent
+// CHECK: | `-TemplateSpecializationType {{.*}} 'Y<template-parameter-0-1>' dependent Y
// CHECK: | `-TemplateArgument template
// CHECK: `-TemplateTypeParmType {{.*}} 'type-parameter-0-2' dependent depth 0 index 2
@@ -139,7 +139,7 @@ using DT = D<int, int>;
// CHECK: |-TemplateTypeParmDecl {{.*}} typename depth 0 index 0 ... T
// CHECK: |-TemplateTypeParmDecl {{.*}} typename depth 0 index 1 U1
// CHECK: |-TemplateTypeParmDecl {{.*}} typename depth 0 index 2 U2
-// CHECK: `-CXXDeductionGuideDecl {{.*}} 'auto (B<type-parameter-0-1, type-parameter-0-2> *) -> D<T...>'
+// CHECK: `-CXXDeductionGuideDecl {{.*}} 'auto (B<type-parameter-0-1, type-parameter-0-2> *) -> D<T...>'
// CHECK: `-ParmVarDecl {{.*}} 'B<type-parameter-0-1, type-parameter-0-2> *'
// CHECK: FunctionProtoType {{.*}} 'auto (B<type-parameter-0-1, type-parameter-0-2> *) -> D<T...>' dependent trailing_return
// CHECK: |-InjectedClassNameType {{.*}} 'D<T...>' dependent
@@ -222,7 +222,7 @@ F s(0);
// CHECK-LABEL: Dumping <deduction guide for F>:
// CHECK: FunctionTemplateDecl
// CHECK: |-NonTypeTemplateParmDecl {{.*}} 'char' depth 0 index 0
-// CHECK: `-TemplateArgument expr
+// CHECK: `-TemplateArgument {{.*}} expr
// CHECK: | |-inherited from NonTypeTemplateParm {{.*}} '' 'char'
// CHECK: | `-CharacterLiteral {{.*}} 'char' 120
// CHECK: |-TemplateTypeParmDecl {{.*}} typename depth 0 index 1 U
diff --git a/clang/test/SemaTemplate/make_integer_seq.cpp b/clang/test/SemaTemplate/make_integer_seq.cpp
index 3a692f5..c5a1e27 100644
--- a/clang/test/SemaTemplate/make_integer_seq.cpp
+++ b/clang/test/SemaTemplate/make_integer_seq.cpp
@@ -61,7 +61,7 @@ using test2 = B<int, 1>;
template <template <class T, T...> class S, class T, int N> struct C {
using test3 = __make_integer_seq<S, T, N>;
-// CHECK: |-TypeAliasDecl 0x{{[0-9A-Fa-f]+}} <line:63:3, col:43> col:9 test3 '__make_integer_seq<S, T, N>':'__make_integer_seq<type-parameter-0-1, N>'
+// CHECK: |-TypeAliasDecl 0x{{[0-9A-Fa-f]+}} <line:63:3, col:43> col:9 test3 '__make_integer_seq<S, T, N>':'__make_integer_seq<template-parameter-0-0, type-parameter-0-1, N>'
// CHECK-NEXT: `-ElaboratedType 0x{{[0-9A-Fa-f]+}} '__make_integer_seq<S, T, N>' sugar dependent
// CHECK-NEXT: `-TemplateSpecializationType 0x{{[0-9A-Fa-f]+}} '__make_integer_seq<S, T, N>' sugar dependent alias __make_integer_seq
// CHECK-NEXT: |-TemplateArgument template S
@@ -71,7 +71,7 @@ template <template <class T, T...> class S, class T, int N> struct C {
// CHECK-NEXT: |-TemplateArgument expr
// CHECK-NEXT: | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} <col:42> 'T' <Dependent>
// CHECK-NEXT: | `-DeclRefExpr 0x{{[0-9A-Fa-f]+}} <col:42> 'int' NonTypeTemplateParm 0x{{[0-9A-Fa-f]+}} 'N' 'int'
-// CHECK-NEXT: `-TemplateSpecializationType 0x{{[0-9A-Fa-f]+}} '__make_integer_seq<type-parameter-0-1, N>' dependent __make_integer_seq
+// CHECK-NEXT: `-TemplateSpecializationType 0x{{[0-9A-Fa-f]+}} '__make_integer_seq<template-parameter-0-0, type-parameter-0-1, N>' dependent __make_integer_seq
// CHECK-NEXT: |-TemplateArgument template
// CHECK-NEXT: |-TemplateArgument type 'type-parameter-0-1'
// CHECK-NEXT: | `-TemplateTypeParmType 0x{{[0-9A-Fa-f]+}} 'type-parameter-0-1' dependent depth 0 index 1
diff --git a/clang/tools/clang-repl/CMakeLists.txt b/clang/tools/clang-repl/CMakeLists.txt
index d3dec19..4017b14 100644
--- a/clang/tools/clang-repl/CMakeLists.txt
+++ b/clang/tools/clang-repl/CMakeLists.txt
@@ -11,6 +11,49 @@ add_clang_tool(clang-repl
ClangRepl.cpp
)
+if(MSVC)
+ set_target_properties(clang-repl PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS 1)
+
+ # RTTI/C++ symbols
+ set(clang_repl_exports ${clang_repl_exports} ??_7type_info@@6B@
+ ?__type_info_root_node@@3U__type_info_node@@A
+ ?nothrow@std@@3Unothrow_t@1@B
+ )
+
+ # Compiler added symbols for static variables. NOT for VStudio < 2015
+ set(clang_repl_exports ${clang_repl_exports} _Init_thread_abort _Init_thread_epoch
+ _Init_thread_footer _Init_thread_header _tls_index
+ )
+
+ if(CMAKE_SIZEOF_VOID_P EQUAL 8)
+ # new/delete variants needed when linking to static msvc runtime (esp. Debug)
+ set(clang_repl_exports ${clang_repl_exports}
+ ??2@YAPEAX_K@Z
+ ??3@YAXPEAX@Z
+ ??_U@YAPEAX_K@Z
+ ??_V@YAXPEAX@Z
+ ??3@YAXPEAX_K@Z
+ )
+ else()
+ set(clang_repl_exports ${clang_repl_exports}
+ ??2@YAPAXI@Z
+ ??3@YAXPAX@Z
+ ??3@YAXPAXI@Z
+ ??_U@YAPAXI@Z
+ ??_V@YAXPAX@Z
+ ??_V@YAXPAXI@Z
+ )
+ endif()
+
+ # List to '/EXPORT:sym0 /EXPORT:sym1 /EXPORT:sym2 ...'
+ foreach(sym ${clang_repl_exports})
+ set(clang_repl_link_str "${clang_repl_link_str} /EXPORT:${sym}")
+ endforeach(sym ${clang_repl_exports})
+
+ set_property(TARGET clang-repl APPEND_STRING PROPERTY LINK_FLAGS ${clang_repl_link_str})
+
+endif(MSVC)
+
clang_target_link_libraries(clang-repl PRIVATE
clangAST
clangBasic
diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp
index f42af7e..036e57c8 100644
--- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp
+++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp
@@ -86,6 +86,8 @@ static bool DeprecatedDriverCommand;
static ResourceDirRecipeKind ResourceDirRecipe;
static bool Verbose;
static bool PrintTiming;
+static llvm::BumpPtrAllocator Alloc;
+static llvm::StringSaver Saver{Alloc};
static std::vector<const char *> CommandLine;
#ifndef NDEBUG
@@ -99,8 +101,6 @@ static bool RoundTripArgs = DoRoundTripDefault;
static void ParseArgs(int argc, char **argv) {
ScanDepsOptTable Tbl;
llvm::StringRef ToolName = argv[0];
- llvm::BumpPtrAllocator Alloc;
- llvm::StringSaver Saver{Alloc};
llvm::opt::InputArgList Args =
Tbl.parseArgs(argc, argv, OPT_UNKNOWN, Saver, [&](StringRef Msg) {
llvm::errs() << Msg << '\n';
@@ -792,6 +792,11 @@ int clang_scan_deps_main(int argc, char **argv, const llvm::ToolContext &) {
llvm::cl::PrintOptionValues();
+ // Expand response files in advance, so that we can "see" all the arguments
+ // when adjusting below.
+ Compilations = expandResponseFiles(std::move(Compilations),
+ llvm::vfs::getRealFileSystem());
+
// The command options are rewritten to run Clang in preprocessor only mode.
auto AdjustingCompilations =
std::make_unique<tooling::ArgumentsAdjustingCompilations>(
diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp
index f00ba9e..49ed60d 100644
--- a/clang/tools/libclang/CIndex.cpp
+++ b/clang/tools/libclang/CIndex.cpp
@@ -776,10 +776,9 @@ bool CursorVisitor::VisitTemplateTypeParmDecl(TemplateTypeParmDecl *D) {
}
// Visit the default argument.
- if (D->hasDefaultArgument() && !D->defaultArgumentWasInherited())
- if (TypeSourceInfo *DefArg = D->getDefaultArgumentInfo())
- if (Visit(DefArg->getTypeLoc()))
- return true;
+ if (D->hasDefaultArgument() && !D->defaultArgumentWasInherited() &&
+ VisitTemplateArgumentLoc(D->getDefaultArgument()))
+ return true;
return false;
}
@@ -946,8 +945,9 @@ bool CursorVisitor::VisitNonTypeTemplateParmDecl(NonTypeTemplateParmDecl *D) {
return true;
if (D->hasDefaultArgument() && !D->defaultArgumentWasInherited())
- if (Expr *DefArg = D->getDefaultArgument())
- return Visit(MakeCXCursor(DefArg, StmtParent, TU, RegionOfInterest));
+ if (D->hasDefaultArgument() &&
+ VisitTemplateArgumentLoc(D->getDefaultArgument()))
+ return true;
return false;
}
diff --git a/clang/tools/scan-build-py/tests/functional/exec/CMakeLists.txt b/clang/tools/scan-build-py/tests/functional/exec/CMakeLists.txt
index 95c6fdb..cb6ebda 100644
--- a/clang/tools/scan-build-py/tests/functional/exec/CMakeLists.txt
+++ b/clang/tools/scan-build-py/tests/functional/exec/CMakeLists.txt
@@ -2,11 +2,7 @@ project(exec C)
cmake_minimum_required(VERSION 3.20.0)
-include(CheckCCompilerFlag)
-check_c_compiler_flag("-std=c99" C99_SUPPORTED)
-if (C99_SUPPORTED)
- set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99")
-endif()
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99")
include(CheckFunctionExists)
include(CheckSymbolExists)
diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp
index 4ee64de..3dc1c33 100644
--- a/clang/unittests/AST/ASTImporterTest.cpp
+++ b/clang/unittests/AST/ASTImporterTest.cpp
@@ -1188,7 +1188,7 @@ TEST_P(ASTImporterOptionSpecificTestBase, TemplateTypeParmDeclDefaultArg) {
FromTU, templateTypeParmDecl(hasName("T")));
TemplateTypeParmDecl *To = Import(From, Lang_CXX03);
ASSERT_TRUE(To->hasDefaultArgument());
- QualType ToArg = To->getDefaultArgument();
+ QualType ToArg = To->getDefaultArgument().getArgument().getAsType();
ASSERT_EQ(ToArg, QualType(To->getASTContext().IntTy));
}
@@ -1260,7 +1260,7 @@ TEST_P(ASTImporterOptionSpecificTestBase, NonTypeTemplateParmDeclDefaultArg) {
FromTU, nonTypeTemplateParmDecl(hasName("S")));
NonTypeTemplateParmDecl *To = Import(From, Lang_CXX03);
ASSERT_TRUE(To->hasDefaultArgument());
- Stmt *ToArg = To->getDefaultArgument();
+ Stmt *ToArg = To->getDefaultArgument().getArgument().getAsExpr();
ASSERT_TRUE(isa<IntegerLiteral>(ToArg));
ASSERT_EQ(cast<IntegerLiteral>(ToArg)->getValue().getLimitedValue(), 1U);
}
diff --git a/clang/unittests/AST/DeclTest.cpp b/clang/unittests/AST/DeclTest.cpp
index 2530ce7..16aa2b5 100644
--- a/clang/unittests/AST/DeclTest.cpp
+++ b/clang/unittests/AST/DeclTest.cpp
@@ -545,3 +545,34 @@ TEST(Decl, TemplateArgumentDefaulted) {
EXPECT_TRUE(ArgList.get(2).getIsDefaulted());
EXPECT_TRUE(ArgList.get(3).getIsDefaulted());
}
+
+TEST(Decl, CXXDestructorDeclsShouldHaveWellFormedNameInfoRanges) {
+ // GH71161
+ llvm::Annotations Code(R"cpp(
+template <typename T> struct Resource {
+ ~Resource(); // 1
+};
+template <typename T>
+Resource<T>::~Resource() {} // 2,3
+
+void instantiate_template() {
+ Resource<int> x;
+}
+)cpp");
+
+ auto AST = tooling::buildASTFromCode(Code.code());
+ ASTContext &Ctx = AST->getASTContext();
+
+ const auto &SM = Ctx.getSourceManager();
+ auto GetNameInfoRange = [&SM](const BoundNodes &Match) {
+ const auto *D = Match.getNodeAs<CXXDestructorDecl>("dtor");
+ return D->getNameInfo().getSourceRange().printToString(SM);
+ };
+
+ auto Matches = match(findAll(cxxDestructorDecl().bind("dtor")),
+ *Ctx.getTranslationUnitDecl(), Ctx);
+ ASSERT_EQ(Matches.size(), 3U);
+ EXPECT_EQ(GetNameInfoRange(Matches[0]), "<input.cc:3:3, col:4>");
+ EXPECT_EQ(GetNameInfoRange(Matches[1]), "<input.cc:6:14, col:15>");
+ EXPECT_EQ(GetNameInfoRange(Matches[2]), "<input.cc:6:14, col:15>");
+}
diff --git a/clang/unittests/Driver/DXCModeTest.cpp b/clang/unittests/Driver/DXCModeTest.cpp
index 416723d..41ab30b 100644
--- a/clang/unittests/Driver/DXCModeTest.cpp
+++ b/clang/unittests/Driver/DXCModeTest.cpp
@@ -156,9 +156,10 @@ TEST(DxcModeTest, ValidatorVersionValidation) {
TranslatedArgs.reset(
TC.TranslateArgs(*DAL, "0", Action::OffloadKind::OFK_None));
EXPECT_EQ(Diags.getNumErrors(), 1u);
- EXPECT_STREQ(DiagConsumer->Errors.back().c_str(),
- "invalid validator version : 0.1\nIf validator major version is "
- "0, minor version must also be 0.");
+ EXPECT_STREQ(
+ DiagConsumer->Errors.back().c_str(),
+ "invalid validator version : 0.1; if validator major version is 0, "
+ "minor version must also be 0");
Diags.Clear();
DiagConsumer->clear();
@@ -173,8 +174,8 @@ TEST(DxcModeTest, ValidatorVersionValidation) {
TC.TranslateArgs(*DAL, "0", Action::OffloadKind::OFK_None));
EXPECT_EQ(Diags.getNumErrors(), 2u);
EXPECT_STREQ(DiagConsumer->Errors.back().c_str(),
- "invalid validator version : 1\nFormat of validator version is "
- "\"<major>.<minor>\" (ex:\"1.4\").");
+ "invalid validator version : 1; format of validator version is "
+ "\"<major>.<minor>\" (ex:\"1.4\")");
Diags.Clear();
DiagConsumer->clear();
@@ -190,8 +191,8 @@ TEST(DxcModeTest, ValidatorVersionValidation) {
EXPECT_EQ(Diags.getNumErrors(), 3u);
EXPECT_STREQ(
DiagConsumer->Errors.back().c_str(),
- "invalid validator version : -Tlib_6_7\nFormat of validator version is "
- "\"<major>.<minor>\" (ex:\"1.4\").");
+ "invalid validator version : -Tlib_6_7; format of validator version is "
+ "\"<major>.<minor>\" (ex:\"1.4\")");
Diags.Clear();
DiagConsumer->clear();
@@ -207,8 +208,8 @@ TEST(DxcModeTest, ValidatorVersionValidation) {
EXPECT_EQ(Diags.getNumErrors(), 4u);
EXPECT_STREQ(
DiagConsumer->Errors.back().c_str(),
- "invalid validator version : foo\nFormat of validator version is "
- "\"<major>.<minor>\" (ex:\"1.4\").");
+ "invalid validator version : foo; format of validator version is "
+ "\"<major>.<minor>\" (ex:\"1.4\")");
Diags.Clear();
DiagConsumer->clear();
}
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index 2f0c0f0..a9df994 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -17340,12 +17340,14 @@ TEST_F(FormatTest, ConfigurableSpaceBeforeAssignmentOperators) {
verifyFormat("int a = 5;");
verifyFormat("a += 42;");
verifyFormat("a or_eq 8;");
+ verifyFormat("xor = foo;");
FormatStyle Spaces = getLLVMStyle();
Spaces.SpaceBeforeAssignmentOperators = false;
verifyFormat("int a= 5;", Spaces);
verifyFormat("a+= 42;", Spaces);
verifyFormat("a or_eq 8;", Spaces);
+ verifyFormat("xor= foo;", Spaces);
}
TEST_F(FormatTest, ConfigurableSpaceBeforeColon) {
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index 45c1554..6ea9c4a 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -3015,6 +3015,60 @@ TEST_F(TokenAnnotatorTest, SwitchExpression) {
EXPECT_TOKEN(Tokens[20], tok::arrow, TT_CaseLabelArrow);
}
+TEST_F(TokenAnnotatorTest, CppAltOperatorKeywords) {
+ auto Tokens = annotate("a = b and c;");
+ ASSERT_EQ(Tokens.size(), 7u);
+ EXPECT_TOKEN(Tokens[3], tok::ampamp, TT_BinaryOperator);
+
+ Tokens = annotate("a = b and_eq c;");
+ ASSERT_EQ(Tokens.size(), 7u);
+ EXPECT_TOKEN(Tokens[3], tok::ampequal, TT_BinaryOperator);
+
+ Tokens = annotate("a = b bitand c;");
+ ASSERT_EQ(Tokens.size(), 7u);
+ EXPECT_TOKEN(Tokens[3], tok::amp, TT_BinaryOperator);
+
+ Tokens = annotate("a = b bitor c;");
+ ASSERT_EQ(Tokens.size(), 7u);
+ EXPECT_TOKEN(Tokens[3], tok::pipe, TT_BinaryOperator);
+
+ Tokens = annotate("a = b compl c;");
+ ASSERT_EQ(Tokens.size(), 7u);
+ EXPECT_TOKEN(Tokens[3], tok::tilde, TT_UnaryOperator);
+
+ Tokens = annotate("a = b not c;");
+ ASSERT_EQ(Tokens.size(), 7u);
+ EXPECT_TOKEN(Tokens[3], tok::exclaim, TT_UnaryOperator);
+
+ Tokens = annotate("a = b not_eq c;");
+ ASSERT_EQ(Tokens.size(), 7u);
+ EXPECT_TOKEN(Tokens[3], tok::exclaimequal, TT_BinaryOperator);
+
+ Tokens = annotate("a = b or c;");
+ ASSERT_EQ(Tokens.size(), 7u);
+ EXPECT_TOKEN(Tokens[3], tok::pipepipe, TT_BinaryOperator);
+
+ Tokens = annotate("a = b or_eq c;");
+ ASSERT_EQ(Tokens.size(), 7u);
+ EXPECT_TOKEN(Tokens[3], tok::pipeequal, TT_BinaryOperator);
+
+ Tokens = annotate("a = b xor c;");
+ ASSERT_EQ(Tokens.size(), 7u);
+ EXPECT_TOKEN(Tokens[3], tok::caret, TT_BinaryOperator);
+
+ Tokens = annotate("a = b xor_eq c;");
+ ASSERT_EQ(Tokens.size(), 7u);
+ EXPECT_TOKEN(Tokens[3], tok::caretequal, TT_BinaryOperator);
+
+ Tokens = annotate("xor = foo;");
+ ASSERT_EQ(Tokens.size(), 5u);
+ EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown);
+
+ Tokens = annotate("int xor = foo;");
+ ASSERT_EQ(Tokens.size(), 6u);
+ EXPECT_TOKEN(Tokens[1], tok::identifier, TT_StartOfName);
+}
+
} // namespace
} // namespace format
} // namespace clang
diff --git a/clang/unittests/Interpreter/CMakeLists.txt b/clang/unittests/Interpreter/CMakeLists.txt
index e5a77e7..c0fd2d8 100644
--- a/clang/unittests/Interpreter/CMakeLists.txt
+++ b/clang/unittests/Interpreter/CMakeLists.txt
@@ -29,3 +29,46 @@ if(NOT WIN32)
endif()
export_executable_symbols(ClangReplInterpreterTests)
+
+if(MSVC)
+ set_target_properties(ClangReplInterpreterTests PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS 1)
+
+ # RTTI/C++ symbols
+ set(ClangReplInterpreterTests_exports ${ClangReplInterpreterTests_exports} ??_7type_info@@6B@
+ ?__type_info_root_node@@3U__type_info_node@@A
+ ?nothrow@std@@3Unothrow_t@1@B
+ )
+
+ # Compiler added symbols for static variables. NOT for VStudio < 2015
+ set(ClangReplInterpreterTests_exports ${ClangReplInterpreterTests_exports} _Init_thread_abort _Init_thread_epoch
+ _Init_thread_footer _Init_thread_header _tls_index
+ )
+
+ if(CMAKE_SIZEOF_VOID_P EQUAL 8)
+ # new/delete variants needed when linking to static msvc runtime (esp. Debug)
+ set(ClangReplInterpreterTests_exports ${ClangReplInterpreterTests_exports}
+ ??2@YAPEAX_K@Z
+ ??3@YAXPEAX@Z
+ ??_U@YAPEAX_K@Z
+ ??_V@YAXPEAX@Z
+ ??3@YAXPEAX_K@Z
+ )
+ else()
+ set(ClangReplInterpreterTests_exports ${ClangReplInterpreterTests_exports}
+ ??2@YAPAXI@Z
+ ??3@YAXPAX@Z
+ ??3@YAXPAXI@Z
+ ??_U@YAPAXI@Z
+ ??_V@YAXPAX@Z
+ ??_V@YAXPAXI@Z
+ )
+ endif()
+
+ # List to '/EXPORT:sym0 /EXPORT:sym1 /EXPORT:sym2 ...'
+ foreach(sym ${ClangReplInterpreterTests_exports})
+ set(ClangReplInterpreterTests_link_str "${ClangReplInterpreterTests_link_str} /EXPORT:${sym}")
+ endforeach(sym ${ClangReplInterpreterTests_exports})
+
+ set_property(TARGET ClangReplInterpreterTests APPEND_STRING PROPERTY LINK_FLAGS ${ClangReplInterpreterTests_link_str})
+
+endif(MSVC)
diff --git a/clang/unittests/StaticAnalyzer/CallEventTest.cpp b/clang/unittests/StaticAnalyzer/CallEventTest.cpp
index adbfe02..7c41327 100644
--- a/clang/unittests/StaticAnalyzer/CallEventTest.cpp
+++ b/clang/unittests/StaticAnalyzer/CallEventTest.cpp
@@ -76,7 +76,7 @@ TEST(CXXDeallocatorCall, SimpleDestructor) {
}
)",
Diags));
- EXPECT_EQ(Diags, "test.CXXDeallocator: NumArgs: 1\n");
+ EXPECT_EQ(Diags, "test.CXXDeallocator: NumArgs: 2\n");
}
} // namespace
diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp
index 3ddfd32..e77d806 100644
--- a/clang/utils/TableGen/SveEmitter.cpp
+++ b/clang/utils/TableGen/SveEmitter.cpp
@@ -1385,17 +1385,14 @@ void SVEEmitter::createHeader(raw_ostream &OS) {
SVEType ToV(To.BaseType, N);
for (const ReinterpretTypeInfo &From : Reinterprets) {
SVEType FromV(From.BaseType, N);
- if (ShortForm) {
- OS << "__aio __attribute__((target(\"sve\"))) " << ToV.str()
- << " svreinterpret_" << To.Suffix;
- OS << "(" << FromV.str() << " op) __arm_streaming_compatible {\n";
- OS << " return __builtin_sve_reinterpret_" << To.Suffix << "_"
- << From.Suffix << Suffix << "(op);\n";
- OS << "}\n\n";
- } else
- OS << "#define svreinterpret_" << To.Suffix << "_" << From.Suffix
- << Suffix << "(...) __builtin_sve_reinterpret_" << To.Suffix
- << "_" << From.Suffix << Suffix << "(__VA_ARGS__)\n";
+ OS << "__aio "
+ "__attribute__((__clang_arm_builtin_alias(__builtin_sve_"
+ "reinterpret_"
+ << To.Suffix << "_" << From.Suffix << Suffix << ")))\n"
+ << ToV.str() << " svreinterpret_" << To.Suffix;
+ if (!ShortForm)
+ OS << "_" << From.Suffix << Suffix;
+ OS << "(" << FromV.str() << " op);\n";
}
}
}
@@ -1453,7 +1450,7 @@ void SVEEmitter::createBuiltins(raw_ostream &OS) {
SVEType FromV(From.BaseType, N);
OS << "TARGET_BUILTIN(__builtin_sve_reinterpret_" << To.Suffix << "_"
<< From.Suffix << Suffix << +", \"" << ToV.builtin_str()
- << FromV.builtin_str() << "\", \"n\", \"sve\")\n";
+ << FromV.builtin_str() << "\", \"n\", \"sme|sve\")\n";
}
}
}
diff --git a/clang/utils/analyzer/entrypoint.py b/clang/utils/analyzer/entrypoint.py
index ff877060b..4deb42d 100644
--- a/clang/utils/analyzer/entrypoint.py
+++ b/clang/utils/analyzer/entrypoint.py
@@ -54,7 +54,7 @@ CMAKE_COMMAND = (
"cmake -G Ninja -DCMAKE_BUILD_TYPE=Release "
"-DCMAKE_INSTALL_PREFIX=/analyzer -DLLVM_TARGETS_TO_BUILD=X86 "
'-DLLVM_ENABLE_PROJECTS="clang;openmp" -DLLVM_BUILD_RUNTIME=OFF '
- "-DLLVM_ENABLE_TERMINFO=OFF -DCLANG_ENABLE_ARCMT=OFF "
+ "-DCLANG_ENABLE_ARCMT=OFF "
"-DCLANG_ENABLE_STATIC_ANALYZER=ON"
)
diff --git a/clang/utils/ci/buildkite-pipeline.yml b/clang/utils/ci/buildkite-pipeline.yml
index 7a67917..86cfcf3 100644
--- a/clang/utils/ci/buildkite-pipeline.yml
+++ b/clang/utils/ci/buildkite-pipeline.yml
@@ -17,18 +17,7 @@ env:
# LLVM RELEASE bump version
LLVM_HEAD_VERSION: "17"
steps:
- - label: "Format"
- commands:
- - "clang/utils/ci/run-buildbot check-format"
- agents:
- queue: "linux"
- retry:
- automatic:
- - exit_status: -1 # Agent was lost
- limit: 2
- timeout_in_minutes: 120
-
- - label: "Building and testing clang (Linux)"
+ - label: "Building Clang (Linux)"
commands:
- "clang/utils/ci/run-buildbot build-clang"
agents:
@@ -39,21 +28,9 @@ steps:
limit: 2
timeout_in_minutes: 120
- - label: "Building and testing clang (Windows)"
- commands:
- - "C:\\BuildTools\\Common7\\Tools\\VsDevCmd.bat -arch=amd64 -host_arch=amd64"
- - "bash clang/utils/ci/run-buildbot build-clang-windows"
- agents:
- queue: "windows"
- retry:
- automatic:
- - exit_status: -1 # Agent was lost
- limit: 2
- timeout_in_minutes: 120
-
- wait
- - label: "Running libc++ test suite in C++03"
+ - label: "Testing libc++ with just-built Clang (C++03)"
commands:
- "clang/utils/ci/run-buildbot generic-cxx03"
artifact_paths:
@@ -70,7 +47,7 @@ steps:
limit: 2
timeout_in_minutes: 120
- - label: "Running libc++ test suite in C++26"
+ - label: "Testing libc++ with just-built Clang (C++26)"
commands:
- "clang/utils/ci/run-buildbot generic-cxx26"
artifact_paths:
@@ -87,7 +64,7 @@ steps:
limit: 2
timeout_in_minutes: 120
- - label: "Running libc++ test suite with Clang Modules"
+ - label: "Testing libc++ with just-built Clang (w/ Clang Modules)"
commands:
- "clang/utils/ci/run-buildbot generic-modules"
artifact_paths:
diff --git a/clang/utils/ci/run-buildbot b/clang/utils/ci/run-buildbot
index f47ffb5..c68ddad 100755
--- a/clang/utils/ci/run-buildbot
+++ b/clang/utils/ci/run-buildbot
@@ -69,13 +69,6 @@ cmake --version
ninja --version
case "${BUILDER}" in
-check-format)
- echo "*** Checking for trailing whitespace left in Clang source files ***"
- if grep -rnI '[[:blank:]]$' clang/lib clang/include clang/docs; then
- echo "*** Trailing whitespace has been found in Clang source files as described above ***"
- exit 1
- fi
-;;
build-clang)
mkdir install
# We use Release here to avoid including debug information. Otherwise, the
@@ -90,29 +83,13 @@ build-clang)
-DCMAKE_CXX_COMPILER_LAUNCHER="ccache" \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_INSTALL_PREFIX=install \
+ -DLLVM_TARGETS_TO_BUILD=Native \
-DLLVM_ENABLE_PROJECTS="clang;compiler-rt" \
ninja -C ${BUILD_DIR} install-clang install-clang-resource-headers
ccache -s
tar -cJvf install.tar.xz install/
buildkite-agent artifact upload --debug install.tar.xz
-
- ninja -C ${BUILD_DIR} check-clang
-;;
-build-clang-windows)
- cmake -S llvm -B ${BUILD_DIR} -G Ninja \
- -D CMAKE_C_COMPILER_LAUNCHER=sccache \
- -D CMAKE_CXX_COMPILER_LAUNCHER=sccache \
- -D CMAKE_BUILD_TYPE=Release \
- -D CMAKE_INSTALL_PREFIX=install-windows \
- -D LLVM_ENABLE_PROJECTS="clang;compiler-rt" \
- -D LLVM_ENABLE_ASSERTIONS=ON \
- -D LLVM_BUILD_EXAMPLES=ON \
- -D COMPILER_RT_BUILD_LIBFUZZER=OFF \
- -D COMPILER_RT_BUILD_ORC=OFF
-
- ninja -C ${BUILD_DIR} install-clang install-clang-resource-headers
- ninja -C ${BUILD_DIR} check-clang
;;
generic-cxx03)
buildkite-agent artifact download install.tar.xz .
diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html
index 9d45833..4cce88f 100755
--- a/clang/www/cxx_dr_status.html
+++ b/clang/www/cxx_dr_status.html
@@ -10698,7 +10698,7 @@ and <I>POD class</I></td>
<td><a href="https://cplusplus.github.io/CWG/issues/1815.html">1815</a></td>
<td>CD4</td>
<td>Lifetime extension in aggregate initialization</td>
- <td class="none" align="center">No</td>
+ <td class="unreleased" align="center">Clang 19</td>
</tr>
<tr id="1816">
<td><a href="https://cplusplus.github.io/CWG/issues/1816.html">1816</a></td>
@@ -17095,7 +17095,7 @@ objects</td>
<td><a href="https://cplusplus.github.io/CWG/issues/2881.html">2881</a></td>
<td>tentatively ready</td>
<td>Type restrictions for the explicit object parameter of a lambda</td>
- <td align="center">Not resolved</td>
+ <td title="Clang 19 implements 2024-04-19 resolution" align="center">Not Resolved*</td>
</tr>
<tr class="open" id="2882">
<td><a href="https://cplusplus.github.io/CWG/issues/2882.html">2882</a></td>
diff --git a/clang/www/cxx_status.html b/clang/www/cxx_status.html
index a11bf9a0..4541617 100755
--- a/clang/www/cxx_status.html
+++ b/clang/www/cxx_status.html
@@ -1255,12 +1255,11 @@ version.
</table>
<p>
-<span id="n3778">(7): In Clang 3.7 and later, sized deallocation is only enabled
-if the user passes the <code>-fsized-deallocation</code> flag. The user must
-supply definitions of the sized deallocation functions, either by providing them
-explicitly or by using a C++ standard library that does. <code>libstdc++</code>
-added these functions in version 5.0, and <code>libc++</code> added them in
-version 3.7.
+<span id="n3778">(7): The user must supply definitions of the sized deallocation
+ functions, either by providing them explicitly or by using a C++ standard library
+ that does. <code>libstdc++</code> added these functions in version 5.0, and
+ <code>libc++</code> added them in version 3.7. The user can also use the
+ <code>-fno-sized-deallocation</code> option to disable sized deallocation.
</span>
</p>
</details>
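The cxx_status.html note above states the sized-deallocation requirement in prose; as a minimal, self-contained sketch (not part of this patch, all names illustrative) the user-supplied functions it refers to could look like:

    #include <cstddef>
    #include <cstdlib>
    #include <new>

    // Globally replacing operator new/delete; the sized overload lets the
    // allocator avoid looking up the allocation size on its own.
    void *operator new(std::size_t n) {
      if (void *p = std::malloc(n ? n : 1))
        return p;
      throw std::bad_alloc{};
    }
    void operator delete(void *p) noexcept { std::free(p); }
    void operator delete(void *p, std::size_t /*size*/) noexcept { std::free(p); }

    int main() {
      int *q = new int{7};
      delete q; // with sized deallocation enabled, the sized overload may be selected
      return 0;
    }

As the updated note says, passing -fno-sized-deallocation makes the compiler call only the unsized overload instead.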
diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake
index 42edbe1..bddaa37 100644
--- a/compiler-rt/cmake/config-ix.cmake
+++ b/compiler-rt/cmake/config-ix.cmake
@@ -182,21 +182,6 @@ check_library_exists(m pow "" COMPILER_RT_HAS_LIBM)
check_library_exists(pthread pthread_create "" COMPILER_RT_HAS_LIBPTHREAD)
check_library_exists(execinfo backtrace "" COMPILER_RT_HAS_LIBEXECINFO)
-# Look for terminfo library, used in unittests that depend on LLVMSupport.
-if(LLVM_ENABLE_TERMINFO STREQUAL FORCE_ON)
- set(MAYBE_REQUIRED REQUIRED)
-else()
- set(MAYBE_REQUIRED)
-endif()
-if(LLVM_ENABLE_TERMINFO)
- find_library(COMPILER_RT_TERMINFO_LIB NAMES terminfo tinfo curses ncurses ncursesw ${MAYBE_REQUIRED})
-endif()
-if(COMPILER_RT_TERMINFO_LIB)
- set(LLVM_ENABLE_TERMINFO 1)
-else()
- set(LLVM_ENABLE_TERMINFO 0)
-endif()
-
if (ANDROID AND COMPILER_RT_HAS_LIBDL)
# Android's libstdc++ has a dependency on libdl.
list(APPEND CMAKE_REQUIRED_LIBRARIES dl)
diff --git a/compiler-rt/lib/dfsan/dfsan_allocator.cpp b/compiler-rt/lib/dfsan/dfsan_allocator.cpp
index 63475f4..682df8c 100644
--- a/compiler-rt/lib/dfsan/dfsan_allocator.cpp
+++ b/compiler-rt/lib/dfsan/dfsan_allocator.cpp
@@ -45,7 +45,7 @@ const uptr kAllocatorSpace = 0xE00000000000ULL;
#else
const uptr kAllocatorSpace = 0x700000000000ULL;
#endif
-const uptr kMaxAllowedMallocSize = 8UL << 30;
+const uptr kMaxAllowedMallocSize = 1ULL << 40;
struct AP64 { // Allocator64 parameters. Deliberately using a short name.
static const uptr kSpaceBeg = kAllocatorSpace;
diff --git a/compiler-rt/lib/dfsan/dfsan_custom.cpp b/compiler-rt/lib/dfsan/dfsan_custom.cpp
index 3af26e9..af3c1f4 100644
--- a/compiler-rt/lib/dfsan/dfsan_custom.cpp
+++ b/compiler-rt/lib/dfsan/dfsan_custom.cpp
@@ -1901,17 +1901,27 @@ SANITIZER_INTERFACE_ATTRIBUTE int __dfso_nanosleep(
return __dfsw_nanosleep(req, rem, req_label, rem_label, ret_label);
}
-static void clear_msghdr_labels(size_t bytes_written, struct msghdr *msg) {
+static void clear_msghdr_labels(size_t bytes_written, struct msghdr *msg,
+ int flags) {
dfsan_set_label(0, msg, sizeof(*msg));
dfsan_set_label(0, msg->msg_name, msg->msg_namelen);
dfsan_set_label(0, msg->msg_control, msg->msg_controllen);
- for (size_t i = 0; bytes_written > 0; ++i) {
- assert(i < msg->msg_iovlen);
+ for (size_t i = 0; i < msg->msg_iovlen; ++i) {
struct iovec *iov = &msg->msg_iov[i];
- size_t iov_written =
- bytes_written < iov->iov_len ? bytes_written : iov->iov_len;
+ size_t iov_written = iov->iov_len;
+
+ // When MSG_TRUNC is not set, we want to avoid setting 0 label on bytes that
+ // may not have changed, using bytes_written to bound the 0 label write.
+ // When MSG_TRUNC flag is set, bytes_written may be larger than the buffer,
+ // and should not be used as a bound.
+ if (!(MSG_TRUNC & flags)) {
+ if (bytes_written < iov->iov_len) {
+ iov_written = bytes_written;
+ }
+ bytes_written -= iov_written;
+ }
+
dfsan_set_label(0, iov->iov_base, iov_written);
- bytes_written -= iov_written;
}
}
@@ -1923,7 +1933,7 @@ SANITIZER_INTERFACE_ATTRIBUTE int __dfsw_recvmmsg(
int ret = recvmmsg(sockfd, msgvec, vlen, flags, timeout);
for (int i = 0; i < ret; ++i) {
dfsan_set_label(0, &msgvec[i].msg_len, sizeof(msgvec[i].msg_len));
- clear_msghdr_labels(msgvec[i].msg_len, &msgvec[i].msg_hdr);
+ clear_msghdr_labels(msgvec[i].msg_len, &msgvec[i].msg_hdr, flags);
}
*ret_label = 0;
return ret;
@@ -1947,7 +1957,7 @@ SANITIZER_INTERFACE_ATTRIBUTE ssize_t __dfsw_recvmsg(
dfsan_label msg_label, dfsan_label flags_label, dfsan_label *ret_label) {
ssize_t ret = recvmsg(sockfd, msg, flags);
if (ret >= 0)
- clear_msghdr_labels(ret, msg);
+ clear_msghdr_labels(ret, msg, flags);
*ret_label = 0;
return ret;
}
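The new comment in clear_msghdr_labels explains why bytes_written is only a valid bound when MSG_TRUNC is absent; a standalone sketch of that bounding rule (illustrative only, not the DFSan implementation itself) is:

    #include <sys/socket.h> // MSG_TRUNC
    #include <cstddef>
    #include <vector>

    // For each iovec length, return how many bytes should be treated as written.
    // Without MSG_TRUNC, bytes_written caps the per-iovec count; with MSG_TRUNC,
    // the kernel may report more bytes than fit in the buffers, so the full
    // buffer length is used instead.
    std::vector<std::size_t> writtenPerIov(const std::vector<std::size_t> &iovLens,
                                           std::size_t bytesWritten, int flags) {
      std::vector<std::size_t> result;
      for (std::size_t len : iovLens) {
        std::size_t written = len;
        if (!(flags & MSG_TRUNC)) {
          if (bytesWritten < len)
            written = bytesWritten;
          bytesWritten -= written;
        }
        result.push_back(written);
      }
      return result;
    }

    int main() {
      // 5 bytes received into two 4-byte buffers: 4 then 1 byte count as written.
      auto counts = writtenPerIov({4, 4}, 5, 0);
      return (counts[0] == 4 && counts[1] == 1) ? 0 : 1;
    }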
diff --git a/compiler-rt/lib/lsan/lsan_allocator.cpp b/compiler-rt/lib/lsan/lsan_allocator.cpp
index 12d579a..493bf5f 100644
--- a/compiler-rt/lib/lsan/lsan_allocator.cpp
+++ b/compiler-rt/lib/lsan/lsan_allocator.cpp
@@ -31,7 +31,7 @@ static const uptr kMaxAllowedMallocSize = 1ULL << 30;
#elif defined(__mips64) || defined(__aarch64__)
static const uptr kMaxAllowedMallocSize = 4ULL << 30;
#else
-static const uptr kMaxAllowedMallocSize = 8ULL << 30;
+static const uptr kMaxAllowedMallocSize = 1ULL << 40;
#endif
static Allocator allocator;
diff --git a/compiler-rt/lib/msan/msan_allocator.cpp b/compiler-rt/lib/msan/msan_allocator.cpp
index b1bc5b9..8350106 100644
--- a/compiler-rt/lib/msan/msan_allocator.cpp
+++ b/compiler-rt/lib/msan/msan_allocator.cpp
@@ -71,7 +71,7 @@ static const uptr kAllocatorSpace = 0x700000000000ULL;
#else
static const uptr kAllocatorSpace = 0x600000000000ULL;
#endif
-static const uptr kMaxAllowedMallocSize = 8UL << 30;
+static const uptr kMaxAllowedMallocSize = 1ULL << 40;
struct AP64 { // Allocator64 parameters. Deliberately using a short name.
static const uptr kSpaceBeg = kAllocatorSpace;
diff --git a/compiler-rt/lib/sanitizer_common/symbolizer/scripts/build_symbolizer.sh b/compiler-rt/lib/sanitizer_common/symbolizer/scripts/build_symbolizer.sh
index 005bd6d..b4702339 100755
--- a/compiler-rt/lib/sanitizer_common/symbolizer/scripts/build_symbolizer.sh
+++ b/compiler-rt/lib/sanitizer_common/symbolizer/scripts/build_symbolizer.sh
@@ -139,7 +139,6 @@ if [[ ! -f ${LLVM_BUILD}/build.ninja ]]; then
-DLLVM_INCLUDE_TESTS=OFF \
-DLLVM_ENABLE_ZLIB=ON \
-DLLVM_ENABLE_ZSTD=OFF \
- -DLLVM_ENABLE_TERMINFO=OFF \
-DLLVM_ENABLE_THREADS=OFF \
$LLVM_SRC
fi
diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h
index 15a199a..f9ed365 100644
--- a/compiler-rt/lib/scudo/standalone/combined.h
+++ b/compiler-rt/lib/scudo/standalone/combined.h
@@ -1052,6 +1052,10 @@ private:
void *Block, const uptr UserPtr,
const uptr SizeOrUnusedBytes,
const FillContentsMode FillContents) {
+ // Compute the default pointer before adding the header tag
+ const uptr DefaultAlignedPtr =
+ reinterpret_cast<uptr>(Block) + Chunk::getHeaderSize();
+
Block = addHeaderTag(Block);
// Only do content fill when it's from primary allocator because secondary
// allocator has filled the content.
@@ -1064,8 +1068,6 @@ private:
Chunk::UnpackedHeader Header = {};
- const uptr DefaultAlignedPtr =
- reinterpret_cast<uptr>(Block) + Chunk::getHeaderSize();
if (UNLIKELY(DefaultAlignedPtr != UserPtr)) {
const uptr Offset = UserPtr - DefaultAlignedPtr;
DCHECK_GE(Offset, 2 * sizeof(u32));
@@ -1096,6 +1098,10 @@ private:
const Options Options = Primary.Options.load();
DCHECK(useMemoryTagging<AllocatorConfig>(Options));
+ // Compute the default pointer before adding the header tag
+ const uptr DefaultAlignedPtr =
+ reinterpret_cast<uptr>(Block) + Chunk::getHeaderSize();
+
void *Ptr = reinterpret_cast<void *>(UserPtr);
void *TaggedPtr = Ptr;
@@ -1194,8 +1200,6 @@ private:
Chunk::UnpackedHeader Header = {};
- const uptr DefaultAlignedPtr =
- reinterpret_cast<uptr>(Block) + Chunk::getHeaderSize();
if (UNLIKELY(DefaultAlignedPtr != UserPtr)) {
const uptr Offset = UserPtr - DefaultAlignedPtr;
DCHECK_GE(Offset, 2 * sizeof(u32));
diff --git a/compiler-rt/lib/xray/tests/CMakeLists.txt b/compiler-rt/lib/xray/tests/CMakeLists.txt
index 0a428b9..4c7e92b 100644
--- a/compiler-rt/lib/xray/tests/CMakeLists.txt
+++ b/compiler-rt/lib/xray/tests/CMakeLists.txt
@@ -54,11 +54,6 @@ set(XRAY_UNITTEST_LINK_FLAGS
${COMPILER_RT_CXX_LINK_LIBS})
if (NOT APPLE)
- # Needed by LLVMSupport.
- append_list_if(
- LLVM_ENABLE_TERMINFO
- -l${COMPILER_RT_TERMINFO_LIB} XRAY_UNITTEST_LINK_FLAGS)
-
# We add the library directories one at a time in our CFLAGS.
foreach (DIR ${LLVM_LIBRARY_DIR})
list(APPEND XRAY_UNITTEST_LINK_FLAGS -L${DIR})
diff --git a/compiler-rt/test/asan/TestCases/Windows/bitfield_uaf.cpp b/compiler-rt/test/asan/TestCases/Windows/bitfield_uaf.cpp
index 12ed505..ac3649a 100644
--- a/compiler-rt/test/asan/TestCases/Windows/bitfield_uaf.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/bitfield_uaf.cpp
@@ -24,10 +24,10 @@ int main(void) {
// CHECK: [[ADDR]] is located 0 bytes inside of 4-byte region
// CHECK-LABEL: freed by thread T0 here:
// CHECK: {{#0 .* free }}
- // CHECK: {{ #[1-2] .* main .*bitfield_uaf.cpp}}:[[@LINE-4]]
+ // CHECK: {{ #[1-3] .* main .*bitfield_uaf.cpp}}:[[@LINE-4]]
// CHECK-LABEL: previously allocated by thread T0 here:
// CHECK: {{#0 .* malloc }}
- // CHECK: {{ #[1-2] .* main .*bitfield_uaf.cpp}}:[[@LINE-8]]
+ // CHECK: {{ #[1-3] .* main .*bitfield_uaf.cpp}}:[[@LINE-8]]
make_access(s);
return 0;
}
diff --git a/compiler-rt/test/asan/TestCases/Windows/calloc_left_oob.cpp b/compiler-rt/test/asan/TestCases/Windows/calloc_left_oob.cpp
index e96fb61..e71ffdb 100644
--- a/compiler-rt/test/asan/TestCases/Windows/calloc_left_oob.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/calloc_left_oob.cpp
@@ -12,6 +12,6 @@ int main() {
// CHECK: [[ADDR]] is located 4 bytes before 168-byte region
// CHECK: allocated by thread T0 here:
// CHECK: {{#0 .* calloc }}
- // CHECK: {{ #[1-2] .* main .*calloc_left_oob.cpp}}:[[@LINE-8]]
+ // CHECK: {{ #[1-3] .* main .*calloc_left_oob.cpp}}:[[@LINE-8]]
free(buffer);
}
diff --git a/compiler-rt/test/asan/TestCases/Windows/calloc_right_oob.cpp b/compiler-rt/test/asan/TestCases/Windows/calloc_right_oob.cpp
index fe0fc20..507d844 100644
--- a/compiler-rt/test/asan/TestCases/Windows/calloc_right_oob.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/calloc_right_oob.cpp
@@ -12,6 +12,6 @@ int main() {
// CHECK: [[ADDR]] is located 0 bytes after 168-byte region
// CHECK: allocated by thread T0 here:
// CHECK-NEXT: {{#0 .* calloc }}
- // CHECK: {{ #[1-2] .* main .*calloc_right_oob.cpp}}:[[@LINE-8]]
+ // CHECK: {{ #[1-3] .* main .*calloc_right_oob.cpp}}:[[@LINE-8]]
free(buffer);
}
diff --git a/compiler-rt/test/asan/TestCases/Windows/calloc_uaf.cpp b/compiler-rt/test/asan/TestCases/Windows/calloc_uaf.cpp
index bf13f7d..a03c5e1 100644
--- a/compiler-rt/test/asan/TestCases/Windows/calloc_uaf.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/calloc_uaf.cpp
@@ -13,8 +13,8 @@ int main() {
// CHECK: [[ADDR]] is located 0 bytes inside of 168-byte region
// CHECK: freed by thread T0 here:
// CHECK-NEXT: {{#0 .* free }}
- // CHECK: {{ #[1-2] .* main .*calloc_uaf.cpp}}:[[@LINE-8]]
+ // CHECK: {{ #[1-3] .* main .*calloc_uaf.cpp}}:[[@LINE-8]]
// CHECK: previously allocated by thread T0 here:
// CHECK-NEXT: {{#0 .* calloc }}
- // CHECK: {{ #[1-2] .* main .*calloc_uaf.cpp}}:[[@LINE-12]]
+ // CHECK: {{ #[1-3] .* main .*calloc_uaf.cpp}}:[[@LINE-12]]
}
diff --git a/compiler-rt/test/dfsan/custom.cpp b/compiler-rt/test/dfsan/custom.cpp
index f544e48..cede0d6 100644
--- a/compiler-rt/test/dfsan/custom.cpp
+++ b/compiler-rt/test/dfsan/custom.cpp
@@ -768,26 +768,53 @@ void test_recvmsg() {
ssize_t sent = sendmsg(sockfds[0], &smsg, 0);
assert(sent > 0);
- char rbuf[128];
- struct iovec riovs[2] = {{&rbuf[0], 4}, {&rbuf[4], 4}};
- struct msghdr rmsg = {};
- rmsg.msg_iov = riovs;
- rmsg.msg_iovlen = 2;
-
- dfsan_set_label(i_label, rbuf, sizeof(rbuf));
- dfsan_set_label(i_label, &rmsg, sizeof(rmsg));
-
- DEFINE_AND_SAVE_ORIGINS(rmsg)
-
- ssize_t received = recvmsg(sockfds[1], &rmsg, 0);
- assert(received == sent);
- assert(memcmp(sbuf, rbuf, 8) == 0);
- ASSERT_ZERO_LABEL(received);
- ASSERT_READ_ZERO_LABEL(&rmsg, sizeof(rmsg));
- ASSERT_READ_ZERO_LABEL(&rbuf[0], 8);
- ASSERT_READ_LABEL(&rbuf[8], 1, i_label);
-
- ASSERT_SAVED_ORIGINS(rmsg)
+ {
+ char rpbuf[2];
+ struct iovec peek_iov;
+ peek_iov.iov_base = rpbuf;
+ peek_iov.iov_len = 2;
+
+ struct msghdr peek_header = {};
+ peek_header.msg_iov = &peek_iov;
+ peek_header.msg_iovlen = 1;
+
+ dfsan_set_label(i_label, rpbuf, sizeof(rpbuf));
+ dfsan_set_label(i_label, &peek_header, sizeof(peek_header));
+
+ DEFINE_AND_SAVE_ORIGINS(peek_header)
+
+ ssize_t received = recvmsg(sockfds[1], &peek_header, MSG_PEEK | MSG_TRUNC);
+ assert(received == sent);
+ assert(memcmp(sbuf, rpbuf, 2) == 0);
+ ASSERT_ZERO_LABEL(received);
+ ASSERT_READ_ZERO_LABEL(&peek_header, sizeof(peek_header));
+ ASSERT_READ_ZERO_LABEL(&rpbuf[0], 0);
+
+ ASSERT_SAVED_ORIGINS(peek_header)
+ }
+
+ {
+ char rbuf[128];
+ struct iovec riovs[2] = {{&rbuf[0], 4}, {&rbuf[4], 4}};
+ struct msghdr rmsg = {};
+ rmsg.msg_iov = riovs;
+ rmsg.msg_iovlen = 2;
+
+ dfsan_set_label(i_label, rbuf, sizeof(rbuf));
+ dfsan_set_label(i_label, &rmsg, sizeof(rmsg));
+
+ DEFINE_AND_SAVE_ORIGINS(rmsg)
+
+ ssize_t received = recvmsg(sockfds[1], &rmsg, 0);
+ assert(received == sent);
+ assert(memcmp(sbuf, rbuf, 8) == 0);
+ ASSERT_ZERO_LABEL(received);
+ ASSERT_READ_ZERO_LABEL(&rmsg, sizeof(rmsg));
+ ASSERT_READ_ZERO_LABEL(&rbuf[0], 8);
+ ASSERT_READ_LABEL(&rbuf[8], 1, i_label);
+
+ ASSERT_SAVED_ORIGINS(rmsg)
+ }
close(sockfds[0]);
close(sockfds[1]);
diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt
index c8e7502..af34366 100644
--- a/flang/CMakeLists.txt
+++ b/flang/CMakeLists.txt
@@ -336,7 +336,7 @@ endif()
if (FLANG_RUNTIME_F128_MATH_LIB)
add_compile_definitions(
- -DFLANG_RUNTIME_F128_MATH_LIB="${FLANG_RUNTIME_F128_MATH_LIB}"
+ FLANG_RUNTIME_F128_MATH_LIB="${FLANG_RUNTIME_F128_MATH_LIB}"
)
endif()
diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md
index 43ed35e..7b872c7 100644
--- a/flang/docs/Extensions.md
+++ b/flang/docs/Extensions.md
@@ -223,6 +223,10 @@ end
* When a dummy argument is `POINTER` or `ALLOCATABLE` and is `INTENT(IN)`, we
relax enforcement of some requirements on actual arguments that must otherwise
hold true for definable arguments.
+* We allow a limited polymorphic `POINTER` or `ALLOCATABLE` actual argument
+ to be associated with a compatible monomorphic dummy argument, as
+ our implementation, like others, supports a reallocation that would
+  change the dynamic type.

* Assignment of `LOGICAL` to `INTEGER` and vice versa (but not other types) is
allowed. The values are normalized to canonical `.TRUE.`/`.FALSE.`.
The values are also normalized for assignments of `LOGICAL(KIND=K1)` to
diff --git a/flang/include/flang/Common/Fortran-features.h b/flang/include/flang/Common/Fortran-features.h
index f57fcdc..15c4af6 100644
--- a/flang/include/flang/Common/Fortran-features.h
+++ b/flang/include/flang/Common/Fortran-features.h
@@ -49,7 +49,8 @@ ENUM_CLASS(LanguageFeature, BackslashEscapes, OldDebugLines,
IndistinguishableSpecifics, SubroutineAndFunctionSpecifics,
EmptySequenceType, NonSequenceCrayPointee, BranchIntoConstruct,
BadBranchTarget, ConvertedArgument, HollerithPolymorphic, ListDirectedSize,
- NonBindCInteroperability, CudaManaged, CudaUnified)
+ NonBindCInteroperability, CudaManaged, CudaUnified,
+ PolymorphicActualAllocatableOrPointerToMonomorphicDummy)
// Portability and suspicious usage warnings
ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable,
diff --git a/flang/include/flang/Common/api-attrs.h b/flang/include/flang/Common/api-attrs.h
index 04ee307..d73e609 100644
--- a/flang/include/flang/Common/api-attrs.h
+++ b/flang/include/flang/Common/api-attrs.h
@@ -156,4 +156,26 @@
#define RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
#endif /* !defined(__CUDACC__) */
+/*
+ * RT_DEVICE_NOINLINE may be used for non-performance critical
+ * functions that should not be inlined to minimize the amount
+ * of code that needs to be processed by the device compiler's
+ * optimizer.
+ */
+#ifndef __has_attribute
+#define __has_attribute(x) 0
+#endif
+#if __has_attribute(noinline)
+#define RT_NOINLINE_ATTR __attribute__((noinline))
+#else
+#define RT_NOINLINE_ATTR
+#endif
+#if (defined(__CUDACC__) || defined(__CUDA__)) && defined(__CUDA_ARCH__)
+#define RT_DEVICE_NOINLINE RT_NOINLINE_ATTR
+#define RT_DEVICE_NOINLINE_HOST_INLINE
+#else
+#define RT_DEVICE_NOINLINE
+#define RT_DEVICE_NOINLINE_HOST_INLINE inline
+#endif
+
#endif /* !FORTRAN_RUNTIME_API_ATTRS_H_ */
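The comment added to api-attrs.h above describes the noinline fallback chain; the same detection pattern in isolation (macro and function names here are illustrative, not the flang runtime ones) looks like:

    // Guard so the code also compiles with compilers lacking __has_attribute.
    #ifndef __has_attribute
    #define __has_attribute(x) 0
    #endif

    #if __has_attribute(noinline)
    #define DEMO_NOINLINE __attribute__((noinline))
    #else
    #define DEMO_NOINLINE
    #endif

    // Kept out of line to reduce the amount of code the optimizer must process.
    DEMO_NOINLINE static int Square(int x) { return x * x; }

    int main() { return Square(3) == 9 ? 0 : 1; }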
diff --git a/flang/include/flang/Common/visit.h b/flang/include/flang/Common/visit.h
index d867338..ad66297 100644
--- a/flang/include/flang/Common/visit.h
+++ b/flang/include/flang/Common/visit.h
@@ -30,7 +30,7 @@ namespace log2visit {
template <std::size_t LOW, std::size_t HIGH, typename RESULT, typename VISITOR,
typename... VARIANT>
-inline RT_API_ATTRS RESULT Log2VisitHelper(
+RT_DEVICE_NOINLINE_HOST_INLINE RT_API_ATTRS RESULT Log2VisitHelper(
VISITOR &&visitor, std::size_t which, VARIANT &&...u) {
if constexpr (LOW + 7 >= HIGH) {
switch (which - LOW) {
@@ -68,8 +68,9 @@ inline RT_API_ATTRS RESULT Log2VisitHelper(
}
template <typename VISITOR, typename... VARIANT>
-inline RT_API_ATTRS auto visit(VISITOR &&visitor, VARIANT &&...u)
- -> decltype(visitor(std::get<0>(std::forward<VARIANT>(u))...)) {
+RT_DEVICE_NOINLINE_HOST_INLINE RT_API_ATTRS auto
+visit(VISITOR &&visitor, VARIANT &&...u) -> decltype(visitor(std::get<0>(
+ std::forward<VARIANT>(u))...)) {
using Result = decltype(visitor(std::get<0>(std::forward<VARIANT>(u))...));
if constexpr (sizeof...(u) == 1) {
static constexpr std::size_t high{
diff --git a/flang/include/flang/Evaluate/characteristics.h b/flang/include/flang/Evaluate/characteristics.h
index 8aa065b..9695c66 100644
--- a/flang/include/flang/Evaluate/characteristics.h
+++ b/flang/include/flang/Evaluate/characteristics.h
@@ -386,7 +386,7 @@ struct Procedure {
bool HasExplicitInterface() const {
return !attrs.test(Attr::ImplicitInterface);
}
- int FindPassIndex(std::optional<parser::CharBlock>) const;
+ std::optional<int> FindPassIndex(std::optional<parser::CharBlock>) const;
bool CanBeCalledViaImplicitInterface(std::string *whyNot = nullptr) const;
bool CanOverride(const Procedure &, std::optional<int> passIndex) const;
bool IsCompatibleWith(const Procedure &, bool ignoreImplicitVsExplicit,
diff --git a/flang/include/flang/Evaluate/constant.h b/flang/include/flang/Evaluate/constant.h
index 71be790..d9866a0 100644
--- a/flang/include/flang/Evaluate/constant.h
+++ b/flang/include/flang/Evaluate/constant.h
@@ -126,8 +126,7 @@ public:
constexpr Result result() const { return result_; }
constexpr DynamicType GetType() const { return result_.GetType(); }
- llvm::raw_ostream &AsFortran(llvm::raw_ostream &,
- const parser::CharBlock *derivedTypeRename = nullptr) const;
+ llvm::raw_ostream &AsFortran(llvm::raw_ostream &) const;
protected:
std::vector<Element> Reshape(const ConstantSubscripts &) const;
diff --git a/flang/include/flang/Evaluate/expression.h b/flang/include/flang/Evaluate/expression.h
index 64db0b8..642ddf5 100644
--- a/flang/include/flang/Evaluate/expression.h
+++ b/flang/include/flang/Evaluate/expression.h
@@ -735,8 +735,7 @@ public:
StructureConstructor &Add(const semantics::Symbol &, Expr<SomeType> &&);
int Rank() const { return 0; }
DynamicType GetType() const;
- llvm::raw_ostream &AsFortran(llvm::raw_ostream &,
- const parser::CharBlock *derivedTypeRename = nullptr) const;
+ llvm::raw_ostream &AsFortran(llvm::raw_ostream &) const;
private:
std::optional<Expr<SomeType>> CreateParentComponent(const Symbol &) const;
diff --git a/flang/include/flang/Evaluate/type.h b/flang/include/flang/Evaluate/type.h
index 93a0f21..de19e3d 100644
--- a/flang/include/flang/Evaluate/type.h
+++ b/flang/include/flang/Evaluate/type.h
@@ -272,9 +272,6 @@ const semantics::DerivedTypeSpec *GetDerivedTypeSpec(
const semantics::DerivedTypeSpec *GetParentTypeSpec(
const semantics::DerivedTypeSpec &);
-std::string DerivedTypeSpecAsFortran(const semantics::DerivedTypeSpec &,
- const parser::CharBlock *derivedTypeRename = nullptr);
-
template <TypeCategory CATEGORY, int KIND = 0> struct TypeBase {
static constexpr TypeCategory category{CATEGORY};
static constexpr int kind{KIND};
diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
index 977a69a..357df3b 100644
--- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
@@ -333,7 +333,10 @@ struct IntrinsicLibrary {
llvm::ArrayRef<fir::ExtendedValue>);
mlir::Value genScale(mlir::Type, llvm::ArrayRef<mlir::Value>);
fir::ExtendedValue genScan(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
+ fir::ExtendedValue genSelectedCharKind(mlir::Type,
+ llvm::ArrayRef<fir::ExtendedValue>);
mlir::Value genSelectedIntKind(mlir::Type, llvm::ArrayRef<mlir::Value>);
+ mlir::Value genSelectedLogicalKind(mlir::Type, llvm::ArrayRef<mlir::Value>);
mlir::Value genSelectedRealKind(mlir::Type, llvm::ArrayRef<mlir::Value>);
mlir::Value genSetExponent(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args);
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Numeric.h b/flang/include/flang/Optimizer/Builder/Runtime/Numeric.h
index fec8c99..5583582 100644
--- a/flang/include/flang/Optimizer/Builder/Runtime/Numeric.h
+++ b/flang/include/flang/Optimizer/Builder/Runtime/Numeric.h
@@ -46,10 +46,18 @@ mlir::Value genRRSpacing(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::Value genScale(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::Value x, mlir::Value i);
+/// Generate call to Selected_char_kind intrinsic runtime routine.
+mlir::Value genSelectedCharKind(fir::FirOpBuilder &builder, mlir::Location loc,
+ mlir::Value name, mlir::Value length);
+
/// Generate call to Selected_int_kind intrinsic runtime routine.
mlir::Value genSelectedIntKind(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::Value x);
+/// Generate call to Selected_logical_kind intrinsic runtime routine.
+mlir::Value genSelectedLogicalKind(fir::FirOpBuilder &builder,
+ mlir::Location loc, mlir::Value x);
+
/// Generate call to Selected_real_kind intrinsic runtime routine.
mlir::Value genSelectedRealKind(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::Value precision, mlir::Value range,
diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
index 72157bc..37b8da0 100644
--- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
+++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
@@ -152,15 +152,21 @@ def cuf_DataTransferOp : cuf_Op<"data_transfer", []> {
a = adev ! transfer device to host
bdev = adev ! transfer device to device
```
+
+    When the data transfer is performed on data held by descriptors, the data
+    held by the LHS descriptor are updated. When required, the LHS descriptor
+    is also updated.
}];
- let arguments = (ins Arg<AnyReferenceLike, "", [MemWrite]>:$src,
- Arg<AnyReferenceLike, "", [MemRead]>:$dst,
+ let arguments = (ins Arg<AnyType, "", [MemRead]>:$src,
+ Arg<AnyRefOrBoxType, "", [MemWrite]>:$dst,
cuf_DataTransferKindAttr:$transfer_kind);
let assemblyFormat = [{
$src `to` $dst attr-dict `:` type(operands)
}];
+
+ let hasVerifier = 1;
}
def cuf_KernelLaunchOp : cuf_Op<"kernel_launch", [CallOpInterface,
diff --git a/flang/include/flang/Optimizer/HLFIR/Passes.h b/flang/include/flang/Optimizer/HLFIR/Passes.h
index f0736c7..4fa619c 100644
--- a/flang/include/flang/Optimizer/HLFIR/Passes.h
+++ b/flang/include/flang/Optimizer/HLFIR/Passes.h
@@ -24,9 +24,6 @@ namespace hlfir {
std::unique_ptr<mlir::Pass> createConvertHLFIRtoFIRPass();
std::unique_ptr<mlir::Pass> createBufferizeHLFIRPass();
-std::unique_ptr<mlir::Pass> createLowerHLFIRIntrinsicsPass();
-std::unique_ptr<mlir::Pass> createLowerHLFIROrderedAssignmentsPass();
-std::unique_ptr<mlir::Pass> createOptimizedBufferizationPass();
#define GEN_PASS_REGISTRATION
#include "flang/Optimizer/HLFIR/Passes.h.inc"
diff --git a/flang/include/flang/Optimizer/HLFIR/Passes.td b/flang/include/flang/Optimizer/HLFIR/Passes.td
index 0d4496a4..fc3d2a0 100644
--- a/flang/include/flang/Optimizer/HLFIR/Passes.td
+++ b/flang/include/flang/Optimizer/HLFIR/Passes.td
@@ -23,19 +23,16 @@ def BufferizeHLFIR : Pass<"bufferize-hlfir", "::mlir::ModuleOp"> {
let constructor = "hlfir::createBufferizeHLFIRPass()";
}
-def OptimizedBufferization : Pass<"opt-bufferization", "::mlir::func::FuncOp"> {
+def OptimizedBufferization : Pass<"opt-bufferization"> {
let summary = "Special cases for hlfir.expr bufferization where we can avoid a temporary which would be created by the generic bufferization pass";
- let constructor = "hlfir::createOptimizedBufferizationPass()";
}
def LowerHLFIRIntrinsics : Pass<"lower-hlfir-intrinsics", "::mlir::ModuleOp"> {
let summary = "Lower HLFIR transformational intrinsic operations";
- let constructor = "hlfir::createLowerHLFIRIntrinsicsPass()";
}
def LowerHLFIROrderedAssignments : Pass<"lower-hlfir-ordered-assignments", "::mlir::ModuleOp"> {
let summary = "Lower HLFIR ordered assignments like forall and where operations";
- let constructor = "hlfir::createLowerHLFIROrderedAssignmentsPass()";
let options = [
Option<"tryFusingAssignments", "fuse-assignments",
"bool", /*default=*/"false",
diff --git a/flang/include/flang/Semantics/scope.h b/flang/include/flang/Semantics/scope.h
index 2107277..a58163f 100644
--- a/flang/include/flang/Semantics/scope.h
+++ b/flang/include/flang/Semantics/scope.h
@@ -225,6 +225,7 @@ public:
ImportKind GetImportKind() const;
// Names appearing in IMPORT statements in this scope
std::set<SourceName> importNames() const { return importNames_; }
+ bool CanImport(const SourceName &) const;
// Set the kind of imports from host into this scope.
// Return an error message for incompatible kinds.
@@ -298,7 +299,6 @@ private:
// or Symbol& points to one in there.
static Symbols<1024> allSymbols;
- bool CanImport(const SourceName &) const;
const DeclTypeSpec &MakeLengthlessType(DeclTypeSpec &&);
friend llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Scope &);
diff --git a/flang/include/flang/Semantics/semantics.h b/flang/include/flang/Semantics/semantics.h
index 167e613..d382663 100644
--- a/flang/include/flang/Semantics/semantics.h
+++ b/flang/include/flang/Semantics/semantics.h
@@ -110,6 +110,9 @@ public:
evaluate::FoldingContext &foldingContext() { return foldingContext_; }
parser::AllCookedSources &allCookedSources() { return allCookedSources_; }
ModuleDependences &moduleDependences() { return moduleDependences_; }
+ std::map<const Symbol *, SourceName> &moduleFileOutputRenamings() {
+ return moduleFileOutputRenamings_;
+ }
SemanticsContext &set_location(
const std::optional<parser::CharBlock> &location) {
@@ -299,6 +302,7 @@ private:
std::list<parser::Program> modFileParseTrees_;
std::unique_ptr<CommonBlockMap> commonBlockMap_;
ModuleDependences moduleDependences_;
+ std::map<const Symbol *, SourceName> moduleFileOutputRenamings_;
};
class Semantics {
diff --git a/flang/include/flang/Semantics/symbol.h b/flang/include/flang/Semantics/symbol.h
index 50f7b68..f130036 100644
--- a/flang/include/flang/Semantics/symbol.h
+++ b/flang/include/flang/Semantics/symbol.h
@@ -815,6 +815,7 @@ public:
void SetIsExplicitBindName(bool);
bool IsFuncResult() const;
bool IsObjectArray() const;
+ const ArraySpec *GetShape() const;
bool IsSubprogram() const;
bool IsFromModFile() const;
bool HasExplicitInterface() const {
diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc
index 3900b17..56cc9da 100644
--- a/flang/include/flang/Tools/CLOptions.inc
+++ b/flang/include/flang/Tools/CLOptions.inc
@@ -324,10 +324,11 @@ inline void createHLFIRToFIRPassPipeline(
if (optLevel.isOptimizingForSpeed()) {
addCanonicalizerPassWithoutRegionSimplification(pm);
pm.addPass(mlir::createCSEPass());
- pm.addPass(hlfir::createOptimizedBufferizationPass());
+ addNestedPassToAllTopLevelOperations(
+ pm, hlfir::createOptimizedBufferization);
}
- pm.addPass(hlfir::createLowerHLFIROrderedAssignmentsPass());
- pm.addPass(hlfir::createLowerHLFIRIntrinsicsPass());
+ pm.addPass(hlfir::createLowerHLFIROrderedAssignments());
+ pm.addPass(hlfir::createLowerHLFIRIntrinsics());
pm.addPass(hlfir::createBufferizeHLFIRPass());
pm.addPass(hlfir::createConvertHLFIRtoFIRPass());
}
diff --git a/flang/lib/Evaluate/characteristics.cpp b/flang/lib/Evaluate/characteristics.cpp
index ab03ca5..a0ce190 100644
--- a/flang/lib/Evaluate/characteristics.cpp
+++ b/flang/lib/Evaluate/characteristics.cpp
@@ -1333,16 +1333,21 @@ bool Procedure::IsCompatibleWith(const Procedure &actual,
return false;
}
-int Procedure::FindPassIndex(std::optional<parser::CharBlock> name) const {
+std::optional<int> Procedure::FindPassIndex(
+ std::optional<parser::CharBlock> name) const {
int argCount{static_cast<int>(dummyArguments.size())};
- int index{0};
if (name) {
- while (index < argCount && *name != dummyArguments[index].name.c_str()) {
- ++index;
+ for (int index{0}; index < argCount; ++index) {
+ if (*name == dummyArguments[index].name.c_str()) {
+ return index;
+ }
}
+ return std::nullopt;
+ } else if (argCount > 0) {
+ return 0;
+ } else {
+ return std::nullopt;
}
- CHECK(index < argCount);
- return index;
}
bool Procedure::CanOverride(
diff --git a/flang/lib/Evaluate/formatting.cpp b/flang/lib/Evaluate/formatting.cpp
index 20193b0..0870d565 100644
--- a/flang/lib/Evaluate/formatting.cpp
+++ b/flang/lib/Evaluate/formatting.cpp
@@ -14,6 +14,7 @@
#include "flang/Evaluate/fold.h"
#include "flang/Evaluate/tools.h"
#include "flang/Parser/characters.h"
+#include "flang/Semantics/semantics.h"
#include "flang/Semantics/symbol.h"
#include "llvm/Support/raw_ostream.h"
@@ -53,7 +54,7 @@ static void ShapeAsFortran(llvm::raw_ostream &o,
template <typename RESULT, typename VALUE>
llvm::raw_ostream &ConstantBase<RESULT, VALUE>::AsFortran(
- llvm::raw_ostream &o, const parser::CharBlock *derivedTypeRename) const {
+ llvm::raw_ostream &o) const {
bool hasNonDefaultLowerBound{printLbounds && HasNonDefaultLowerBound()};
if (Rank() > 1 || hasNonDefaultLowerBound) {
o << "reshape(";
@@ -85,8 +86,7 @@ llvm::raw_ostream &ConstantBase<RESULT, VALUE>::AsFortran(
o << ".false." << '_' << Result::kind;
}
} else {
- StructureConstructor{result_.derivedTypeSpec(), value}.AsFortran(
- o, derivedTypeRename);
+ StructureConstructor{result_.derivedTypeSpec(), value}.AsFortran(o);
}
}
if (Rank() > 0) {
@@ -124,9 +124,89 @@ llvm::raw_ostream &Constant<Type<TypeCategory::Character, KIND>>::AsFortran(
return o;
}
+llvm::raw_ostream &EmitVar(llvm::raw_ostream &o, const Symbol &symbol,
+ std::optional<parser::CharBlock> name = std::nullopt) {
+ const auto &renamings{symbol.owner().context().moduleFileOutputRenamings()};
+ if (auto iter{renamings.find(&symbol)}; iter != renamings.end()) {
+ return o << iter->second.ToString();
+ } else if (name) {
+ return o << name->ToString();
+ } else {
+ return o << symbol.name().ToString();
+ }
+}
+
+llvm::raw_ostream &EmitVar(llvm::raw_ostream &o, const std::string &lit) {
+ return o << parser::QuoteCharacterLiteral(lit);
+}
+
+llvm::raw_ostream &EmitVar(llvm::raw_ostream &o, const std::u16string &lit) {
+ return o << parser::QuoteCharacterLiteral(lit);
+}
+
+llvm::raw_ostream &EmitVar(llvm::raw_ostream &o, const std::u32string &lit) {
+ return o << parser::QuoteCharacterLiteral(lit);
+}
+
+template <typename A>
+llvm::raw_ostream &EmitVar(llvm::raw_ostream &o, const A &x) {
+ return x.AsFortran(o);
+}
+
+template <typename A>
+llvm::raw_ostream &EmitVar(llvm::raw_ostream &o, common::Reference<A> x) {
+ return EmitVar(o, *x);
+}
+
+template <typename A>
+llvm::raw_ostream &EmitVar(
+ llvm::raw_ostream &o, const A *p, const char *kw = nullptr) {
+ if (p) {
+ if (kw) {
+ o << kw;
+ }
+ EmitVar(o, *p);
+ }
+ return o;
+}
+
+template <typename A>
+llvm::raw_ostream &EmitVar(
+ llvm::raw_ostream &o, const std::optional<A> &x, const char *kw = nullptr) {
+ if (x) {
+ if (kw) {
+ o << kw;
+ }
+ EmitVar(o, *x);
+ }
+ return o;
+}
+
+template <typename A, bool COPY>
+llvm::raw_ostream &EmitVar(llvm::raw_ostream &o,
+ const common::Indirection<A, COPY> &p, const char *kw = nullptr) {
+ if (kw) {
+ o << kw;
+ }
+ EmitVar(o, p.value());
+ return o;
+}
+
+template <typename A>
+llvm::raw_ostream &EmitVar(llvm::raw_ostream &o, const std::shared_ptr<A> &p) {
+ CHECK(p);
+ return EmitVar(o, *p);
+}
+
+template <typename... A>
+llvm::raw_ostream &EmitVar(llvm::raw_ostream &o, const std::variant<A...> &u) {
+ common::visit([&](const auto &x) { EmitVar(o, x); }, u);
+ return o;
+}
+
llvm::raw_ostream &ActualArgument::AssumedType::AsFortran(
llvm::raw_ostream &o) const {
- return o << symbol_->name().ToString();
+ return EmitVar(o, *symbol_);
}
llvm::raw_ostream &ActualArgument::AsFortran(llvm::raw_ostream &o) const {
@@ -504,15 +584,37 @@ llvm::raw_ostream &ExpressionBase<RESULT>::AsFortran(
return o;
}
-llvm::raw_ostream &StructureConstructor::AsFortran(
- llvm::raw_ostream &o, const parser::CharBlock *derivedTypeRename) const {
- o << DerivedTypeSpecAsFortran(result_.derivedTypeSpec(), derivedTypeRename);
+static std::string DerivedTypeSpecAsFortran(
+ const semantics::DerivedTypeSpec &spec) {
+ std::string buf;
+ llvm::raw_string_ostream ss{buf};
+ EmitVar(ss, spec.typeSymbol(), spec.name());
+ char ch{'('};
+ for (const auto &[name, value] : spec.parameters()) {
+ ss << ch << name.ToString() << '=';
+ ch = ',';
+ if (value.isAssumed()) {
+ ss << '*';
+ } else if (value.isDeferred()) {
+ ss << ':';
+ } else {
+ value.GetExplicit()->AsFortran(ss);
+ }
+ }
+ if (ch != '(') {
+ ss << ')';
+ }
+ return ss.str();
+}
+
+llvm::raw_ostream &StructureConstructor::AsFortran(llvm::raw_ostream &o) const {
+ o << DerivedTypeSpecAsFortran(result_.derivedTypeSpec());
if (values_.empty()) {
o << '(';
} else {
char ch{'('};
for (const auto &[symbol, value] : values_) {
- value.value().AsFortran(o << ch << symbol->name().ToString() << '=');
+ value.value().AsFortran(EmitVar(o << ch, *symbol) << '=');
ch = ',';
}
}
@@ -568,101 +670,6 @@ std::string SomeDerived::AsFortran() const {
}
}
-std::string DerivedTypeSpecAsFortran(const semantics::DerivedTypeSpec &spec,
- const parser::CharBlock *derivedTypeRename) {
- std::string buf;
- llvm::raw_string_ostream ss{buf};
- ss << (derivedTypeRename ? *derivedTypeRename : spec.name()).ToString();
- char ch{'('};
- for (const auto &[name, value] : spec.parameters()) {
- ss << ch << name.ToString() << '=';
- ch = ',';
- if (value.isAssumed()) {
- ss << '*';
- } else if (value.isDeferred()) {
- ss << ':';
- } else {
- value.GetExplicit()->AsFortran(ss);
- }
- }
- if (ch != '(') {
- ss << ')';
- }
- return ss.str();
-}
-
-llvm::raw_ostream &EmitVar(llvm::raw_ostream &o, const Symbol &symbol) {
- return o << symbol.name().ToString();
-}
-
-llvm::raw_ostream &EmitVar(llvm::raw_ostream &o, const std::string &lit) {
- return o << parser::QuoteCharacterLiteral(lit);
-}
-
-llvm::raw_ostream &EmitVar(llvm::raw_ostream &o, const std::u16string &lit) {
- return o << parser::QuoteCharacterLiteral(lit);
-}
-
-llvm::raw_ostream &EmitVar(llvm::raw_ostream &o, const std::u32string &lit) {
- return o << parser::QuoteCharacterLiteral(lit);
-}
-
-template <typename A>
-llvm::raw_ostream &EmitVar(llvm::raw_ostream &o, const A &x) {
- return x.AsFortran(o);
-}
-
-template <typename A>
-llvm::raw_ostream &EmitVar(llvm::raw_ostream &o, common::Reference<A> x) {
- return EmitVar(o, *x);
-}
-
-template <typename A>
-llvm::raw_ostream &EmitVar(
- llvm::raw_ostream &o, const A *p, const char *kw = nullptr) {
- if (p) {
- if (kw) {
- o << kw;
- }
- EmitVar(o, *p);
- }
- return o;
-}
-
-template <typename A>
-llvm::raw_ostream &EmitVar(
- llvm::raw_ostream &o, const std::optional<A> &x, const char *kw = nullptr) {
- if (x) {
- if (kw) {
- o << kw;
- }
- EmitVar(o, *x);
- }
- return o;
-}
-
-template <typename A, bool COPY>
-llvm::raw_ostream &EmitVar(llvm::raw_ostream &o,
- const common::Indirection<A, COPY> &p, const char *kw = nullptr) {
- if (kw) {
- o << kw;
- }
- EmitVar(o, p.value());
- return o;
-}
-
-template <typename A>
-llvm::raw_ostream &EmitVar(llvm::raw_ostream &o, const std::shared_ptr<A> &p) {
- CHECK(p);
- return EmitVar(o, *p);
-}
-
-template <typename... A>
-llvm::raw_ostream &EmitVar(llvm::raw_ostream &o, const std::variant<A...> &u) {
- common::visit([&](const auto &x) { EmitVar(o, x); }, u);
- return o;
-}
-
llvm::raw_ostream &BaseObject::AsFortran(llvm::raw_ostream &o) const {
return EmitVar(o, u);
}
diff --git a/flang/lib/Evaluate/shape.cpp b/flang/lib/Evaluate/shape.cpp
index 6246cb9..5cf48b2 100644
--- a/flang/lib/Evaluate/shape.cpp
+++ b/flang/lib/Evaluate/shape.cpp
@@ -885,8 +885,12 @@ auto GetShapeHelper::operator()(const ProcedureRef &call) const -> Result {
intrinsic->name == "ubound") {
// For LBOUND/UBOUND, these are the array-valued cases (no DIM=)
if (!call.arguments().empty() && call.arguments().front()) {
- return Shape{
- MaybeExtentExpr{ExtentExpr{call.arguments().front()->Rank()}}};
+ if (IsAssumedRank(*call.arguments().front())) {
+ return Shape{MaybeExtentExpr{}};
+ } else {
+ return Shape{
+ MaybeExtentExpr{ExtentExpr{call.arguments().front()->Rank()}}};
+ }
}
} else if (intrinsic->name == "all" || intrinsic->name == "any" ||
intrinsic->name == "count" || intrinsic->name == "iall" ||
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 4e50de3..898b375 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -57,6 +57,7 @@
#include "flang/Semantics/symbol.h"
#include "flang/Semantics/tools.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
+#include "mlir/IR/Matchers.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Parser/Parser.h"
#include "mlir/Transforms/RegionUtils.h"
@@ -3782,21 +3783,36 @@ private:
hlfir::Entity &lhs, hlfir::Entity &rhs) {
bool lhsIsDevice = Fortran::evaluate::HasCUDAAttrs(assign.lhs);
bool rhsIsDevice = Fortran::evaluate::HasCUDAAttrs(assign.rhs);
- if (rhs.isBoxAddressOrValue() || lhs.isBoxAddressOrValue())
- TODO(loc, "CUDA data transfler with descriptors");
+
+ auto getRefIfLoaded = [](mlir::Value val) -> mlir::Value {
+ if (auto loadOp =
+ mlir::dyn_cast_or_null<fir::LoadOp>(val.getDefiningOp()))
+ return loadOp.getMemref();
+ return val;
+ };
+
+ mlir::Value rhsVal = getRefIfLoaded(rhs.getBase());
+ mlir::Value lhsVal = getRefIfLoaded(lhs.getBase());
// device = host
if (lhsIsDevice && !rhsIsDevice) {
auto transferKindAttr = cuf::DataTransferKindAttr::get(
builder.getContext(), cuf::DataTransferKind::HostDevice);
if (!rhs.isVariable()) {
- auto associate = hlfir::genAssociateExpr(
- loc, builder, rhs, rhs.getType(), ".cuf_host_tmp");
- builder.create<cuf::DataTransferOp>(loc, associate.getBase(), lhs,
- transferKindAttr);
- builder.create<hlfir::EndAssociateOp>(loc, associate);
+ // Special case if the rhs is a constant.
+ if (matchPattern(rhs.getDefiningOp(), mlir::m_Constant())) {
+ builder.create<cuf::DataTransferOp>(loc, rhs, lhsVal,
+ transferKindAttr);
+ } else {
+ auto associate = hlfir::genAssociateExpr(
+ loc, builder, rhs, rhs.getType(), ".cuf_host_tmp");
+ builder.create<cuf::DataTransferOp>(loc, associate.getBase(), lhsVal,
+ transferKindAttr);
+ builder.create<hlfir::EndAssociateOp>(loc, associate);
+ }
} else {
- builder.create<cuf::DataTransferOp>(loc, rhs, lhs, transferKindAttr);
+ builder.create<cuf::DataTransferOp>(loc, rhsVal, lhsVal,
+ transferKindAttr);
}
return;
}
@@ -3805,26 +3821,18 @@ private:
if (!lhsIsDevice && rhsIsDevice) {
auto transferKindAttr = cuf::DataTransferKindAttr::get(
builder.getContext(), cuf::DataTransferKind::DeviceHost);
- if (!rhs.isVariable()) {
- // evaluateRhs loads scalar. Look for the memory reference to be used in
- // the transfer.
- if (mlir::isa_and_nonnull<fir::LoadOp>(rhs.getDefiningOp())) {
- auto loadOp = mlir::dyn_cast<fir::LoadOp>(rhs.getDefiningOp());
- builder.create<cuf::DataTransferOp>(loc, loadOp.getMemref(), lhs,
- transferKindAttr);
- return;
- }
- } else {
- builder.create<cuf::DataTransferOp>(loc, rhs, lhs, transferKindAttr);
- }
+ builder.create<cuf::DataTransferOp>(loc, rhsVal, lhsVal,
+ transferKindAttr);
return;
}
+ // device = device
if (lhsIsDevice && rhsIsDevice) {
assert(rhs.isVariable() && "CUDA Fortran assignment rhs is not legal");
auto transferKindAttr = cuf::DataTransferKindAttr::get(
builder.getContext(), cuf::DataTransferKind::DeviceDevice);
- builder.create<cuf::DataTransferOp>(loc, rhs, lhs, transferKindAttr);
+ builder.create<cuf::DataTransferOp>(loc, rhsVal, lhsVal,
+ transferKindAttr);
return;
}
llvm_unreachable("Unhandled CUDA data transfer");
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
index 8755990..68619f6 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
@@ -882,8 +882,11 @@ bool ClauseProcessor::processMap(
// Explicit map captures are captured ByRef by default,
// optimisation passes may alter this to ByCopy or other capture
// types to optimise
+ auto location = mlir::NameLoc::get(
+ mlir::StringAttr::get(firOpBuilder.getContext(), asFortran.str()),
+ symAddr.getLoc());
mlir::omp::MapInfoOp mapOp = createMapInfoOp(
- firOpBuilder, clauseLocation, symAddr,
+ firOpBuilder, location, symAddr,
/*varPtrPtr=*/mlir::Value{}, asFortran.str(), bounds,
/*members=*/{}, /*membersIndex=*/mlir::DenseIntElementsAttr{},
static_cast<
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 17b362c..1569605 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -1604,9 +1604,12 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO;
mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM;
}
-
+ auto location =
+ mlir::NameLoc::get(mlir::StringAttr::get(firOpBuilder.getContext(),
+ sym.name().ToString()),
+ baseOp.getLoc());
mlir::Value mapOp = createMapInfoOp(
- firOpBuilder, baseOp.getLoc(), baseOp, /*varPtrPtr=*/mlir::Value{},
+ firOpBuilder, location, baseOp, /*varPtrPtr=*/mlir::Value{},
name.str(), bounds, /*members=*/{},
/*membersIndex=*/mlir::DenseIntElementsAttr{},
static_cast<
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index ae7e650..ad2f923 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -549,10 +549,18 @@ static constexpr IntrinsicHandler handlers[]{
{"back", asValue, handleDynamicOptional},
{"kind", asValue}}},
/*isElemental=*/true},
+ {"selected_char_kind",
+ &I::genSelectedCharKind,
+ {{{"name", asAddr}}},
+ /*isElemental=*/false},
{"selected_int_kind",
&I::genSelectedIntKind,
{{{"scalar", asAddr}}},
/*isElemental=*/false},
+ {"selected_logical_kind",
+ &I::genSelectedLogicalKind,
+ {{{"bits", asAddr}}},
+ /*isElemental=*/false},
{"selected_real_kind",
&I::genSelectedRealKind,
{{{"precision", asAddr, handleDynamicOptional},
@@ -5873,6 +5881,18 @@ IntrinsicLibrary::genScan(mlir::Type resultType,
return readAndAddCleanUp(resultMutableBox, resultType, "SCAN");
}
+// SELECTED_CHAR_KIND
+fir::ExtendedValue
+IntrinsicLibrary::genSelectedCharKind(mlir::Type resultType,
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 1);
+
+ return builder.createConvert(
+ loc, resultType,
+ fir::runtime::genSelectedCharKind(builder, loc, fir::getBase(args[0]),
+ fir::getLen(args[0])));
+}
+
// SELECTED_INT_KIND
mlir::Value
IntrinsicLibrary::genSelectedIntKind(mlir::Type resultType,
@@ -5884,6 +5904,17 @@ IntrinsicLibrary::genSelectedIntKind(mlir::Type resultType,
fir::runtime::genSelectedIntKind(builder, loc, fir::getBase(args[0])));
}
+// SELECTED_LOGICAL_KIND
+mlir::Value
+IntrinsicLibrary::genSelectedLogicalKind(mlir::Type resultType,
+ llvm::ArrayRef<mlir::Value> args) {
+ assert(args.size() == 1);
+
+ return builder.createConvert(loc, resultType,
+ fir::runtime::genSelectedLogicalKind(
+ builder, loc, fir::getBase(args[0])));
+}
+
// SELECTED_REAL_KIND
mlir::Value
IntrinsicLibrary::genSelectedRealKind(mlir::Type resultType,
diff --git a/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp b/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp
index 81d5d21..8ac9d64 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp
@@ -468,6 +468,26 @@ mlir::Value fir::runtime::genScale(fir::FirOpBuilder &builder,
return builder.create<fir::CallOp>(loc, func, args).getResult(0);
}
+/// Generate call to Selected_char_kind intrinsic runtime routine.
+mlir::Value fir::runtime::genSelectedCharKind(fir::FirOpBuilder &builder,
+ mlir::Location loc,
+ mlir::Value name,
+ mlir::Value length) {
+ mlir::func::FuncOp func =
+ fir::runtime::getRuntimeFunc<mkRTKey(SelectedCharKind)>(loc, builder);
+ auto fTy = func.getFunctionType();
+ auto sourceFile = fir::factory::locationToFilename(builder, loc);
+ auto sourceLine =
+ fir::factory::locationToLineNo(builder, loc, fTy.getInput(1));
+ if (!fir::isa_ref_type(name.getType()))
+ fir::emitFatalError(loc, "argument address for runtime not found");
+
+ auto args = fir::runtime::createArguments(builder, loc, fTy, sourceFile,
+ sourceLine, name, length);
+
+ return builder.create<fir::CallOp>(loc, func, args).getResult(0);
+}
+
/// Generate call to Selected_int_kind intrinsic runtime routine.
mlir::Value fir::runtime::genSelectedIntKind(fir::FirOpBuilder &builder,
mlir::Location loc,
@@ -489,6 +509,27 @@ mlir::Value fir::runtime::genSelectedIntKind(fir::FirOpBuilder &builder,
return builder.create<fir::CallOp>(loc, func, args).getResult(0);
}
+/// Generate call to Selected_logical_kind intrinsic runtime routine.
+mlir::Value fir::runtime::genSelectedLogicalKind(fir::FirOpBuilder &builder,
+ mlir::Location loc,
+ mlir::Value x) {
+ mlir::func::FuncOp func =
+ fir::runtime::getRuntimeFunc<mkRTKey(SelectedLogicalKind)>(loc, builder);
+ auto fTy = func.getFunctionType();
+ auto sourceFile = fir::factory::locationToFilename(builder, loc);
+ auto sourceLine =
+ fir::factory::locationToLineNo(builder, loc, fTy.getInput(1));
+ if (!fir::isa_ref_type(x.getType()))
+ fir::emitFatalError(loc, "argument address for runtime not found");
+ mlir::Type eleTy = fir::unwrapRefType(x.getType());
+ mlir::Value xKind = builder.createIntegerConstant(
+ loc, fTy.getInput(3), eleTy.getIntOrFloatBitWidth() / 8);
+ auto args = fir::runtime::createArguments(builder, loc, fTy, sourceFile,
+ sourceLine, x, xKind);
+
+ return builder.create<fir::CallOp>(loc, func, args).getResult(0);
+}
+
/// Generate call to Selected_real_kind intrinsic runtime routine.
mlir::Value fir::runtime::genSelectedRealKind(fir::FirOpBuilder &builder,
mlir::Location loc,
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index 72172f6..74e6872 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -2716,6 +2716,18 @@ struct GlobalOpConversion : public fir::FIROpConversion<fir::GlobalOp> {
mlir::LogicalResult
matchAndRewrite(fir::GlobalOp global, OpAdaptor adaptor,
mlir::ConversionPatternRewriter &rewriter) const override {
+
+ mlir::LLVM::DIGlobalVariableExpressionAttr dbgExpr;
+
+ if (auto fusedLoc = mlir::dyn_cast<mlir::FusedLoc>(global.getLoc())) {
+ if (auto gvAttr =
+ mlir::dyn_cast_or_null<mlir::LLVM::DIGlobalVariableAttr>(
+ fusedLoc.getMetadata())) {
+ dbgExpr = mlir::LLVM::DIGlobalVariableExpressionAttr::get(
+ global.getContext(), gvAttr, mlir::LLVM::DIExpressionAttr());
+ }
+ }
+
auto tyAttr = convertType(global.getType());
if (auto boxType = mlir::dyn_cast<fir::BaseBoxType>(global.getType()))
tyAttr = this->lowerTy().convertBoxTypeAsStruct(boxType);
@@ -2724,8 +2736,11 @@ struct GlobalOpConversion : public fir::FIROpConversion<fir::GlobalOp> {
assert(attributeTypeIsCompatible(global.getContext(), initAttr, tyAttr));
auto linkage = convertLinkage(global.getLinkName());
auto isConst = global.getConstant().has_value();
+ mlir::SymbolRefAttr comdat;
+ llvm::ArrayRef<mlir::NamedAttribute> attrs;
auto g = rewriter.create<mlir::LLVM::GlobalOp>(
- loc, tyAttr, isConst, linkage, global.getSymName(), initAttr);
+ loc, tyAttr, isConst, linkage, global.getSymName(), initAttr, 0, 0,
+ false, false, comdat, attrs, dbgExpr);
auto module = global->getParentOfType<mlir::ModuleOp>();
// Add comdat if necessary
@@ -2966,39 +2981,40 @@ struct SelectCaseOpConversion : public fir::FIROpConversion<fir::SelectCaseOp> {
caseOp.getSuccessorOperands(adaptor.getOperands(), t);
std::optional<mlir::ValueRange> cmpOps =
*caseOp.getCompareOperands(adaptor.getOperands(), t);
- mlir::Value caseArg = *(cmpOps.value().begin());
mlir::Attribute attr = cases[t];
+ assert(mlir::isa<mlir::UnitAttr>(attr) || cmpOps.has_value());
if (mlir::isa<fir::PointIntervalAttr>(attr)) {
auto cmp = rewriter.create<mlir::LLVM::ICmpOp>(
- loc, mlir::LLVM::ICmpPredicate::eq, selector, caseArg);
+ loc, mlir::LLVM::ICmpPredicate::eq, selector, cmpOps->front());
genCaseLadderStep(loc, cmp, dest, destOps, rewriter);
continue;
}
if (mlir::isa<fir::LowerBoundAttr>(attr)) {
auto cmp = rewriter.create<mlir::LLVM::ICmpOp>(
- loc, mlir::LLVM::ICmpPredicate::sle, caseArg, selector);
+ loc, mlir::LLVM::ICmpPredicate::sle, cmpOps->front(), selector);
genCaseLadderStep(loc, cmp, dest, destOps, rewriter);
continue;
}
if (mlir::isa<fir::UpperBoundAttr>(attr)) {
auto cmp = rewriter.create<mlir::LLVM::ICmpOp>(
- loc, mlir::LLVM::ICmpPredicate::sle, selector, caseArg);
+ loc, mlir::LLVM::ICmpPredicate::sle, selector, cmpOps->front());
genCaseLadderStep(loc, cmp, dest, destOps, rewriter);
continue;
}
if (mlir::isa<fir::ClosedIntervalAttr>(attr)) {
- auto cmp = rewriter.create<mlir::LLVM::ICmpOp>(
- loc, mlir::LLVM::ICmpPredicate::sle, caseArg, selector);
+ mlir::Value caseArg0 = *cmpOps->begin();
+ auto cmp0 = rewriter.create<mlir::LLVM::ICmpOp>(
+ loc, mlir::LLVM::ICmpPredicate::sle, caseArg0, selector);
auto *thisBlock = rewriter.getInsertionBlock();
auto *newBlock1 = createBlock(rewriter, dest);
auto *newBlock2 = createBlock(rewriter, dest);
rewriter.setInsertionPointToEnd(thisBlock);
- rewriter.create<mlir::LLVM::CondBrOp>(loc, cmp, newBlock1, newBlock2);
+ rewriter.create<mlir::LLVM::CondBrOp>(loc, cmp0, newBlock1, newBlock2);
rewriter.setInsertionPointToEnd(newBlock1);
- mlir::Value caseArg0 = *(cmpOps.value().begin() + 1);
- auto cmp0 = rewriter.create<mlir::LLVM::ICmpOp>(
- loc, mlir::LLVM::ICmpPredicate::sle, selector, caseArg0);
- genCondBrOp(loc, cmp0, dest, destOps, rewriter, newBlock2);
+ mlir::Value caseArg1 = *(cmpOps->begin() + 1);
+ auto cmp1 = rewriter.create<mlir::LLVM::ICmpOp>(
+ loc, mlir::LLVM::ICmpPredicate::sle, selector, caseArg1);
+ genCondBrOp(loc, cmp1, dest, destOps, rewriter, newBlock2);
rewriter.setInsertionPointToEnd(newBlock2);
continue;
}
diff --git a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
index 870652c..2c0c4c2 100644
--- a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
+++ b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
@@ -90,6 +90,24 @@ mlir::LogicalResult cuf::AllocateOp::verify() {
}
//===----------------------------------------------------------------------===//
+// DataTransferOp
+//===----------------------------------------------------------------------===//
+
+mlir::LogicalResult cuf::DataTransferOp::verify() {
+ mlir::Type srcTy = getSrc().getType();
+ mlir::Type dstTy = getDst().getType();
+ if ((fir::isa_ref_type(srcTy) && fir::isa_ref_type(dstTy)) ||
+ (fir::isa_box_type(srcTy) && fir::isa_box_type(dstTy)))
+ return mlir::success();
+ if (fir::isa_trivial(srcTy) &&
+ matchPattern(getSrc().getDefiningOp(), mlir::m_Constant()))
+ return mlir::success();
+ return emitOpError()
+ << "expect src and dst to be both references or descriptors or src to "
+ "be a constant";
+}
+
+//===----------------------------------------------------------------------===//
// DeallocateOp
//===----------------------------------------------------------------------===//
diff --git a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
index 1119635..218b38e 100644
--- a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
+++ b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
@@ -1115,7 +1115,7 @@ mlir::LogicalResult
hlfir::MatmulOp::canonicalize(MatmulOp matmulOp,
mlir::PatternRewriter &rewriter) {
// the only two uses of the transposed matrix should be for the hlfir.matmul
- // and hlfir.destory
+ // and hlfir.destroy
auto isOtherwiseUnused = [&](hlfir::TransposeOp transposeOp) -> bool {
std::size_t numUses = 0;
for (mlir::Operation *user : transposeOp.getResult().getUsers()) {
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/InlineElementals.cpp b/flang/lib/Optimizer/HLFIR/Transforms/InlineElementals.cpp
index 06d0518..6c8e3e1 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/InlineElementals.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/InlineElementals.cpp
@@ -32,7 +32,7 @@ namespace hlfir {
} // namespace hlfir
/// If the elemental has only two uses and those two are an apply operation and
-/// a destory operation, return those two, otherwise return {}
+/// a destroy operation, return those two, otherwise return {}
static std::optional<std::pair<hlfir::ApplyOp, hlfir::DestroyOp>>
getTwoUses(hlfir::ElementalOp elemental) {
mlir::Operation::user_range users = elemental->getUsers();
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp
index e9dbb70..707c0fe 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp
@@ -468,13 +468,6 @@ class LowerHLFIRIntrinsics
: public hlfir::impl::LowerHLFIRIntrinsicsBase<LowerHLFIRIntrinsics> {
public:
void runOnOperation() override {
- // TODO: make this a pass operating on FuncOp. The issue is that
- // FirOpBuilder helpers may generate new FuncOp because of runtime/llvm
- // intrinsics calls creation. This may create race conflict if the pass is
- // scheduled on FuncOp. A solution could be to provide an optional mutex
- // when building a FirOpBuilder and locking around FuncOp and GlobalOp
- // creation, but this needs a bit more thinking, so at this point the pass
- // is scheduled on the moduleOp.
mlir::ModuleOp module = this->getOperation();
mlir::MLIRContext *context = &getContext();
mlir::RewritePatternSet patterns(context);
@@ -504,7 +497,3 @@ public:
}
};
} // namespace
-
-std::unique_ptr<mlir::Pass> hlfir::createLowerHLFIRIntrinsicsPass() {
- return std::make_unique<LowerHLFIRIntrinsics>();
-}
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
index c9ff4b1..a1a89bb 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
@@ -1383,6 +1383,9 @@ class LowerHLFIROrderedAssignments
: public hlfir::impl::LowerHLFIROrderedAssignmentsBase<
LowerHLFIROrderedAssignments> {
public:
+ using LowerHLFIROrderedAssignmentsBase<
+ LowerHLFIROrderedAssignments>::LowerHLFIROrderedAssignmentsBase;
+
void runOnOperation() override {
// Running on a ModuleOp because this pass may generate FuncOp declaration
// for runtime calls. This could be a FuncOp pass otherwise.
@@ -1409,7 +1412,3 @@ public:
}
};
} // namespace
-
-std::unique_ptr<mlir::Pass> hlfir::createLowerHLFIROrderedAssignmentsPass() {
- return std::make_unique<LowerHLFIROrderedAssignments>();
-}
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
index 8d68c70..3c8424c 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
@@ -1038,7 +1038,6 @@ class OptimizedBufferizationPass
OptimizedBufferizationPass> {
public:
void runOnOperation() override {
- mlir::func::FuncOp func = getOperation();
mlir::MLIRContext *context = &getContext();
mlir::GreedyRewriteConfig config;
@@ -1062,15 +1061,11 @@ public:
patterns.insert<MinMaxlocElementalConversion<hlfir::MaxlocOp>>(context);
if (mlir::failed(mlir::applyPatternsAndFoldGreedily(
- func, std::move(patterns), config))) {
- mlir::emitError(func.getLoc(),
+ getOperation(), std::move(patterns), config))) {
+ mlir::emitError(getOperation()->getLoc(),
"failure in HLFIR optimized bufferization");
signalPassFailure();
}
}
};
} // namespace
-
-std::unique_ptr<mlir::Pass> hlfir::createOptimizedBufferizationPass() {
- return std::make_unique<OptimizedBufferizationPass>();
-}
diff --git a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
index 07e8aed..fb7c0bf 100644
--- a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
+++ b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
@@ -54,6 +54,16 @@ class AddDebugInfoPass : public fir::impl::AddDebugInfoBase<AddDebugInfoPass> {
public:
AddDebugInfoPass(fir::AddDebugInfoOptions options) : Base(options) {}
void runOnOperation() override;
+
+private:
+ llvm::StringMap<mlir::LLVM::DIModuleAttr> moduleMap;
+
+ mlir::LLVM::DIModuleAttr getOrCreateModuleAttr(
+ const std::string &name, mlir::LLVM::DIFileAttr fileAttr,
+ mlir::LLVM::DIScopeAttr scope, unsigned line, bool decl);
+
+  void handleGlobalOp(fir::GlobalOp globalOp, mlir::LLVM::DIFileAttr fileAttr,
+ mlir::LLVM::DIScopeAttr scope);
};
static uint32_t getLineFromLoc(mlir::Location loc) {
@@ -99,6 +109,70 @@ void AddDebugInfoPass::handleDeclareOp(fir::cg::XDeclareOp declOp,
declOp->setLoc(builder.getFusedLoc({declOp->getLoc()}, localVarAttr));
}
+// The `module` does not have a first-class representation in FIR. We extract
+// information about it from the names of the identifiers and keep a map to
+// avoid duplication.
+mlir::LLVM::DIModuleAttr AddDebugInfoPass::getOrCreateModuleAttr(
+ const std::string &name, mlir::LLVM::DIFileAttr fileAttr,
+ mlir::LLVM::DIScopeAttr scope, unsigned line, bool decl) {
+ mlir::MLIRContext *context = &getContext();
+ mlir::LLVM::DIModuleAttr modAttr;
+ if (auto iter{moduleMap.find(name)}; iter != moduleMap.end()) {
+ modAttr = iter->getValue();
+ } else {
+ modAttr = mlir::LLVM::DIModuleAttr::get(
+ context, fileAttr, scope, mlir::StringAttr::get(context, name),
+ /* configMacros */ mlir::StringAttr(),
+ /* includePath */ mlir::StringAttr(),
+ /* apinotes */ mlir::StringAttr(), line, decl);
+ moduleMap[name] = modAttr;
+ }
+ return modAttr;
+}
+
+void AddDebugInfoPass::handleGlobalOp(fir::GlobalOp globalOp,
+ mlir::LLVM::DIFileAttr fileAttr,
+ mlir::LLVM::DIScopeAttr scope) {
+ mlir::ModuleOp module = getOperation();
+ mlir::MLIRContext *context = &getContext();
+ fir::DebugTypeGenerator typeGen(module);
+ mlir::OpBuilder builder(context);
+
+ std::pair result = fir::NameUniquer::deconstruct(globalOp.getSymName());
+ if (result.first != fir::NameUniquer::NameKind::VARIABLE)
+ return;
+
+ unsigned line = getLineFromLoc(globalOp.getLoc());
+
+  // DWARF5 says the following about Fortran modules:
+ // A Fortran 90 module may also be represented by a module entry
+ // (but no declaration attribute is warranted because Fortran has no concept
+ // of a corresponding module body).
+  // But in practice, compilers use the declaration attribute for a module in
+  // cases where the module was defined in another source file (and is only
+  // being used in this one). isInitialized() seems to provide the right
+  // information, but inverted: it is true where the module is actually defined
+  // and false where it is used.
+ // FIXME: Currently we don't have the line number on which a module was
+ // declared. We are using a best guess of line - 1 where line is the source
+ // line of the first member of the module that we encounter.
+
+ if (result.second.modules.empty())
+ return;
+
+ scope = getOrCreateModuleAttr(result.second.modules[0], fileAttr, scope,
+ line - 1, !globalOp.isInitialized());
+
+ mlir::LLVM::DITypeAttr diType = typeGen.convertType(
+ globalOp.getType(), fileAttr, scope, globalOp.getLoc());
+ auto gvAttr = mlir::LLVM::DIGlobalVariableAttr::get(
+ context, scope, mlir::StringAttr::get(context, result.second.name),
+ mlir::StringAttr::get(context, globalOp.getName()), fileAttr, line,
+ diType, /*isLocalToUnit*/ false,
+ /*isDefinition*/ globalOp.isInitialized(), /* alignInBits*/ 0);
+ globalOp->setLoc(builder.getFusedLoc({globalOp->getLoc()}, gvAttr));
+}
+
void AddDebugInfoPass::runOnOperation() {
mlir::ModuleOp module = getOperation();
mlir::MLIRContext *context = &getContext();
@@ -138,6 +212,12 @@ void AddDebugInfoPass::runOnOperation() {
llvm::dwarf::getLanguage("DW_LANG_Fortran95"), fileAttr, producer,
isOptimized, debugLevel);
+ if (debugLevel == mlir::LLVM::DIEmissionKind::Full) {
+ // Process 'GlobalOp' only if full debug info is requested.
+ for (auto globalOp : module.getOps<fir::GlobalOp>())
+ handleGlobalOp(globalOp, fileAttr, cuAttr);
+ }
+
module.walk([&](mlir::func::FuncOp funcOp) {
mlir::Location l = funcOp->getLoc();
// If fused location has already been created then nothing to do
@@ -180,6 +260,7 @@ void AddDebugInfoPass::runOnOperation() {
// Only definitions need a distinct identifier and a compilation unit.
mlir::DistinctAttr id;
+ mlir::LLVM::DIScopeAttr Scope = fileAttr;
mlir::LLVM::DICompileUnitAttr compilationUnit;
mlir::LLVM::DISubprogramFlags subprogramFlags =
mlir::LLVM::DISubprogramFlags{};
@@ -192,9 +273,13 @@ void AddDebugInfoPass::runOnOperation() {
subprogramFlags | mlir::LLVM::DISubprogramFlags::Definition;
}
unsigned line = getLineFromLoc(l);
+ if (!result.second.modules.empty())
+ Scope = getOrCreateModuleAttr(result.second.modules[0], fileAttr, cuAttr,
+ line - 1, false);
+
auto spAttr = mlir::LLVM::DISubprogramAttr::get(
- context, id, compilationUnit, fileAttr, funcName, fullName,
- funcFileAttr, line, line, subprogramFlags, subTypeAttr);
+ context, id, compilationUnit, Scope, funcName, fullName, funcFileAttr,
+ line, line, subprogramFlags, subTypeAttr);
funcOp->setLoc(builder.getFusedLoc({funcOp->getLoc()}, spAttr));
// Don't process variables if user asked for line tables only.
diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp
index 64c6547..07163de 100644
--- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp
+++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp
@@ -37,6 +37,45 @@ static mlir::LLVM::DITypeAttr genPlaceholderType(mlir::MLIRContext *context) {
llvm::dwarf::DW_ATE_signed);
}
+mlir::LLVM::DITypeAttr DebugTypeGenerator::convertSequenceType(
+ fir::SequenceType seqTy, mlir::LLVM::DIFileAttr fileAttr,
+ mlir::LLVM::DIScopeAttr scope, mlir::Location loc) {
+
+ mlir::MLIRContext *context = module.getContext();
+  // FIXME: Only fixed-size arrays are handled at the moment.
+ if (seqTy.hasDynamicExtents())
+ return genPlaceholderType(context);
+
+ llvm::SmallVector<mlir::LLVM::DINodeAttr> elements;
+ mlir::LLVM::DITypeAttr elemTy =
+ convertType(seqTy.getEleTy(), fileAttr, scope, loc);
+
+ for (fir::SequenceType::Extent dim : seqTy.getShape()) {
+ auto intTy = mlir::IntegerType::get(context, 64);
+    // FIXME: Only a lower bound of 1 is supported at the moment. The
+    // 'SequenceType' has information about the shape but not the shift. In
+    // cases where the conversion originated during the processing of
+    // 'DeclareOp', it may be possible to pass on this information. But the
+    // type conversion should ideally be based on the information present in
+    // the type class so that it works everywhere (e.g. when it is part of a
+    // module or a derived type).
+ auto countAttr = mlir::IntegerAttr::get(intTy, llvm::APInt(64, dim));
+ auto lowerAttr = mlir::IntegerAttr::get(intTy, llvm::APInt(64, 1));
+ auto subrangeTy = mlir::LLVM::DISubrangeAttr::get(
+ context, countAttr, lowerAttr, nullptr, nullptr);
+ elements.push_back(subrangeTy);
+ }
+  // Apart from arrays, `DICompositeTypeAttr` is also used for other things,
+  // like structure types. The fields that are not applicable to arrays are
+  // set to valid default values.
+
+ return mlir::LLVM::DICompositeTypeAttr::get(
+ context, llvm::dwarf::DW_TAG_array_type, /*recursive id*/ {},
+ /* name */ nullptr, /* file */ nullptr, /* line */ 0, /* scope */ nullptr,
+ elemTy, mlir::LLVM::DIFlags::Zero, /* sizeInBits */ 0,
+ /*alignInBits*/ 0, elements);
+}
+
mlir::LLVM::DITypeAttr
DebugTypeGenerator::convertType(mlir::Type Ty, mlir::LLVM::DIFileAttr fileAttr,
mlir::LLVM::DIScopeAttr scope,
@@ -57,6 +96,20 @@ DebugTypeGenerator::convertType(mlir::Type Ty, mlir::LLVM::DIFileAttr fileAttr,
mlir::StringAttr::get(context, logTy.getMnemonic()),
kindMapping.getLogicalBitsize(logTy.getFKind()),
llvm::dwarf::DW_ATE_boolean);
+ } else if (fir::isa_complex(Ty)) {
+ unsigned bitWidth;
+ if (auto cplxTy = mlir::dyn_cast_or_null<mlir::ComplexType>(Ty)) {
+ auto floatTy = mlir::cast<mlir::FloatType>(cplxTy.getElementType());
+ bitWidth = floatTy.getWidth();
+ } else if (auto cplxTy = mlir::dyn_cast_or_null<fir::ComplexType>(Ty)) {
+ bitWidth = kindMapping.getRealBitsize(cplxTy.getFKind());
+ } else {
+ llvm_unreachable("Unhandled complex type");
+ }
+ return genBasicType(context, mlir::StringAttr::get(context, "complex"),
+ bitWidth * 2, llvm::dwarf::DW_ATE_complex_float);
+ } else if (auto seqTy = mlir::dyn_cast_or_null<fir::SequenceType>(Ty)) {
+ return convertSequenceType(seqTy, fileAttr, scope, loc);
} else {
// FIXME: These types are currently unhandled. We are generating a
// placeholder type to allow us to test supported bits.
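
A small Fortran sketch of declarations that the added conversions can now describe in debug metadata, assuming default kind mappings:

    subroutine demo
      complex(kind=8) :: z   ! basic type, DW_ATE_complex_float, 128 bits
      integer :: a(3, 4)     ! DW_TAG_array_type with two subranges, lower bound 1
    end subroutine demo
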
diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h
index 5a2bb20..963c919 100644
--- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h
+++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h
@@ -31,6 +31,10 @@ public:
mlir::Location loc);
private:
+ mlir::LLVM::DITypeAttr convertSequenceType(fir::SequenceType seqTy,
+ mlir::LLVM::DIFileAttr fileAttr,
+ mlir::LLVM::DIScopeAttr scope,
+ mlir::Location loc);
mlir::ModuleOp module;
KindMapping kindMapping;
};
diff --git a/flang/lib/Semantics/check-call.cpp b/flang/lib/Semantics/check-call.cpp
index 8f51ef5..48c888c 100644
--- a/flang/lib/Semantics/check-call.cpp
+++ b/flang/lib/Semantics/check-call.cpp
@@ -761,7 +761,8 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy,
}
// 15.5.2.5 -- actual & dummy are both POINTER or both ALLOCATABLE
- // For INTENT(IN) we relax two checks that are in Fortran to
+ // For INTENT(IN), and for a polymorphic actual being associated with a
+ // monomorphic dummy, we relax two checks that are in Fortran to
// prevent the callee from changing the type or to avoid having
// to use a descriptor.
if (!typesCompatible) {
@@ -770,7 +771,9 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy,
(actualIsAllocatable && dummyIsAllocatable)) {
bool actualIsUnlimited{actualType.type().IsUnlimitedPolymorphic()};
bool dummyIsUnlimited{dummy.type.type().IsUnlimitedPolymorphic()};
+ bool checkTypeCompatibility{true};
if (actualIsUnlimited != dummyIsUnlimited) {
+ checkTypeCompatibility = false;
if (dummyIsUnlimited && dummy.intent == common::Intent::In &&
context.IsEnabled(common::LanguageFeature::RelaxedIntentInChecking)) {
if (context.ShouldWarn(
@@ -790,11 +793,21 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy,
messages.Say(
"If a POINTER or ALLOCATABLE dummy or actual argument is polymorphic, both should be so"_port_en_US);
}
+ } else if (actualIsPolymorphic &&
+ context.IsEnabled(common::LanguageFeature::
+ PolymorphicActualAllocatableOrPointerToMonomorphicDummy)) {
+ if (context.ShouldWarn(common::LanguageFeature::
+ PolymorphicActualAllocatableOrPointerToMonomorphicDummy)) {
+ messages.Say(
+ "If a POINTER or ALLOCATABLE actual argument is polymorphic, the corresponding dummy argument should also be so"_port_en_US);
+ }
} else {
+ checkTypeCompatibility = false;
messages.Say(
"If a POINTER or ALLOCATABLE dummy or actual argument is polymorphic, both must be so"_err_en_US);
}
- } else if (!actualIsUnlimited) {
+ }
+ if (checkTypeCompatibility && !actualIsUnlimited) {
if (!actualType.type().IsTkCompatibleWith(dummy.type.type())) {
if (dummy.intent == common::Intent::In &&
context.IsEnabled(
@@ -1116,20 +1129,20 @@ static void CheckExplicitInterfaceArg(evaluate::ActualArgument &arg,
}
auto restorer{
messages.SetLocation(arg.sourceLocation().value_or(messages.at()))};
- auto checkActualArgForLabel = [&](evaluate::ActualArgument &arg) {
+ auto CheckActualArgForLabel = [&](evaluate::ActualArgument &arg) {
if (arg.isAlternateReturn()) {
messages.Say(
"Alternate return label '%d' cannot be associated with %s"_err_en_US,
arg.GetLabel(), dummyName);
- return true;
- } else {
return false;
+ } else {
+ return true;
}
};
common::visit(
common::visitors{
[&](const characteristics::DummyDataObject &object) {
- if (!checkActualArgForLabel(arg)) {
+ if (CheckActualArgForLabel(arg)) {
ConvertBOZLiteralArg(arg, object.type.type());
if (auto *expr{arg.UnwrapExpr()}) {
if (auto type{characteristics::TypeAndShape::Characterize(
@@ -1147,9 +1160,16 @@ static void CheckExplicitInterfaceArg(evaluate::ActualArgument &arg,
evaluate::IsNullObjectPointer(*expr)) {
// ok, ASSOCIATED(NULL(without MOLD=))
} else if (object.type.attrs().test(characteristics::
- TypeAndShape::Attr::AssumedRank)) {
+ TypeAndShape::Attr::AssumedRank) &&
+ evaluate::IsNullObjectPointer(*expr) &&
+ (object.attrs.test(
+ characteristics::DummyDataObject::Attr::Allocatable) ||
+ object.attrs.test(
+ characteristics::DummyDataObject::Attr::Pointer) ||
+ !object.attrs.test(characteristics::DummyDataObject::
+ Attr::Optional))) {
messages.Say(
- "NULL() without MOLD= must not be associated with an assumed-rank dummy argument"_err_en_US);
+ "NULL() without MOLD= must not be associated with an assumed-rank dummy argument that is ALLOCATABLE, POINTER, or non-OPTIONAL"_err_en_US);
} else if ((object.attrs.test(characteristics::DummyDataObject::
Attr::Pointer) ||
object.attrs.test(characteristics::
@@ -1210,7 +1230,7 @@ static void CheckExplicitInterfaceArg(evaluate::ActualArgument &arg,
}
},
[&](const characteristics::DummyProcedure &dummy) {
- if (!checkActualArgForLabel(arg)) {
+ if (CheckActualArgForLabel(arg)) {
CheckProcedureArg(arg, proc, dummy, dummyName, context,
ignoreImplicitVsExplicit);
}
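
A sketch of the association the relaxed check targets, assuming the PolymorphicActualAllocatableOrPointerToMonomorphicDummy feature is enabled; the call is then diagnosed as a portability issue rather than a hard error:

    module m
      type t
      end type t
    contains
      subroutine sub(x)
        type(t), allocatable :: x    ! monomorphic ALLOCATABLE dummy
      end subroutine sub
      subroutine driver
        class(t), allocatable :: a   ! polymorphic ALLOCATABLE actual
        call sub(a)
      end subroutine driver
    end module m
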
diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp
index f564a0b..7034902 100644
--- a/flang/lib/Semantics/check-declarations.cpp
+++ b/flang/lib/Semantics/check-declarations.cpp
@@ -2430,16 +2430,18 @@ void CheckHelper::CheckProcBinding(
"A NOPASS type-bound procedure and its override must have identical interfaces"_err_en_US);
}
} else if (!context_.HasError(binding.symbol())) {
- int passIndex{bindingChars->FindPassIndex(binding.passName())};
- int overriddenPassIndex{
+ auto passIndex{bindingChars->FindPassIndex(binding.passName())};
+ auto overriddenPassIndex{
overriddenChars->FindPassIndex(overriddenBinding->passName())};
- if (passIndex != overriddenPassIndex) {
- SayWithDeclaration(*overridden,
- "A type-bound procedure and its override must use the same PASS argument"_err_en_US);
- } else if (!bindingChars->CanOverride(
- *overriddenChars, passIndex)) {
- SayWithDeclaration(*overridden,
- "A type-bound procedure and its override must have compatible interfaces"_err_en_US);
+ if (passIndex && overriddenPassIndex) {
+ if (*passIndex != *overriddenPassIndex) {
+ SayWithDeclaration(*overridden,
+ "A type-bound procedure and its override must use the same PASS argument"_err_en_US);
+ } else if (!bindingChars->CanOverride(
+ *overriddenChars, passIndex)) {
+ SayWithDeclaration(*overridden,
+ "A type-bound procedure and its override must have compatible interfaces"_err_en_US);
+ }
}
}
}
@@ -2960,32 +2962,6 @@ parser::Messages CheckHelper::WhyNotInteroperableDerivedType(
return msgs;
}
-static UnorderedSymbolSet CollectEntryPointsWithDummy(const Symbol &dummy) {
- UnorderedSymbolSet entries;
- const Scope &subpScope{dummy.owner()};
- for (const auto &[_, ref] : subpScope.parent()) {
- const Symbol &x{*ref};
- if (const auto *subp{x.detailsIf<SubprogramDetails>()}) {
- if (x.scope() == &subpScope || subp->entryScope() == &dummy.owner()) {
- if (std::find(subp->dummyArgs().begin(), subp->dummyArgs().end(),
- &dummy) != subp->dummyArgs().end()) {
- entries.insert(x);
- }
- }
- }
- }
- return entries;
-}
-
-static bool AnyNonBindCEntry(const Symbol &dummy) {
- for (const Symbol &subp : CollectEntryPointsWithDummy(dummy)) {
- if (!subp.attrs().test(Attr::BIND_C)) {
- return true;
- }
- }
- return false;
-}
-
parser::Messages CheckHelper::WhyNotInteroperableObject(
const Symbol &symbol, bool isError) {
parser::Messages msgs;
@@ -2998,14 +2974,14 @@ parser::Messages CheckHelper::WhyNotInteroperableObject(
examinedByWhyNotInteroperable_.insert(symbol);
CHECK(symbol.has<ObjectEntityDetails>());
if (isExplicitBindC && !symbol.owner().IsModule()) {
- messages_.Say(symbol.name(),
+ msgs.Say(symbol.name(),
"A variable with BIND(C) attribute may only appear in the specification part of a module"_err_en_US);
}
auto shape{evaluate::GetShape(foldingContext_, symbol)};
if (shape) {
if (evaluate::GetRank(*shape) == 0) { // 18.3.4
if (IsAllocatableOrPointer(symbol) && !IsDummy(symbol)) {
- messages_.Say(symbol.name(),
+ msgs.Say(symbol.name(),
"A scalar interoperable variable may not be ALLOCATABLE or POINTER"_err_en_US);
}
} else if (auto extents{
@@ -3026,33 +3002,26 @@ parser::Messages CheckHelper::WhyNotInteroperableObject(
if (derived) {
if (derived->typeSymbol().attrs().test(Attr::BIND_C)) {
} else if (isError) {
- if (auto *msg{messages_.Say(symbol.name(),
- "The derived type of a BIND(C) object must also be BIND(C)"_err_en_US)}) {
- msg->Attach(derived->typeSymbol().name(), "Non-BIND(C) type"_en_US);
- }
- context_.SetError(symbol);
+ msgs.Say(symbol.name(),
+ "The derived type of a BIND(C) object must also be BIND(C)"_err_en_US)
+ .Attach(derived->typeSymbol().name(), "Non-BIND(C) type"_en_US);
} else if (auto bad{WhyNotInteroperableDerivedType(
derived->typeSymbol(), /*isError=*/false)};
bad.AnyFatalError()) {
- if (auto *msg{messages_.Say(symbol.name(),
- "The derived type of an interoperable object must be interoperable, but is not"_err_en_US)}) {
- msg->Attach(
- derived->typeSymbol().name(), "Non-interoperable type"_en_US);
- bad.AttachTo(*msg, parser::Severity::None);
- }
+ bad.AttachTo(
+ msgs.Say(symbol.name(),
+ "The derived type of an interoperable object must be interoperable, but is not"_err_en_US)
+ .Attach(derived->typeSymbol().name(),
+ "Non-interoperable type"_en_US),
+ parser::Severity::None);
} else {
- if (auto *msg{messages_.Say(symbol.name(),
- "The derived type of an interoperable object should be BIND(C)"_warn_en_US)}) {
- msg->Attach(derived->typeSymbol().name(), "Non-BIND(C) type"_en_US);
- }
+ msgs.Say(symbol.name(),
+ "The derived type of an interoperable object should be BIND(C)"_warn_en_US)
+ .Attach(derived->typeSymbol().name(), "Non-BIND(C) type"_en_US);
}
}
if (type->IsAssumedType()) { // ok
} else if (IsAssumedLengthCharacter(symbol)) {
- if (AnyNonBindCEntry(symbol)) {
- msgs.Say(symbol.name(),
- "An assumed-length dummy argument must not appear in a non-BIND(C) entry in a subprogram with an entry that must be interoperable"_err_en_US);
- }
} else if (IsAllocatableOrPointer(symbol) &&
type->category() == DeclTypeSpec::Character &&
type->characterTypeSpec().length().isDeferred()) {
@@ -3083,12 +3052,6 @@ parser::Messages CheckHelper::WhyNotInteroperableObject(
msgs.Say(symbol.name(),
"An interoperable procedure with an OPTIONAL dummy argument might not be portable"_port_en_US);
}
- if (symbol.attrs().test(Attr::VALUE)) {
- if (AnyNonBindCEntry(symbol)) {
- msgs.Say(symbol.name(),
- "A VALUE dummy argument must not appear in a non-BIND(C) entry of a subprogram with an entry that must be interoperable"_err_en_US);
- }
- }
if (IsDescriptor(symbol) && IsPointer(symbol) &&
symbol.attrs().test(Attr::CONTIGUOUS)) {
msgs.Say(symbol.name(),
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index e9637b7..5e3a572 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -2310,6 +2310,7 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Reduction &x) {
if (CheckReductionOperators(x)) {
CheckReductionTypeList(x);
}
+ CheckReductionModifier(x);
}
bool OmpStructureChecker::CheckReductionOperators(
@@ -2394,6 +2395,64 @@ void OmpStructureChecker::CheckReductionTypeList(
}
}
+void OmpStructureChecker::CheckReductionModifier(
+ const parser::OmpClause::Reduction &x) {
+ using ReductionModifier = parser::OmpReductionClause::ReductionModifier;
+ const auto &maybeModifier{std::get<std::optional<ReductionModifier>>(x.v.t)};
+ if (!maybeModifier || *maybeModifier == ReductionModifier::Default) {
+    // No modifier, or the default one, is always ok.
+ return;
+ }
+ ReductionModifier modifier{*maybeModifier};
+ const DirectiveContext &dirCtx{GetContext()};
+ if (dirCtx.directive == llvm::omp::Directive::OMPD_loop) {
+ // [5.2:257:33-34]
+ // If a reduction-modifier is specified in a reduction clause that
+ // appears on the directive, then the reduction modifier must be
+ // default.
+ context_.Say(GetContext().clauseSource,
+ "REDUCTION modifier on LOOP directive must be DEFAULT"_err_en_US);
+ }
+ if (modifier == ReductionModifier::Task) {
+ // "Task" is only allowed on worksharing or "parallel" directive.
+ static llvm::omp::Directive worksharing[]{
+ llvm::omp::Directive::OMPD_do, llvm::omp::Directive::OMPD_scope,
+ llvm::omp::Directive::OMPD_sections,
+ // There are more worksharing directives, but they do not apply:
+ // "for" is C++ only,
+ // "single" and "workshare" don't allow reduction clause,
+ // "loop" has different restrictions (checked above).
+ };
+ if (dirCtx.directive != llvm::omp::Directive::OMPD_parallel &&
+ !llvm::is_contained(worksharing, dirCtx.directive)) {
+ context_.Say(GetContext().clauseSource,
+ "Modifier 'TASK' on REDUCTION clause is only allowed with "
+ "PARALLEL or worksharing directive"_err_en_US);
+ }
+ } else if (modifier == ReductionModifier::Inscan) {
+ // "Inscan" is only allowed on worksharing-loop, worksharing-loop simd,
+ // or "simd" directive.
+ // The worksharing-loop directives are OMPD_do and OMPD_for. Only the
+ // former is allowed in Fortran.
+ switch (dirCtx.directive) {
+ case llvm::omp::Directive::OMPD_do: // worksharing-loop
+ case llvm::omp::Directive::OMPD_do_simd: // worksharing-loop simd
+ case llvm::omp::Directive::OMPD_simd: // "simd"
+ break;
+ default:
+ context_.Say(GetContext().clauseSource,
+ "Modifier 'INSCAN' on REDUCTION clause is only allowed with "
+ "worksharing-loop, worksharing-loop simd, "
+ "or SIMD directive"_err_en_US);
+ }
+ } else {
+ // Catch-all for potential future modifiers to make sure that this
+ // function is up-to-date.
+ context_.Say(GetContext().clauseSource,
+ "Unexpected modifier on REDUCTION clause"_err_en_US);
+ }
+}
+
void OmpStructureChecker::CheckIntentInPointerAndDefinable(
const parser::OmpObjectList &objectList, const llvm::omp::Clause clause) {
for (const auto &ompObject : objectList.v) {
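
An illustrative Fortran sketch of what the new modifier check accepts and rejects:

    ! INSCAN is accepted on a worksharing-loop:
    !$omp do reduction(inscan, +: total)
    do i = 1, n
      total = total + a(i)
      !$omp scan inclusive(total)
    end do
    !$omp end do

    ! TASK would be rejected on SIMD, which is neither PARALLEL nor a
    ! worksharing directive:
    ! !$omp simd reduction(task, +: total)
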
diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h
index 1f72843..4770577 100644
--- a/flang/lib/Semantics/check-omp-structure.h
+++ b/flang/lib/Semantics/check-omp-structure.h
@@ -205,6 +205,7 @@ private:
bool CheckIntrinsicOperator(
const parser::DefinedOperator::IntrinsicOperator &);
void CheckReductionTypeList(const parser::OmpClause::Reduction &);
+ void CheckReductionModifier(const parser::OmpClause::Reduction &);
void CheckMasterNesting(const parser::OpenMPBlockConstruct &x);
void ChecksOnOrderedAsBlock();
void CheckBarrierNesting(const parser::OpenMPSimpleStandaloneConstruct &x);
diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp
index 06e38da..50e2b41 100644
--- a/flang/lib/Semantics/expression.cpp
+++ b/flang/lib/Semantics/expression.cpp
@@ -1600,16 +1600,23 @@ private:
parser::CharBlock name, std::int64_t lower, std::int64_t upper,
std::int64_t stride);
- template <int KIND, typename A>
- std::optional<Expr<Type<TypeCategory::Integer, KIND>>> GetSpecificIntExpr(
- const A &x) {
- if (MaybeExpr y{exprAnalyzer_.Analyze(x)}) {
+ template <int KIND>
+ std::optional<Expr<Type<TypeCategory::Integer, KIND>>> ToSpecificInt(
+ MaybeExpr &&y) {
+ if (y) {
Expr<SomeInteger> *intExpr{UnwrapExpr<Expr<SomeInteger>>(*y)};
return Fold(exprAnalyzer_.GetFoldingContext(),
ConvertToType<Type<TypeCategory::Integer, KIND>>(
std::move(DEREF(intExpr))));
+ } else {
+ return std::nullopt;
}
- return std::nullopt;
+ }
+
+ template <int KIND, typename A>
+ std::optional<Expr<Type<TypeCategory::Integer, KIND>>> GetSpecificIntExpr(
+ const A &x) {
+ return ToSpecificInt<KIND>(exprAnalyzer_.Analyze(x));
}
// Nested array constructors all reference the same ExpressionAnalyzer,
@@ -1772,26 +1779,45 @@ void ArrayConstructorContext::Add(const parser::AcValue &x) {
// Transforms l:u(:s) into (_,_=l,u(,s)) with an anonymous index '_'
void ArrayConstructorContext::Add(const parser::AcValue::Triplet &triplet) {
- std::optional<Expr<ImpliedDoIntType>> lower{
- GetSpecificIntExpr<ImpliedDoIntType::kind>(std::get<0>(triplet.t))};
- std::optional<Expr<ImpliedDoIntType>> upper{
- GetSpecificIntExpr<ImpliedDoIntType::kind>(std::get<1>(triplet.t))};
- std::optional<Expr<ImpliedDoIntType>> stride{
- GetSpecificIntExpr<ImpliedDoIntType::kind>(std::get<2>(triplet.t))};
- if (lower && upper) {
- if (!stride) {
- stride = Expr<ImpliedDoIntType>{1};
- }
- if (!type_) {
- type_ = DynamicTypeWithLength{ImpliedDoIntType::GetType()};
+ MaybeExpr lowerExpr{exprAnalyzer_.Analyze(std::get<0>(triplet.t))};
+ MaybeExpr upperExpr{exprAnalyzer_.Analyze(std::get<1>(triplet.t))};
+ MaybeExpr strideExpr{exprAnalyzer_.Analyze(std::get<2>(triplet.t))};
+ if (lowerExpr && upperExpr) {
+ auto lowerType{lowerExpr->GetType()};
+ auto upperType{upperExpr->GetType()};
+ auto strideType{strideExpr ? strideExpr->GetType() : lowerType};
+ if (lowerType && upperType && strideType) {
+ int kind{lowerType->kind()};
+ if (upperType->kind() > kind) {
+ kind = upperType->kind();
+ }
+ if (strideType->kind() > kind) {
+ kind = strideType->kind();
+ }
+ auto lower{ToSpecificInt<ImpliedDoIntType::kind>(std::move(lowerExpr))};
+ auto upper{ToSpecificInt<ImpliedDoIntType::kind>(std::move(upperExpr))};
+ if (lower && upper) {
+ auto stride{
+ ToSpecificInt<ImpliedDoIntType::kind>(std::move(strideExpr))};
+ if (!stride) {
+ stride = Expr<ImpliedDoIntType>{1};
+ }
+ DynamicType type{TypeCategory::Integer, kind};
+ if (!type_) {
+ type_ = DynamicTypeWithLength{type};
+ }
+ parser::CharBlock anonymous;
+ if (auto converted{ConvertToType(type,
+ AsGenericExpr(
+ Expr<ImpliedDoIntType>{ImpliedDoIndex{anonymous}}))}) {
+ auto v{std::move(values_)};
+ Push(std::move(converted));
+ std::swap(v, values_);
+ values_.Push(ImpliedDo<SomeType>{anonymous, std::move(*lower),
+ std::move(*upper), std::move(*stride), std::move(v)});
+ }
+ }
}
- auto v{std::move(values_)};
- parser::CharBlock anonymous;
- Push(Expr<SomeType>{
- Expr<SomeInteger>{Expr<ImpliedDoIntType>{ImpliedDoIndex{anonymous}}}});
- std::swap(v, values_);
- values_.Push(ImpliedDo<SomeType>{anonymous, std::move(*lower),
- std::move(*upper), std::move(*stride), std::move(v)});
}
}
diff --git a/flang/lib/Semantics/mod-file.cpp b/flang/lib/Semantics/mod-file.cpp
index bb8c6c7..d7f1494 100644
--- a/flang/lib/Semantics/mod-file.cpp
+++ b/flang/lib/Semantics/mod-file.cpp
@@ -46,11 +46,11 @@ struct ModHeader {
};
static std::optional<SourceName> GetSubmoduleParent(const parser::Program &);
-static void CollectSymbols(const Scope &, SymbolVector &, SymbolVector &,
- std::map<const Symbol *, SourceName> &, UnorderedSymbolSet &);
+static void CollectSymbols(
+ const Scope &, SymbolVector &, SymbolVector &, UnorderedSymbolSet &);
static void PutPassName(llvm::raw_ostream &, const std::optional<SourceName> &);
static void PutInit(llvm::raw_ostream &, const Symbol &, const MaybeExpr &,
- const parser::Expr *, const std::map<const Symbol *, SourceName> &);
+ const parser::Expr *);
static void PutInit(llvm::raw_ostream &, const MaybeIntExpr &);
static void PutBound(llvm::raw_ostream &, const Bound &);
static void PutShapeSpec(llvm::raw_ostream &, const ShapeSpec &);
@@ -200,47 +200,105 @@ std::string ModFileWriter::GetAsString(const Symbol &symbol) {
return all.str();
}
-// Collect symbols from initializations that are being referenced directly
-// from other modules; they may require new USE associations.
-static void HarvestInitializerSymbols(
- SourceOrderedSymbolSet &set, const Scope &scope) {
- for (const auto &[_, symbol] : scope) {
- if (symbol->has<DerivedTypeDetails>()) {
- if (symbol->scope()) {
- HarvestInitializerSymbols(set, *symbol->scope());
+// Collect symbols from constant and specification expressions that are being
+// referenced directly from other modules; they may require new USE
+// associations.
+static void HarvestSymbolsNeededFromOtherModules(
+ SourceOrderedSymbolSet &, const Scope &);
+static void HarvestSymbolsNeededFromOtherModules(
+ SourceOrderedSymbolSet &set, const Symbol &symbol, const Scope &scope) {
+ auto HarvestBound{[&](const Bound &bound) {
+ if (const auto &expr{bound.GetExplicit()}) {
+ for (SymbolRef ref : evaluate::CollectSymbols(*expr)) {
+ set.emplace(*ref);
}
- } else if (const auto &generic{symbol->detailsIf<GenericDetails>()};
- generic && generic->derivedType()) {
- const Symbol &dtSym{*generic->derivedType()};
- if (dtSym.has<DerivedTypeDetails>()) {
- if (dtSym.scope()) {
- HarvestInitializerSymbols(set, *dtSym.scope());
- }
- } else {
- CHECK(dtSym.has<UseDetails>() || dtSym.has<UseErrorDetails>());
+ }
+ }};
+ auto HarvestShapeSpec{[&](const ShapeSpec &shapeSpec) {
+ HarvestBound(shapeSpec.lbound());
+ HarvestBound(shapeSpec.ubound());
+ }};
+ auto HarvestArraySpec{[&](const ArraySpec &arraySpec) {
+ for (const auto &shapeSpec : arraySpec) {
+ HarvestShapeSpec(shapeSpec);
+ }
+ }};
+
+ if (symbol.has<DerivedTypeDetails>()) {
+ if (symbol.scope()) {
+ HarvestSymbolsNeededFromOtherModules(set, *symbol.scope());
+ }
+ } else if (const auto &generic{symbol.detailsIf<GenericDetails>()};
+ generic && generic->derivedType()) {
+ const Symbol &dtSym{*generic->derivedType()};
+ if (dtSym.has<DerivedTypeDetails>()) {
+ if (dtSym.scope()) {
+ HarvestSymbolsNeededFromOtherModules(set, *dtSym.scope());
}
- } else if (IsNamedConstant(*symbol) || scope.IsDerivedType()) {
- if (const auto *object{symbol->detailsIf<ObjectEntityDetails>()}) {
- if (object->init()) {
- for (SymbolRef ref : evaluate::CollectSymbols(*object->init())) {
- set.emplace(*ref);
- }
- }
- } else if (const auto *proc{symbol->detailsIf<ProcEntityDetails>()}) {
- if (proc->init() && *proc->init()) {
- set.emplace(**proc->init());
+ } else {
+ CHECK(dtSym.has<UseDetails>() || dtSym.has<UseErrorDetails>());
+ }
+ } else if (const auto *object{symbol.detailsIf<ObjectEntityDetails>()}) {
+ HarvestArraySpec(object->shape());
+ HarvestArraySpec(object->coshape());
+ if (IsNamedConstant(symbol) || scope.IsDerivedType()) {
+ if (object->init()) {
+ for (SymbolRef ref : evaluate::CollectSymbols(*object->init())) {
+ set.emplace(*ref);
}
}
}
+ } else if (const auto *proc{symbol.detailsIf<ProcEntityDetails>()}) {
+ if (proc->init() && *proc->init() && scope.IsDerivedType()) {
+ set.emplace(**proc->init());
+ }
+ } else if (const auto *subp{symbol.detailsIf<SubprogramDetails>()}) {
+ for (const Symbol *dummy : subp->dummyArgs()) {
+ if (dummy) {
+ HarvestSymbolsNeededFromOtherModules(set, *dummy, scope);
+ }
+ }
+ if (subp->isFunction()) {
+ HarvestSymbolsNeededFromOtherModules(set, subp->result(), scope);
+ }
+ }
+}
+
+static void HarvestSymbolsNeededFromOtherModules(
+ SourceOrderedSymbolSet &set, const Scope &scope) {
+ for (const auto &[_, symbol] : scope) {
+ HarvestSymbolsNeededFromOtherModules(set, *symbol, scope);
}
}
void ModFileWriter::PrepareRenamings(const Scope &scope) {
- SourceOrderedSymbolSet symbolsInInits;
- HarvestInitializerSymbols(symbolsInInits, scope);
- for (SymbolRef s : symbolsInInits) {
+ // Identify use-associated symbols already in scope under some name
+ std::map<const Symbol *, const Symbol *> useMap;
+ for (const auto &[name, symbolRef] : scope) {
+ const Symbol *symbol{&*symbolRef};
+ while (const auto *hostAssoc{symbol->detailsIf<HostAssocDetails>()}) {
+ symbol = &hostAssoc->symbol();
+ }
+ if (const auto *use{symbol->detailsIf<UseDetails>()}) {
+ useMap.emplace(&use->symbol(), symbol);
+ }
+ }
+ // Collect symbols needed from other modules
+ SourceOrderedSymbolSet symbolsNeeded;
+ HarvestSymbolsNeededFromOtherModules(symbolsNeeded, scope);
+ // Establish any necessary renamings of symbols in other modules
+ // to their names in this scope, creating those new names when needed.
+ auto &renamings{context_.moduleFileOutputRenamings()};
+ for (SymbolRef s : symbolsNeeded) {
+ if (s->owner().kind() == Scope::Kind::DerivedType) {
+ continue; // component or binding: ok
+ }
const Scope *sMod{FindModuleContaining(s->owner())};
- if (!sMod) {
+ if (!sMod || sMod == &scope) {
+ continue;
+ }
+ if (auto iter{useMap.find(&*s)}; iter != useMap.end()) {
+ renamings.emplace(&*s, iter->second->name());
continue;
}
SourceName rename{s->name()};
@@ -272,10 +330,10 @@ void ModFileWriter::PrepareRenamings(const Scope &scope) {
uses_ << DEREF(sMod->symbol()).name() << ",only:";
if (rename != s->name()) {
uses_ << rename << "=>";
+ renamings.emplace(&*s, rename);
}
uses_ << s->name() << '\n';
useExtraAttrs_ << "private::" << rename << '\n';
- renamings_.emplace(&*s, rename);
}
}
@@ -283,9 +341,11 @@ void ModFileWriter::PrepareRenamings(const Scope &scope) {
void ModFileWriter::PutSymbols(const Scope &scope) {
SymbolVector sorted;
SymbolVector uses;
+ auto &renamings{context_.moduleFileOutputRenamings()};
+ auto previousRenamings{std::move(renamings)};
PrepareRenamings(scope);
UnorderedSymbolSet modules;
- CollectSymbols(scope, sorted, uses, renamings_, modules);
+ CollectSymbols(scope, sorted, uses, modules);
// Write module files for dependencies first so that their
// hashes are known.
for (auto ref : modules) {
@@ -318,6 +378,7 @@ void ModFileWriter::PutSymbols(const Scope &scope) {
}
}
CHECK(typeBindings.str().empty());
+ renamings = std::move(previousRenamings);
}
// Emit components in order
@@ -521,7 +582,7 @@ void ModFileWriter::PutDECStructure(
}
decls_ << ref->name();
PutShape(decls_, object->shape(), '(', ')');
- PutInit(decls_, *ref, object->init(), nullptr, renamings_);
+ PutInit(decls_, *ref, object->init(), nullptr);
emittedDECFields_.insert(*ref);
} else if (any) {
break; // any later use of this structure will use RECORD/str/
@@ -767,8 +828,7 @@ static inline SourceName NameInModuleFile(const Symbol &symbol) {
// Collect the symbols of this scope sorted by their original order, not name.
// Generics and namelists are exceptions: they are sorted after other symbols.
void CollectSymbols(const Scope &scope, SymbolVector &sorted,
- SymbolVector &uses, std::map<const Symbol *, SourceName> &renamings,
- UnorderedSymbolSet &modules) {
+ SymbolVector &uses, UnorderedSymbolSet &modules) {
SymbolVector namelist, generics;
auto symbols{scope.GetSymbols()};
std::size_t commonSize{scope.commonBlocks().size()};
@@ -878,8 +938,7 @@ void ModFileWriter::PutObjectEntity(
getSymbolAttrsToWrite(symbol));
PutShape(os, details.shape(), '(', ')');
PutShape(os, details.coshape(), '[', ']');
- PutInit(os, symbol, details.init(), details.unanalyzedPDTComponentInit(),
- renamings_);
+ PutInit(os, symbol, details.init(), details.unanalyzedPDTComponentInit());
os << '\n';
if (auto tkr{GetIgnoreTKR(symbol)}; !tkr.empty()) {
os << "!dir$ ignore_tkr(";
@@ -973,25 +1032,12 @@ void ModFileWriter::PutTypeParam(llvm::raw_ostream &os, const Symbol &symbol) {
}
void PutInit(llvm::raw_ostream &os, const Symbol &symbol, const MaybeExpr &init,
- const parser::Expr *unanalyzed,
- const std::map<const Symbol *, SourceName> &renamings) {
+ const parser::Expr *unanalyzed) {
if (IsNamedConstant(symbol) || symbol.owner().IsDerivedType()) {
const char *assign{symbol.attrs().test(Attr::POINTER) ? "=>" : "="};
if (unanalyzed) {
parser::Unparse(os << assign, *unanalyzed);
} else if (init) {
- if (const auto *dtConst{
- evaluate::UnwrapExpr<evaluate::Constant<evaluate::SomeDerived>>(
- *init)}) {
- const Symbol &dtSym{dtConst->result().derivedTypeSpec().typeSymbol()};
- if (auto iter{renamings.find(&dtSym)}; iter != renamings.end()) {
- // Initializer is a constant whose derived type's name has
- // been brought into scope from a module under a new name
- // to avoid a conflict.
- dtConst->AsFortran(os << assign, &iter->second);
- return;
- }
- }
init->AsFortran(os << assign);
}
}
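
A minimal sketch of the situation the broader harvesting addresses: a specification expression refers to a symbol use-associated from another module, so the emitted module file needs a corresponding USE (possibly under a rename):

    module m1
      integer, parameter :: n = 4
    end module m1

    module m2
      use m1, only: n
      real :: a(n)   ! "n" appears in a specification expression, so the module
                     ! file for m2 must use-associate it from m1
    end module m2
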
diff --git a/flang/lib/Semantics/mod-file.h b/flang/lib/Semantics/mod-file.h
index 739add3..be44780 100644
--- a/flang/lib/Semantics/mod-file.h
+++ b/flang/lib/Semantics/mod-file.h
@@ -57,7 +57,6 @@ private:
llvm::raw_string_ostream decls_{declsBuf_};
llvm::raw_string_ostream contains_{containsBuf_};
bool isSubmodule_{false};
- std::map<const Symbol *, SourceName> renamings_;
void WriteAll(const Scope &);
void WriteOne(const Scope &);
diff --git a/flang/lib/Semantics/resolve-names-utils.cpp b/flang/lib/Semantics/resolve-names-utils.cpp
index 3ca460b..e27a543 100644
--- a/flang/lib/Semantics/resolve-names-utils.cpp
+++ b/flang/lib/Semantics/resolve-names-utils.cpp
@@ -376,25 +376,35 @@ static void PropagateSaveAttr(const EquivalenceSet &src, EquivalenceSet &dst) {
void EquivalenceSets::AddToSet(const parser::Designator &designator) {
if (CheckDesignator(designator)) {
- Symbol &symbol{*currObject_.symbol};
- if (!currSet_.empty()) {
- // check this symbol against first of set for compatibility
- Symbol &first{currSet_.front().symbol};
- CheckCanEquivalence(designator.source, first, symbol) &&
- CheckCanEquivalence(designator.source, symbol, first);
- }
- auto subscripts{currObject_.subscripts};
- if (subscripts.empty() && symbol.IsObjectArray()) {
- // record a whole array as its first element
- for (const ShapeSpec &spec : symbol.get<ObjectEntityDetails>().shape()) {
- auto &lbound{spec.lbound().GetExplicit().value()};
- subscripts.push_back(evaluate::ToInt64(lbound).value());
+ if (Symbol * symbol{currObject_.symbol}) {
+ if (!currSet_.empty()) {
+ // check this symbol against first of set for compatibility
+ Symbol &first{currSet_.front().symbol};
+ CheckCanEquivalence(designator.source, first, *symbol) &&
+ CheckCanEquivalence(designator.source, *symbol, first);
+ }
+ auto subscripts{currObject_.subscripts};
+ if (subscripts.empty()) {
+ if (const ArraySpec * shape{symbol->GetShape()};
+ shape && shape->IsExplicitShape()) {
+ // record a whole array as its first element
+ for (const ShapeSpec &spec : *shape) {
+ if (auto lbound{spec.lbound().GetExplicit()}) {
+ if (auto lbValue{evaluate::ToInt64(*lbound)}) {
+ subscripts.push_back(*lbValue);
+ continue;
+ }
+ }
+ subscripts.clear(); // error recovery
+ break;
+ }
+ }
}
+ auto substringStart{currObject_.substringStart};
+ currSet_.emplace_back(
+ *symbol, subscripts, substringStart, designator.source);
+ PropagateSaveAttr(currSet_.back(), currSet_);
}
- auto substringStart{currObject_.substringStart};
- currSet_.emplace_back(
- symbol, subscripts, substringStart, designator.source);
- PropagateSaveAttr(currSet_.back(), currSet_);
}
currObject_ = {};
}
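
A small sketch of the whole-array case the reworked code records; with an explicit shape, the array is entered in the equivalence set as its first element using the declared lower bounds:

    integer :: a(2:5)
    integer :: b(10)
    equivalence (a, b)   ! "a" is recorded as a(2), its first element
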
diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index a46c0f3..68cfc86 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -6227,7 +6227,7 @@ void DeclarationVisitor::CheckEquivalenceSets() {
}
for (const parser::EquivalenceObject &object : *set) {
const auto &designator{object.v.value()};
- // The designator was not resolved when it was encountered so do it now.
+ // The designator was not resolved when it was encountered, so do it now.
// AnalyzeExpr causes array sections to be changed to substrings as needed
Walk(designator);
if (AnalyzeExpr(context(), designator)) {
@@ -7846,28 +7846,31 @@ bool DeclarationVisitor::CheckForHostAssociatedImplicit(
if (name.symbol) {
ApplyImplicitRules(*name.symbol, true);
}
- Symbol *hostSymbol;
- Scope *host{GetHostProcedure()};
- if (!host || isImplicitNoneType(*host)) {
- return false;
- }
- if (!name.symbol) {
- hostSymbol = &MakeSymbol(*host, name.source, Attrs{});
- ConvertToObjectEntity(*hostSymbol);
- ApplyImplicitRules(*hostSymbol);
- hostSymbol->set(Symbol::Flag::ImplicitOrError);
- } else if (name.symbol->test(Symbol::Flag::ImplicitOrError)) {
- hostSymbol = name.symbol;
- } else {
- return false;
- }
- Symbol &symbol{MakeHostAssocSymbol(name, *hostSymbol)};
- if (isImplicitNoneType()) {
- symbol.get<HostAssocDetails>().implicitOrExplicitTypeError = true;
- } else {
- symbol.get<HostAssocDetails>().implicitOrSpecExprError = true;
+ if (Scope * host{GetHostProcedure()}; host && !isImplicitNoneType(*host)) {
+ Symbol *hostSymbol{nullptr};
+ if (!name.symbol) {
+ if (currScope().CanImport(name.source)) {
+ hostSymbol = &MakeSymbol(*host, name.source, Attrs{});
+ ConvertToObjectEntity(*hostSymbol);
+ ApplyImplicitRules(*hostSymbol);
+ hostSymbol->set(Symbol::Flag::ImplicitOrError);
+ }
+ } else if (name.symbol->test(Symbol::Flag::ImplicitOrError)) {
+ hostSymbol = name.symbol;
+ }
+ if (hostSymbol) {
+ Symbol &symbol{MakeHostAssocSymbol(name, *hostSymbol)};
+ if (auto *assoc{symbol.detailsIf<HostAssocDetails>()}) {
+ if (isImplicitNoneType()) {
+ assoc->implicitOrExplicitTypeError = true;
+ } else {
+ assoc->implicitOrSpecExprError = true;
+ }
+ return true;
+ }
+ }
}
- return true;
+ return false;
}
bool DeclarationVisitor::IsUplevelReference(const Symbol &symbol) {
diff --git a/flang/lib/Semantics/symbol.cpp b/flang/lib/Semantics/symbol.cpp
index 381905b..3eb120f 100644
--- a/flang/lib/Semantics/symbol.cpp
+++ b/flang/lib/Semantics/symbol.cpp
@@ -385,9 +385,17 @@ bool Symbol::IsFuncResult() const {
details_);
}
+const ArraySpec *Symbol::GetShape() const {
+ if (const auto *details{std::get_if<ObjectEntityDetails>(&details_)}) {
+ return &details->shape();
+ } else {
+ return nullptr;
+ }
+}
+
bool Symbol::IsObjectArray() const {
- const auto *details{std::get_if<ObjectEntityDetails>(&details_)};
- return details && details->IsArray();
+ const ArraySpec *shape{GetShape()};
+ return shape && !shape->empty();
}
bool Symbol::IsSubprogram() const {
diff --git a/flang/runtime/edit-output.cpp b/flang/runtime/edit-output.cpp
index 6b24c56..1a73c85 100644
--- a/flang/runtime/edit-output.cpp
+++ b/flang/runtime/edit-output.cpp
@@ -263,7 +263,6 @@ template <int KIND>
RT_API_ATTRS decimal::ConversionToDecimalResult
RealOutputEditing<KIND>::ConvertToDecimal(
int significantDigits, enum decimal::FortranRounding rounding, int flags) {
-#if !defined(RT_DEVICE_COMPILATION)
auto converted{decimal::ConvertToDecimal<binaryPrecision>(buffer_,
sizeof buffer_, static_cast<enum decimal::DecimalConversionFlags>(flags),
significantDigits, rounding, x_)};
@@ -273,10 +272,6 @@ RealOutputEditing<KIND>::ConvertToDecimal(
sizeof buffer_);
}
return converted;
-#else // defined(RT_DEVICE_COMPILATION)
- // TODO: enable Decimal library build for the device.
- io_.GetIoErrorHandler().Crash("not implemented yet: decimal conversion");
-#endif // defined(RT_DEVICE_COMPILATION)
}
static RT_API_ATTRS bool IsInfOrNaN(const char *p, int length) {
diff --git a/flang/runtime/external-unit.cpp b/flang/runtime/external-unit.cpp
index b48549d..4bfa218 100644
--- a/flang/runtime/external-unit.cpp
+++ b/flang/runtime/external-unit.cpp
@@ -214,6 +214,13 @@ Iostat ExternalFileUnit::SetDirection(Direction direction) {
}
} else {
if (mayWrite()) {
+ if (direction_ == Direction::Input) {
+ // Don't retain any input data from previous record, like a
+ // variable-length unformatted record footer, in the frame,
+ // since we're going to start writing frames.
+ frameOffsetInFile_ += recordOffsetInFrame_;
+ recordOffsetInFrame_ = 0;
+ }
direction_ = Direction::Output;
return IostatOk;
} else {
@@ -332,5 +339,4 @@ bool ExternalFileUnit::Wait(int id) {
}
} // namespace Fortran::runtime::io
-
#endif // !defined(RT_USE_PSEUDO_FILE_UNIT)
diff --git a/flang/runtime/numeric.cpp b/flang/runtime/numeric.cpp
index 52b5a56..2225473 100644
--- a/flang/runtime/numeric.cpp
+++ b/flang/runtime/numeric.cpp
@@ -117,13 +117,13 @@ inline RT_API_ATTRS CppTypeFor<TypeCategory::Integer, 4> SelectedIntKind(T x) {
template <typename T>
inline RT_API_ATTRS CppTypeFor<TypeCategory::Integer, 4> SelectedLogicalKind(
T x) {
- if (x <= 2) {
+ if (x <= 8) {
return 1;
- } else if (x <= 4) {
+ } else if (x <= 16) {
return 2;
- } else if (x <= 9) {
+ } else if (x <= 32) {
return 4;
- } else if (x <= 18) {
+ } else if (x <= 64) {
return 8;
}
return -1;
diff --git a/flang/runtime/terminator.h b/flang/runtime/terminator.h
index 59a47ce..609f059 100644
--- a/flang/runtime/terminator.h
+++ b/flang/runtime/terminator.h
@@ -54,7 +54,7 @@ public:
// to regular printf for the device compilation.
// Try to keep the inline implementations as small as possible.
template <typename... Args>
- [[noreturn]] RT_API_ATTRS const char *Crash(
+ [[noreturn]] RT_DEVICE_NOINLINE RT_API_ATTRS const char *Crash(
const char *message, Args... args) const {
#if !defined(RT_DEVICE_COMPILATION)
// Invoke handler set up by the test harness.
diff --git a/flang/runtime/unit.cpp b/flang/runtime/unit.cpp
index 3b42f45..a11f444 100644
--- a/flang/runtime/unit.cpp
+++ b/flang/runtime/unit.cpp
@@ -265,6 +265,7 @@ void ExternalFileUnit::FinishReadingRecord(IoErrorHandler &handler) {
furthestPositionInRecord =
std::max(furthestPositionInRecord, positionInRecord);
frameOffsetInFile_ += recordOffsetInFrame_ + furthestPositionInRecord;
+ recordOffsetInFrame_ = 0;
}
BeginRecord();
}
diff --git a/flang/test/Driver/fopenmp.f90 b/flang/test/Driver/fopenmp.f90
index c71d34d..d70fe10 100644
--- a/flang/test/Driver/fopenmp.f90
+++ b/flang/test/Driver/fopenmp.f90
@@ -14,7 +14,7 @@
! CHECK-FC1-OPENMP: "-fc1"
! CHECK-FC1-OPENMP: "-fopenmp"
!
-! CHECK-WARNING: warning: The library '-fopenmp=={{.*}}' is not supported, openmp is not be enabled
+! CHECK-WARNING: warning: the library '-fopenmp=={{.*}}' is not supported, OpenMP will not be enabled
! CHECK-FC1-NO-OPENMP: "-fc1"
! CHECK-FC1-NO-OPENMP-NOT: "-fopenmp"
!
@@ -51,9 +51,14 @@
! We'd like to check that the default is sane, but until we have the ability
! to *always* semantically analyze OpenMP without always generating runtime
! calls (in the event of an unsupported runtime), we don't have a good way to
-! test the CC1 invocation. Instead, just ensure we do eventually link *some*
+! test the FC1 invocation. Instead, just ensure we do eventually link *some*
! OpenMP runtime.
!
+! RUN: %flang -target x86_64-linux-gnu -fopenmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY
+! RUN: %flang -target x86_64-darwin -fopenmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY
+! RUN: %flang -target x86_64-freebsd -fopenmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY
+! RUN: %flang -target x86_64-windows-gnu -fopenmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANYMD
+!
! CHECK-LD-ANY: "{{.*}}ld{{(.exe)?}}"
! CHECK-LD-ANY: "-l{{(omp|gomp|iomp5)}}"
!
diff --git a/flang/test/Driver/mlir-pass-pipeline.f90 b/flang/test/Driver/mlir-pass-pipeline.f90
index 6d0e6c3..b3712db 100644
--- a/flang/test/Driver/mlir-pass-pipeline.f90
+++ b/flang/test/Driver/mlir-pass-pipeline.f90
@@ -30,8 +30,15 @@ end program
! O2-NEXT: CSE
! O2-NEXT: (S) {{.*}} num-cse'd
! O2-NEXT: (S) {{.*}} num-dce'd
+! O2-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
+! O2-NEXT: 'fir.global' Pipeline
+! O2-NEXT: OptimizedBufferization
! O2-NEXT: 'func.func' Pipeline
! O2-NEXT: OptimizedBufferization
+! O2-NEXT: 'omp.declare_reduction' Pipeline
+! O2-NEXT: OptimizedBufferization
+! O2-NEXT: 'omp.private' Pipeline
+! O2-NEXT: OptimizedBufferization
! ALL: LowerHLFIROrderedAssignments
! ALL-NEXT: LowerHLFIRIntrinsics
! ALL-NEXT: BufferizeHLFIR
diff --git a/flang/test/Driver/w-arg-unsupported.f90 b/flang/test/Driver/w-arg-unsupported.f90
index 1ef25fd..be753bf 100644
--- a/flang/test/Driver/w-arg-unsupported.f90
+++ b/flang/test/Driver/w-arg-unsupported.f90
@@ -6,32 +6,32 @@
! RUN: -Wrealloc-lhs -Wrealloc-lhs-all -Wfrontend-loop-interchange -Wtarget-lifetime %s \
! RUN: -c 2>&1 | FileCheck %s
-! CHECK: The warning option '-Wextra' is not supported
-! CHECK-NEXT: The warning option '-Waliasing' is not supported
-! CHECK-NEXT: The warning option '-Wampersand' is not supported
-! CHECK-NEXT: The warning option '-Warray-bounds' is not supported
-! CHECK-NEXT: The warning option '-Wc-binding-type' is not supported
-! CHECK-NEXT: The warning option '-Wcharacter-truncation' is not supported
-! CHECK-NEXT: The warning option '-Wconversion' is not supported
-! CHECK-NEXT: The warning option '-Wdo-subscript' is not supported
-! CHECK-NEXT: The warning option '-Wfunction-elimination' is not supported
-! CHECK-NEXT: The warning option '-Wimplicit-interface' is not supported
-! CHECK-NEXT: The warning option '-Wimplicit-procedure' is not supported
-! CHECK-NEXT: The warning option '-Wintrinsic-shadow' is not supported
-! CHECK-NEXT: The warning option '-Wuse-without-only' is not supported
-! CHECK-NEXT: The warning option '-Wintrinsics-std' is not supported
-! CHECK-NEXT: The warning option '-Wline-truncation' is not supported
-! CHECK-NEXT: The warning option '-Wno-align-commons' is not supported
-! CHECK-NEXT: The warning option '-Wno-overwrite-recursive' is not supported
-! CHECK-NEXT: The warning option '-Wno-tabs' is not supported
-! CHECK-NEXT: The warning option '-Wreal-q-constant' is not supported
-! CHECK-NEXT: The warning option '-Wsurprising' is not supported
-! CHECK-NEXT: The warning option '-Wunderflow' is not supported
-! CHECK-NEXT: The warning option '-Wunused-parameter' is not supported
-! CHECK-NEXT: The warning option '-Wrealloc-lhs' is not supported
-! CHECK-NEXT: The warning option '-Wrealloc-lhs-all' is not supported
-! CHECK-NEXT: The warning option '-Wfrontend-loop-interchange' is not supported
-! CHECK-NEXT: The warning option '-Wtarget-lifetime' is not supported
+! CHECK: the warning option '-Wextra' is not supported
+! CHECK-NEXT: the warning option '-Waliasing' is not supported
+! CHECK-NEXT: the warning option '-Wampersand' is not supported
+! CHECK-NEXT: the warning option '-Warray-bounds' is not supported
+! CHECK-NEXT: the warning option '-Wc-binding-type' is not supported
+! CHECK-NEXT: the warning option '-Wcharacter-truncation' is not supported
+! CHECK-NEXT: the warning option '-Wconversion' is not supported
+! CHECK-NEXT: the warning option '-Wdo-subscript' is not supported
+! CHECK-NEXT: the warning option '-Wfunction-elimination' is not supported
+! CHECK-NEXT: the warning option '-Wimplicit-interface' is not supported
+! CHECK-NEXT: the warning option '-Wimplicit-procedure' is not supported
+! CHECK-NEXT: the warning option '-Wintrinsic-shadow' is not supported
+! CHECK-NEXT: the warning option '-Wuse-without-only' is not supported
+! CHECK-NEXT: the warning option '-Wintrinsics-std' is not supported
+! CHECK-NEXT: the warning option '-Wline-truncation' is not supported
+! CHECK-NEXT: the warning option '-Wno-align-commons' is not supported
+! CHECK-NEXT: the warning option '-Wno-overwrite-recursive' is not supported
+! CHECK-NEXT: the warning option '-Wno-tabs' is not supported
+! CHECK-NEXT: the warning option '-Wreal-q-constant' is not supported
+! CHECK-NEXT: the warning option '-Wsurprising' is not supported
+! CHECK-NEXT: the warning option '-Wunderflow' is not supported
+! CHECK-NEXT: the warning option '-Wunused-parameter' is not supported
+! CHECK-NEXT: the warning option '-Wrealloc-lhs' is not supported
+! CHECK-NEXT: the warning option '-Wrealloc-lhs-all' is not supported
+! CHECK-NEXT: the warning option '-Wfrontend-loop-interchange' is not supported
+! CHECK-NEXT: the warning option '-Wtarget-lifetime' is not supported
program m
end program
diff --git a/flang/test/Driver/wextra-ok.f90 b/flang/test/Driver/wextra-ok.f90
index 48676e8..6a38d94 100644
--- a/flang/test/Driver/wextra-ok.f90
+++ b/flang/test/Driver/wextra-ok.f90
@@ -4,7 +4,7 @@
! RUN: %flang -std=f2018 -Wextra %s -c 2>&1 | FileCheck %s --check-prefix=CHECK-OK
! RUN: not %flang -std=f2018 -Wblah -Wextra %s -c 2>&1 | FileCheck %s --check-prefix=WRONG
-! CHECK-OK: The warning option '-Wextra' is not supported
+! CHECK-OK: the warning option '-Wextra' is not supported
! WRONG: Only `-Werror` is supported currently.
program wextra_ok
diff --git a/flang/test/Evaluate/triplets01.f90 b/flang/test/Evaluate/triplets01.f90
new file mode 100644
index 0000000..aba9772
--- /dev/null
+++ b/flang/test/Evaluate/triplets01.f90
@@ -0,0 +1,11 @@
+! RUN: %python %S/test_folding.py %s %flang_fc1
+module m
+ logical, parameter :: test01 = all([1:10:2] == [(j, j=1,10,2)])
+ logical, parameter :: test02 = kind([1:20:2]) == kind(1)
+ logical, parameter :: test03 = all([10:1:-3,123] == [(j, j=10,1,-3),123])
+ logical, parameter :: test04 = kind([10:1:-3,123]) == kind(1)
+ logical, parameter :: test05 = kind([10_2:1_2:-3_2,123_2]) == 2
+ logical, parameter :: test06 = all([10_2:1_2:-3_2,123_2] == [(j, integer(2)::j=10,1,-3),123_2])
+ logical, parameter :: test07 = kind([10_2:1_4:-3_2]) == 4
+ logical, parameter :: test08 = kind([10_2:1_4]) == 4
+end
diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir
index 42bceb6..db252c4 100644
--- a/flang/test/Fir/basic-program.fir
+++ b/flang/test/Fir/basic-program.fir
@@ -34,7 +34,14 @@ func.func @_QQmain() {
// PASSES-NEXT: CSE
// PASSES-NEXT: (S) 0 num-cse'd - Number of operations CSE'd
// PASSES-NEXT: (S) 0 num-dce'd - Number of operations DCE'd
-// PASSES-NEXT: 'func.func' Pipeline
+// PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
+// PASSES-NEXT: 'fir.global' Pipeline
+// PASSES-NEXT: OptimizedBufferization
+// PASSES-NEXT: 'func.func' Pipeline
+// PASSES-NEXT: OptimizedBufferization
+// PASSES-NEXT: 'omp.declare_reduction' Pipeline
+// PASSES-NEXT: OptimizedBufferization
+// PASSES-NEXT: 'omp.private' Pipeline
// PASSES-NEXT: OptimizedBufferization
// PASSES-NEXT: LowerHLFIROrderedAssignments
// PASSES-NEXT: LowerHLFIRIntrinsics
diff --git a/flang/test/Integration/debug-complex-1.f90 b/flang/test/Integration/debug-complex-1.f90
new file mode 100644
index 0000000..c8d0da4
--- /dev/null
+++ b/flang/test/Integration/debug-complex-1.f90
@@ -0,0 +1,26 @@
+! RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -o - | FileCheck %s
+
+program mn
+ complex(kind=4) :: c4
+ complex(kind=8) :: c8
+ complex(kind=16) :: r
+ r = fn1(c4, c8)
+ print *, r
+contains
+ function fn1(a, b) result (c)
+ complex(kind=4), intent(in) :: a
+ complex(kind=8), intent(in) :: b
+ complex(kind=16) :: c
+ c = a + b
+ end function
+end program
+
+! CHECK-DAG: ![[C4:.*]] = !DIBasicType(name: "complex", size: 64, encoding: DW_ATE_complex_float)
+! CHECK-DAG: ![[C8:.*]] = !DIBasicType(name: "complex", size: 128, encoding: DW_ATE_complex_float)
+! CHECK-DAG: ![[C16:.*]] = !DIBasicType(name: "complex", size: 256, encoding: DW_ATE_complex_float)
+! CHECK-DAG: !DILocalVariable(name: "c4"{{.*}}type: ![[C4]])
+! CHECK-DAG: !DILocalVariable(name: "c8"{{.*}}type: ![[C8]])
+! CHECK-DAG: !DILocalVariable(name: "r"{{.*}}type: ![[C16]])
+! CHECK-DAG: !DILocalVariable(name: "a"{{.*}}type: ![[C4]])
+! CHECK-DAG: !DILocalVariable(name: "b"{{.*}}type: ![[C8]])
+! CHECK-DAG: !DILocalVariable(name: "c"{{.*}}type: ![[C16]])
diff --git a/flang/test/Integration/debug-fixed-array-type-2.f90 b/flang/test/Integration/debug-fixed-array-type-2.f90
new file mode 100644
index 0000000..3155254
--- /dev/null
+++ b/flang/test/Integration/debug-fixed-array-type-2.f90
@@ -0,0 +1,43 @@
+! RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -o - | FileCheck %s
+
+program mn
+
+ integer d1(3)
+ integer d2(2, 5)
+ real d3(6, 8, 7)
+
+ i8 = fn1(d1, d2, d3)
+contains
+ function fn1(a1, b1, c1) result (res)
+ integer a1(3)
+ integer b1(2, 5)
+ real c1(6, 8, 7)
+ integer res
+ res = a1(1) + b1(1,2) + c1(3, 3, 4)
+ end function
+
+end program
+
+! CHECK-DAG: ![[INT:.*]] = !DIBasicType(name: "integer", size: 32, encoding: DW_ATE_signed)
+! CHECK-DAG: ![[REAL:.*]] = !DIBasicType(name: "real", size: 32, encoding: DW_ATE_float)
+! CHECK-DAG: ![[R1:.*]] = !DISubrange(count: 3, lowerBound: 1)
+! CHECK-DAG: ![[SUB1:.*]] = !{![[R1]]}
+! CHECK-DAG: ![[D1TY:.*]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[INT]], elements: ![[SUB1]])
+! CHECK-DAG: !DILocalVariable(name: "d1"{{.*}}type: ![[D1TY]])
+
+! CHECK-DAG: ![[R21:.*]] = !DISubrange(count: 2, lowerBound: 1)
+! CHECK-DAG: ![[R22:.*]] = !DISubrange(count: 5, lowerBound: 1)
+! CHECK-DAG: ![[SUB2:.*]] = !{![[R21]], ![[R22]]}
+! CHECK-DAG: ![[D2TY:.*]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[INT]], elements: ![[SUB2]])
+! CHECK-DAG: !DILocalVariable(name: "d2"{{.*}}type: ![[D2TY]])
+
+! CHECK-DAG: ![[R31:.*]] = !DISubrange(count: 6, lowerBound: 1)
+! CHECK-DAG: ![[R32:.*]] = !DISubrange(count: 8, lowerBound: 1)
+! CHECK-DAG: ![[R33:.*]] = !DISubrange(count: 7, lowerBound: 1)
+! CHECK-DAG: ![[SUB3:.*]] = !{![[R31]], ![[R32]], ![[R33]]}
+! CHECK-DAG: ![[D3TY:.*]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[REAL]], elements: ![[SUB3]])
+! CHECK-DAG: !DILocalVariable(name: "d3"{{.*}}type: ![[D3TY]])
+
+! CHECK-DAG: !DILocalVariable(name: "a1", arg: 1{{.*}}type: ![[D1TY]])
+! CHECK-DAG: !DILocalVariable(name: "b1", arg: 2{{.*}}type: ![[D2TY]])
+! CHECK-DAG: !DILocalVariable(name: "c1", arg: 3{{.*}}type: ![[D3TY]])
diff --git a/flang/test/Integration/debug-module-2.f90 b/flang/test/Integration/debug-module-2.f90
new file mode 100644
index 0000000..60fccaa
--- /dev/null
+++ b/flang/test/Integration/debug-module-2.f90
@@ -0,0 +1,39 @@
+! RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-llvm -debug-info-kind=line-tables-only %s -o - | FileCheck --check-prefix=LINEONLY %s
+
+! CHECK-DAG: ![[FILE:.*]] = !DIFile(filename: {{.*}}debug-module-2.f90{{.*}})
+! CHECK-DAG: ![[FILE2:.*]] = !DIFile(filename: {{.*}}debug-module-2.f90{{.*}})
+! CHECK-DAG: ![[CU:.*]] = distinct !DICompileUnit({{.*}}file: ![[FILE]]{{.*}} globals: ![[GLOBALS:.*]])
+! CHECK-DAG: ![[MOD:.*]] = !DIModule(scope: ![[CU]], name: "helper", file: ![[FILE]]{{.*}})
+! CHECK-DAG: ![[R4:.*]] = !DIBasicType(name: "real", size: 32, encoding: DW_ATE_float)
+! CHECK-DAG: ![[I4:.*]] = !DIBasicType(name: "integer", size: 32, encoding: DW_ATE_signed)
+module helper
+! CHECK-DAG: ![[GLR:.*]] = distinct !DIGlobalVariable(name: "glr", linkageName: "_QMhelperEglr", scope: ![[MOD]], file: ![[FILE]], line: [[@LINE+2]], type: ![[R4]], isLocal: false, isDefinition: true)
+! CHECK-DAG: ![[GLRX:.*]] = !DIGlobalVariableExpression(var: ![[GLR]], expr: !DIExpression())
+ real glr
+
+! CHECK-DAG: ![[GLI:.*]] = distinct !DIGlobalVariable(name: "gli", linkageName: "_QMhelperEgli", scope: ![[MOD]], file: ![[FILE]], line: [[@LINE+2]], type: ![[I4]], isLocal: false, isDefinition: true)
+! CHECK-DAG: ![[GLIX:.*]] = !DIGlobalVariableExpression(var: ![[GLI]], expr: !DIExpression())
+ integer gli
+
+ contains
+!CHECK-DAG: !DISubprogram(name: "test", linkageName: "_QMhelperPtest", scope: ![[MOD]], file: ![[FILE2]], line: [[@LINE+1]]{{.*}}unit: ![[CU]])
+ subroutine test()
+ glr = 12.34
+ gli = 67
+
+ end subroutine
+end module helper
+
+program test
+use helper
+implicit none
+
+ glr = 3.14
+ gli = 2
+ call test()
+
+end program test
+
+! CHECK-DAG: ![[GLOBALS]] = !{![[GLIX]], ![[GLRX]]}
+! LINEONLY-NOT: DIGlobalVariable
diff --git a/flang/test/Lower/CUDA/cuda-data-transfer.cuf b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
index 084314e..42fa4d0 100644
--- a/flang/test/Lower/CUDA/cuda-data-transfer.cuf
+++ b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
@@ -25,6 +25,8 @@ subroutine sub1()
adev = ahost + bhost
+ adev = 10
+
end
! CHECK-LABEL: func.func @_QPsub1()
@@ -41,10 +43,7 @@ end
! CHECK: cuf.data_transfer %[[ASSOC]]#0 to %[[M]]#0 {transfer_kind = #cuf.cuda_transfer<host_device>} : !fir.ref<i32>, !fir.ref<i32>
! CHECK: hlfir.end_associate %[[ASSOC]]#1, %[[ASSOC]]#2 : !fir.ref<i32>, i1
-! CHECK: %[[C1:.*]] = arith.constant 1 : i32
-! CHECK: %[[ASSOC:.*]]:3 = hlfir.associate %[[C1]] {uniq_name = ".cuf_host_tmp"} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
-! CHECK: cuf.data_transfer %[[ASSOC]]#0 to %[[M]]#0 {transfer_kind = #cuf.cuda_transfer<host_device>} : !fir.ref<i32>, !fir.ref<i32>
-! CHECK: hlfir.end_associate %[[ASSOC]]#1, %[[ASSOC]]#2 : !fir.ref<i32>, i1
+! CHECK: cuf.data_transfer %c1{{.*}} to %[[M]]#0 {transfer_kind = #cuf.cuda_transfer<host_device>} : i32, !fir.ref<i32>
! CHECK: cuf.data_transfer %[[AHOST]]#0 to %[[ADEV]]#0 {transfer_kind = #cuf.cuda_transfer<host_device>} : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>
@@ -62,6 +61,8 @@ end
! CHECK: cuf.data_transfer %[[ASSOC]]#0 to %[[ADEV]]#0 {transfer_kind = #cuf.cuda_transfer<host_device>} : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>
! CHECK: hlfir.end_associate %[[ASSOC]]#1, %[[ASSOC]]#2 : !fir.ref<!fir.array<10xi32>>, i1
+! CHECK: cuf.data_transfer %c10{{.*}} to %[[ADEV]]#0 {transfer_kind = #cuf.cuda_transfer<host_device>} : i32, !fir.ref<!fir.array<10xi32>>
+
subroutine sub2()
integer, device :: m
integer, device :: adev(10), bdev(10)
@@ -159,3 +160,22 @@ end subroutine
! CHECK-LABEL: func.func @_QPsub6
! CHECK: cuf.data_transfer
+
+subroutine sub7(a, b, c)
+ integer, device, allocatable :: a(:), c(:)
+ integer, allocatable :: b(:)
+ b = a
+
+ a = b
+
+ c = a
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub7(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "a"}, %[[ARG1:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {fir.bindc_name = "b"}, %[[ARG2:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "c"}) {
+! CHECK: %[[A:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub7Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
+! CHECK: %[[B:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{.*}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub7Eb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
+! CHECK: %[[C:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub7Ec"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
+! CHECK: cuf.data_transfer %[[A]]#0 to %[[B]]#0 {transfer_kind = #cuf.cuda_transfer<device_host>} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! CHECK: cuf.data_transfer %[[B]]#0 to %[[A]]#0 {transfer_kind = #cuf.cuda_transfer<host_device>} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! CHECK: cuf.data_transfer %[[A]]#0 to %[[C]]#0 {transfer_kind = #cuf.cuda_transfer<device_device>} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
diff --git a/flang/test/Lower/Intrinsics/selected_char_kind.f90 b/flang/test/Lower/Intrinsics/selected_char_kind.f90
new file mode 100644
index 0000000..4012591
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/selected_char_kind.f90
@@ -0,0 +1,17 @@
+! RUN: bbc -emit-hlfir %s -o - | FileCheck %s
+
+subroutine selected_char_kind_test(c)
+ character(*) :: c
+ integer :: res
+ res = selected_char_kind(c)
+end
+
+! CHECK-LABEL: func.func @_QPselected_char_kind_test(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "c"})
+! CHECK: %[[UNBOXCHAR:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+! CHECK: %[[C:.*]]:2 = hlfir.declare %[[UNBOXCHAR]]#0 typeparams %[[UNBOXCHAR]]#1 dummy_scope %0 {uniq_name = "_QFselected_char_kind_testEc"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK: %[[RES_ALLOCA:.*]] = fir.alloca i32 {bindc_name = "res", uniq_name = "_QFselected_char_kind_testEres"}
+! CHECK: %[[RES:.*]]:2 = hlfir.declare %[[RES_ALLOCA]] {uniq_name = "_QFselected_char_kind_testEres"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[CHAR_PTR:.*]] = fir.convert %[[C]]#1 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
+! CHECK: %[[CHAR_LEN:.*]] = fir.convert %[[UNBOXCHAR]]#1 : (index) -> i64
+! CHECK: %{{.*}} = fir.call @_FortranASelectedCharKind(%{{.*}}, %{{.*}}, %[[CHAR_PTR]], %[[CHAR_LEN]]) fastmath<contract> : (!fir.ref<i8>, i32, !fir.ref<i8>, i64) -> i32
diff --git a/flang/test/Lower/Intrinsics/selected_logical_kind.f90 b/flang/test/Lower/Intrinsics/selected_logical_kind.f90
new file mode 100644
index 0000000..9395276
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/selected_logical_kind.f90
@@ -0,0 +1,71 @@
+! RUN: bbc -emit-hlfir %s -o - | FileCheck %s
+
+subroutine selected_logical_kind_test1(input)
+ integer(1) :: input, res
+ res = selected_logical_kind(input)
+end
+
+! CHECK-LABEL: func.func @_QPselected_logical_kind_test1(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<i8> {fir.bindc_name = "input"})
+! CHECK: %[[INPUT:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} {uniq_name = "_QFselected_logical_kind_test1Einput"} : (!fir.ref<i8>, !fir.dscope) -> (!fir.ref<i8>, !fir.ref<i8>)
+! CHECK: %[[RES_ALLOCA:.*]] = fir.alloca i8 {bindc_name = "res", uniq_name = "_QFselected_logical_kind_test1Eres"}
+! CHECK: %[[RES:.*]]:2 = hlfir.declare %[[RES_ALLOCA]] {uniq_name = "_QFselected_logical_kind_test1Eres"} : (!fir.ref<i8>) -> (!fir.ref<i8>, !fir.ref<i8>)
+! CHECK: %[[KIND:.*]] = arith.constant 1 : i32
+! CHECK: %[[INPUT_ADDR:.*]] = fir.convert %1#1 : (!fir.ref<i8>) -> !fir.llvm_ptr<i8>
+! CHECK: %{{.*}} = fir.call @_FortranASelectedLogicalKind(%{{.*}}, %{{.*}}, %[[INPUT_ADDR]], %[[KIND]]) fastmath<contract> : (!fir.ref<i8>, i32, !fir.llvm_ptr<i8>, i32) -> i32
+
+subroutine selected_logical_kind_test2(input)
+ integer(2) :: input, res
+ res = selected_logical_kind(input)
+end
+
+! CHECK-LABEL: func.func @_QPselected_logical_kind_test2(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<i16> {fir.bindc_name = "input"})
+! CHECK: %[[INPUT:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} {uniq_name = "_QFselected_logical_kind_test2Einput"} : (!fir.ref<i16>, !fir.dscope) -> (!fir.ref<i16>, !fir.ref<i16>)
+! CHECK: %[[RES_ALLOCA:.*]] = fir.alloca i16 {bindc_name = "res", uniq_name = "_QFselected_logical_kind_test2Eres"}
+! CHECK: %[[RES:.*]]:2 = hlfir.declare %[[RES_ALLOCA]] {uniq_name = "_QFselected_logical_kind_test2Eres"} : (!fir.ref<i16>) -> (!fir.ref<i16>, !fir.ref<i16>)
+! CHECK: %[[KIND:.*]] = arith.constant 2 : i32
+! CHECK: %[[INPUT_ADDR:.*]] = fir.convert %[[INPUT]]#1 : (!fir.ref<i16>) -> !fir.llvm_ptr<i8>
+! CHECK: %{{.*}} = fir.call @_FortranASelectedLogicalKind(%{{.*}}, %{{.*}}, %[[INPUT_ADDR]], %[[KIND]]) fastmath<contract> : (!fir.ref<i8>, i32, !fir.llvm_ptr<i8>, i32) -> i32
+
+subroutine selected_logical_kind_test4(input)
+ integer(4) :: input, res
+ res = selected_logical_kind(input)
+end
+
+! CHECK-LABEL: func.func @_QPselected_logical_kind_test4(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "input"})
+! CHECK: %[[INPUT:.*]]:2 = hlfir.declare %arg0 dummy_scope %0 {uniq_name = "_QFselected_logical_kind_test4Einput"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[RES_ALLOCA:.*]] = fir.alloca i32 {bindc_name = "res", uniq_name = "_QFselected_logical_kind_test4Eres"}
+! CHECK: %[[RES:.*]]:2 = hlfir.declare %[[RES_ALLOCA]] {uniq_name = "_QFselected_logical_kind_test4Eres"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[KIND:.*]] = arith.constant 4 : i32
+! CHECK: %[[INPUT_ADDR:.*]] = fir.convert %[[INPUT]]#1 : (!fir.ref<i32>) -> !fir.llvm_ptr<i8>
+! CHECK: %{{.*}} = fir.call @_FortranASelectedLogicalKind(%{{.*}}, %{{.*}}, %[[INPUT_ADDR]], %[[KIND]]) fastmath<contract> : (!fir.ref<i8>, i32, !fir.llvm_ptr<i8>, i32) -> i32
+
+subroutine selected_logical_kind_test8(input)
+ integer(8) :: input, res
+ res = selected_logical_kind(input)
+end
+
+! CHECK-LABEL: func.func @_QPselected_logical_kind_test8(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<i64> {fir.bindc_name = "input"})
+! CHECK: %[[INPUT:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} {uniq_name = "_QFselected_logical_kind_test8Einput"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[RES_ALLOCA:.*]] = fir.alloca i64 {bindc_name = "res", uniq_name = "_QFselected_logical_kind_test8Eres"}
+! CHECK: %[[RES:.*]]:2 = hlfir.declare %[[RES_ALLOCA]] {uniq_name = "_QFselected_logical_kind_test8Eres"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[KIND:.*]] = arith.constant 8 : i32
+! CHECK: %[[INPUT_ADDR:.*]] = fir.convert %[[INPUT]]#1 : (!fir.ref<i64>) -> !fir.llvm_ptr<i8>
+! CHECK: %{{.*}} = fir.call @_FortranASelectedLogicalKind(%{{.*}}, %{{.*}}, %[[INPUT_ADDR]], %[[KIND]]) fastmath<contract> : (!fir.ref<i8>, i32, !fir.llvm_ptr<i8>, i32) -> i32
+
+subroutine selected_logical_kind_test16(input)
+ integer(16) :: input, res
+ res = selected_logical_kind(input)
+end
+
+! CHECK-LABEL: func.func @_QPselected_logical_kind_test16(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<i128> {fir.bindc_name = "input"})
+! CHECK: %[[INPUT:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} {uniq_name = "_QFselected_logical_kind_test16Einput"} : (!fir.ref<i128>, !fir.dscope) -> (!fir.ref<i128>, !fir.ref<i128>)
+! CHECK: %[[RES_ALLOCA:.*]] = fir.alloca i128 {bindc_name = "res", uniq_name = "_QFselected_logical_kind_test16Eres"}
+! CHECK: %[[RES:.*]]:2 = hlfir.declare %[[RES_ALLOCA]] {uniq_name = "_QFselected_logical_kind_test16Eres"} : (!fir.ref<i128>) -> (!fir.ref<i128>, !fir.ref<i128>)
+! CHECK: %[[KIND:.*]] = arith.constant 16 : i32
+! CHECK: %[[INPUT_ADDR:.*]] = fir.convert %[[INPUT]]#1 : (!fir.ref<i128>) -> !fir.llvm_ptr<i8>
+! CHECK: %{{.*}} = fir.call @_FortranASelectedLogicalKind(%{{.*}}, %{{.*}}, %[[INPUT_ADDR]], %[[KIND]]) fastmath<contract> : (!fir.ref<i8>, i32, !fir.llvm_ptr<i8>, i32) -> i32
diff --git a/flang/test/Lower/OpenMP/invalid-reduction-modifier.f90 b/flang/test/Lower/OpenMP/invalid-reduction-modifier.f90
index 5387127..b3e87df 100644
--- a/flang/test/Lower/OpenMP/invalid-reduction-modifier.f90
+++ b/flang/test/Lower/OpenMP/invalid-reduction-modifier.f90
@@ -1,6 +1,4 @@
-!Remove the --crash below once we can diagnose the issue more gracefully.
-!REQUIRES: asserts
-!RUN: not --crash %flang_fc1 -fopenmp -emit-hlfir -o - %s
+!RUN: not %flang_fc1 -fopenmp -emit-hlfir -o - %s
! Check that we reject the "task" reduction modifier on the "simd" directive.
diff --git a/flang/test/Semantics/OpenMP/allocate-clause01.f90 b/flang/test/Semantics/OpenMP/allocate-clause01.f90
index 486166e..2b9a72e9 100644
--- a/flang/test/Semantics/OpenMP/allocate-clause01.f90
+++ b/flang/test/Semantics/OpenMP/allocate-clause01.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/allocate-directive.f90 b/flang/test/Semantics/OpenMP/allocate-directive.f90
index f55b724..18a14b8 100644
--- a/flang/test/Semantics/OpenMP/allocate-directive.f90
+++ b/flang/test/Semantics/OpenMP/allocate-directive.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/allocate01.f90 b/flang/test/Semantics/OpenMP/allocate01.f90
index a3d5fb5..6ccb8bb 100644
--- a/flang/test/Semantics/OpenMP/allocate01.f90
+++ b/flang/test/Semantics/OpenMP/allocate01.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/allocate02.f90 b/flang/test/Semantics/OpenMP/allocate02.f90
index b9bfdbe..8f0579e 100644
--- a/flang/test/Semantics/OpenMP/allocate02.f90
+++ b/flang/test/Semantics/OpenMP/allocate02.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/allocate03.f90 b/flang/test/Semantics/OpenMP/allocate03.f90
index ce577c8..e35115f 100644
--- a/flang/test/Semantics/OpenMP/allocate03.f90
+++ b/flang/test/Semantics/OpenMP/allocate03.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/allocate04.f90 b/flang/test/Semantics/OpenMP/allocate04.f90
index 37f180c..ea89d94 100644
--- a/flang/test/Semantics/OpenMP/allocate04.f90
+++ b/flang/test/Semantics/OpenMP/allocate04.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/allocate05.f90 b/flang/test/Semantics/OpenMP/allocate05.f90
index c4e0ace..a787e8b 100644
--- a/flang/test/Semantics/OpenMP/allocate05.f90
+++ b/flang/test/Semantics/OpenMP/allocate05.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/allocate06.f90 b/flang/test/Semantics/OpenMP/allocate06.f90
index e25b4c4..e14134c 100644
--- a/flang/test/Semantics/OpenMP/allocate06.f90
+++ b/flang/test/Semantics/OpenMP/allocate06.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/allocate07.f90 b/flang/test/Semantics/OpenMP/allocate07.f90
index 2b0f176..396df59 100644
--- a/flang/test/Semantics/OpenMP/allocate07.f90
+++ b/flang/test/Semantics/OpenMP/allocate07.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/allocate08.f90 b/flang/test/Semantics/OpenMP/allocate08.f90
index 82aa11d..fc950ea 100644
--- a/flang/test/Semantics/OpenMP/allocate08.f90
+++ b/flang/test/Semantics/OpenMP/allocate08.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/allocate09.f90 b/flang/test/Semantics/OpenMP/allocate09.f90
index 3664c34..0f93a34 100644
--- a/flang/test/Semantics/OpenMP/allocate09.f90
+++ b/flang/test/Semantics/OpenMP/allocate09.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/allocators01.f90 b/flang/test/Semantics/OpenMP/allocators01.f90
index f10db35..c75c522 100644
--- a/flang/test/Semantics/OpenMP/allocators01.f90
+++ b/flang/test/Semantics/OpenMP/allocators01.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/allocators02.f90 b/flang/test/Semantics/OpenMP/allocators02.f90
index 7f8fa36..8055d21 100644
--- a/flang/test/Semantics/OpenMP/allocators02.f90
+++ b/flang/test/Semantics/OpenMP/allocators02.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/allocators03.f90 b/flang/test/Semantics/OpenMP/allocators03.f90
index 050cc20..03cff1b 100644
--- a/flang/test/Semantics/OpenMP/allocators03.f90
+++ b/flang/test/Semantics/OpenMP/allocators03.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/allocators04.f90 b/flang/test/Semantics/OpenMP/allocators04.f90
index 3c84030..1d2e964 100644
--- a/flang/test/Semantics/OpenMP/allocators04.f90
+++ b/flang/test/Semantics/OpenMP/allocators04.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/allocators05.f90 b/flang/test/Semantics/OpenMP/allocators05.f90
index 8fd80b0..d0e11ca 100644
--- a/flang/test/Semantics/OpenMP/allocators05.f90
+++ b/flang/test/Semantics/OpenMP/allocators05.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/allocators06.f90 b/flang/test/Semantics/OpenMP/allocators06.f90
index 881182c..a975204 100644
--- a/flang/test/Semantics/OpenMP/allocators06.f90
+++ b/flang/test/Semantics/OpenMP/allocators06.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/atomic-hint-clause.f90 b/flang/test/Semantics/OpenMP/atomic-hint-clause.f90
index 9050cbb..e157b7e 100644
--- a/flang/test/Semantics/OpenMP/atomic-hint-clause.f90
+++ b/flang/test/Semantics/OpenMP/atomic-hint-clause.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/atomic.f90 b/flang/test/Semantics/OpenMP/atomic.f90
index 2f270ce..44f06b7 100644
--- a/flang/test/Semantics/OpenMP/atomic.f90
+++ b/flang/test/Semantics/OpenMP/atomic.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
use omp_lib
! Check OpenMP 2.13.6 atomic Construct
diff --git a/flang/test/Semantics/OpenMP/atomic01.f90 b/flang/test/Semantics/OpenMP/atomic01.f90
index 6ec94f3..f0e1b47 100644
--- a/flang/test/Semantics/OpenMP/atomic01.f90
+++ b/flang/test/Semantics/OpenMP/atomic01.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/atomic02.f90 b/flang/test/Semantics/OpenMP/atomic02.f90
index 92f2c4b..b823bc4c 100644
--- a/flang/test/Semantics/OpenMP/atomic02.f90
+++ b/flang/test/Semantics/OpenMP/atomic02.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/atomic03.f90 b/flang/test/Semantics/OpenMP/atomic03.f90
index 4cce71d..7636749 100644
--- a/flang/test/Semantics/OpenMP/atomic03.f90
+++ b/flang/test/Semantics/OpenMP/atomic03.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/atomic04.f90 b/flang/test/Semantics/OpenMP/atomic04.f90
index c03b230..a9644ad 100644
--- a/flang/test/Semantics/OpenMP/atomic04.f90
+++ b/flang/test/Semantics/OpenMP/atomic04.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/atomic05.f90 b/flang/test/Semantics/OpenMP/atomic05.f90
index cfba339..2d95664 100644
--- a/flang/test/Semantics/OpenMP/atomic05.f90
+++ b/flang/test/Semantics/OpenMP/atomic05.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/barrier.f90 b/flang/test/Semantics/OpenMP/barrier.f90
index 5fc3f7f..1483fbd 100644
--- a/flang/test/Semantics/OpenMP/barrier.f90
+++ b/flang/test/Semantics/OpenMP/barrier.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
!$omp barrier
diff --git a/flang/test/Semantics/OpenMP/clause-validity01.f90 b/flang/test/Semantics/OpenMP/clause-validity01.f90
index 779be00..22ac570 100644
--- a/flang/test/Semantics/OpenMP/clause-validity01.f90
+++ b/flang/test/Semantics/OpenMP/clause-validity01.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags %openmp_module_flag
diff --git a/flang/test/Semantics/OpenMP/combined-constructs.f90 b/flang/test/Semantics/OpenMP/combined-constructs.f90
index ba504d1..35ab6fc 100644
--- a/flang/test/Semantics/OpenMP/combined-constructs.f90
+++ b/flang/test/Semantics/OpenMP/combined-constructs.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
program main
diff --git a/flang/test/Semantics/OpenMP/common-block.f90 b/flang/test/Semantics/OpenMP/common-block.f90
index 4ddc547..e1ddd12 100644
--- a/flang/test/Semantics/OpenMP/common-block.f90
+++ b/flang/test/Semantics/OpenMP/common-block.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %flang_fc1 -fopenmp -fdebug-dump-symbols %s | FileCheck %s
program main
diff --git a/flang/test/Semantics/OpenMP/compiler-directive.f90 b/flang/test/Semantics/OpenMP/compiler-directive.f90
index 07363ac..5d3e9ba 100644
--- a/flang/test/Semantics/OpenMP/compiler-directive.f90
+++ b/flang/test/Semantics/OpenMP/compiler-directive.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! CompilerDirective with openmp tests
diff --git a/flang/test/Semantics/OpenMP/copyin01.f90 b/flang/test/Semantics/OpenMP/copyin01.f90
index 387a9fc..0051b5d 100644
--- a/flang/test/Semantics/OpenMP/copyin01.f90
+++ b/flang/test/Semantics/OpenMP/copyin01.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! OpenMP Version 4.5
! 2.15.4.1 copyin Clause
diff --git a/flang/test/Semantics/OpenMP/copyin02.f90 b/flang/test/Semantics/OpenMP/copyin02.f90
index 9251289..09b8766 100644
--- a/flang/test/Semantics/OpenMP/copyin02.f90
+++ b/flang/test/Semantics/OpenMP/copyin02.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! OpenMP Version 4.5
! 2.15.4.1 copyin Clause
diff --git a/flang/test/Semantics/OpenMP/copyin03.f90 b/flang/test/Semantics/OpenMP/copyin03.f90
index 5c0a2e8..7c3759a 100644
--- a/flang/test/Semantics/OpenMP/copyin03.f90
+++ b/flang/test/Semantics/OpenMP/copyin03.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! OpenMP Version 4.5
! 2.15.4.1 copyin Clause
diff --git a/flang/test/Semantics/OpenMP/copyin04.f90 b/flang/test/Semantics/OpenMP/copyin04.f90
index 7cbee5f..6f5e8df 100644
--- a/flang/test/Semantics/OpenMP/copyin04.f90
+++ b/flang/test/Semantics/OpenMP/copyin04.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! OpenMP Version 4.5
! 2.15.4.1 copyin Clause
diff --git a/flang/test/Semantics/OpenMP/copyin05.f90 b/flang/test/Semantics/OpenMP/copyin05.f90
index aec6a7f..142d5a7 100644
--- a/flang/test/Semantics/OpenMP/copyin05.f90
+++ b/flang/test/Semantics/OpenMP/copyin05.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! OpenMP Version 4.5
! 2.15.4.1 copyin Clause
diff --git a/flang/test/Semantics/OpenMP/copying.f90 b/flang/test/Semantics/OpenMP/copying.f90
index d56d2b8..63fb39a 100644
--- a/flang/test/Semantics/OpenMP/copying.f90
+++ b/flang/test/Semantics/OpenMP/copying.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp -Werror -pedantic
! OpenMP Version 5.0
! 2.19.4.4 firstprivate Clause
diff --git a/flang/test/Semantics/OpenMP/copyprivate01.f90 b/flang/test/Semantics/OpenMP/copyprivate01.f90
index 4920d7a..d5cf273 100644
--- a/flang/test/Semantics/OpenMP/copyprivate01.f90
+++ b/flang/test/Semantics/OpenMP/copyprivate01.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp
! OpenMP Version 4.5
! 2.15.4.2 copyprivate Clause
diff --git a/flang/test/Semantics/OpenMP/copyprivate02.f90 b/flang/test/Semantics/OpenMP/copyprivate02.f90
index 2157cd4c..35fd6dd 100644
--- a/flang/test/Semantics/OpenMP/copyprivate02.f90
+++ b/flang/test/Semantics/OpenMP/copyprivate02.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp
! OpenMP Version 4.5
! 2.15.4.2 copyprivate Clause
diff --git a/flang/test/Semantics/OpenMP/copyprivate03.f90 b/flang/test/Semantics/OpenMP/copyprivate03.f90
index f1433ce..9d39fdb 100644
--- a/flang/test/Semantics/OpenMP/copyprivate03.f90
+++ b/flang/test/Semantics/OpenMP/copyprivate03.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp
! OpenMP Version 4.5
! 2.15.4.2 copyprivate Clause
diff --git a/flang/test/Semantics/OpenMP/critical-empty.f90 b/flang/test/Semantics/OpenMP/critical-empty.f90
index 706f6d8..2001c8a 100644
--- a/flang/test/Semantics/OpenMP/critical-empty.f90
+++ b/flang/test/Semantics/OpenMP/critical-empty.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp
! Test that there are no errors for an empty critical construct
diff --git a/flang/test/Semantics/OpenMP/critical-hint-clause.f90 b/flang/test/Semantics/OpenMP/critical-hint-clause.f90
index d737d67..419187f 100644
--- a/flang/test/Semantics/OpenMP/critical-hint-clause.f90
+++ b/flang/test/Semantics/OpenMP/critical-hint-clause.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/do02.f90 b/flang/test/Semantics/OpenMP/do02.f90
new file mode 100644
index 0000000..d9f5c99
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/do02.f90
@@ -0,0 +1,21 @@
+! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
+! XFAIL: *
+
+! OpenMP Version 4.5
+! 2.7.1 Loop Construct
+! Exit statement terminating !$OMP DO loop
+
+program omp_do
+ integer i, j, k
+
+ !$omp do
+ do i = 1, 10
+ do j = 1, 10
+ print *, "Hello"
+ end do
+ !ERROR: EXIT statement terminating !$OMP DO loop
+ exit
+ end do
+ !$omp end do
+
+end program omp_do
diff --git a/flang/test/Semantics/OpenMP/reduction-modifiers.f90 b/flang/test/Semantics/OpenMP/reduction-modifiers.f90
new file mode 100644
index 0000000..cf38200
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/reduction-modifiers.f90
@@ -0,0 +1,89 @@
+! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp -fopenmp-version=52
+
+subroutine mod_task1(x)
+ integer, intent(inout) :: x
+
+ !Correct: "parallel" directive.
+ !$omp parallel reduction(task, +:x)
+ do i = 1, 100
+ x = foo(i)
+ enddo
+ !$omp end parallel
+end
+
+subroutine mod_task2(x)
+ integer, intent(inout) :: x
+
+ !Correct: worksharing directive.
+ !$omp sections reduction(task, +:x)
+ do i = 1, 100
+ x = foo(i)
+ enddo
+ !$omp end sections
+end
+
+subroutine mod_task3(x)
+ integer, intent(inout) :: x
+
+ !ERROR: Modifier 'TASK' on REDUCTION clause is only allowed with PARALLEL or worksharing directive
+ !$omp simd reduction(task, +:x)
+ do i = 1, 100
+ x = foo(i)
+ enddo
+ !$omp end simd
+end
+
+subroutine mod_inscan1(x)
+ integer, intent(inout) :: x
+
+ !Correct: worksharing-loop directive
+ !$omp do reduction(inscan, +:x)
+ do i = 1, 100
+ x = foo(i)
+ enddo
+ !$omp end do
+end
+
+subroutine mod_inscan2(x)
+ integer, intent(inout) :: x
+
+ !Correct: worksharing-loop simd directive
+ !$omp do simd reduction(inscan, +:x)
+ do i = 1, 100
+ x = foo(i)
+ enddo
+ !$omp end do simd
+end
+
+subroutine mod_inscan3(x)
+ integer, intent(inout) :: x
+
+ !Correct: "simd" directive
+ !$omp simd reduction(inscan, +:x)
+ do i = 1, 100
+ x = foo(i)
+ enddo
+ !$omp end simd
+end
+
+subroutine mod_inscan4(x)
+ integer, intent(inout) :: x
+
+ !ERROR: Modifier 'INSCAN' on REDUCTION clause is only allowed with worksharing-loop, worksharing-loop simd, or SIMD directive
+ !$omp parallel reduction(inscan, +:x)
+ do i = 1, 100
+ x = foo(i)
+ enddo
+ !$omp end parallel
+end
+
+subroutine mod_inscan5(x)
+ integer, intent(inout) :: x
+
+ !ERROR: Modifier 'INSCAN' on REDUCTION clause is only allowed with worksharing-loop, worksharing-loop simd, or SIMD directive
+ !$omp sections reduction(inscan, +:x)
+ do i = 1, 100
+ x = foo(i)
+ enddo
+ !$omp end sections
+end
diff --git a/flang/test/Semantics/OpenMP/sections01.f90 b/flang/test/Semantics/OpenMP/sections01.f90
index 00b5a6d..c26cc88 100644
--- a/flang/test/Semantics/OpenMP/sections01.f90
+++ b/flang/test/Semantics/OpenMP/sections01.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! OpenMP Version 4.5
diff --git a/flang/test/Semantics/OpenMP/sections02.f90 b/flang/test/Semantics/OpenMP/sections02.f90
index 912e7bc..ee29922 100644
--- a/flang/test/Semantics/OpenMP/sections02.f90
+++ b/flang/test/Semantics/OpenMP/sections02.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/sections03.f90 b/flang/test/Semantics/OpenMP/sections03.f90
deleted file mode 100644
index b170f86..0000000
--- a/flang/test/Semantics/OpenMP/sections03.f90
+++ /dev/null
@@ -1,29 +0,0 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
-! RUN: %python %S/../test_errors.py %s %flang -fopenmp
-!XFAIL: *
-! OpenMP version 5.0.0
-! 2.8.1 sections construct
-! Orphaned section directives are prohibited. That is, the section directives must appear within the sections construct and must not be encountered elsewhere in the sections region
-!TODO: Error in parsing. Make parser errors more informative. Until then, the test is XFAIL
-
-program OmpOrphanedSections
- use omp_lib
- integer counter
- counter = 0
- !CHECK: expected 'END'
- !CHECK: END PROGRAM statement
- !CHECK: in the context: main program
- !CHECK: expected 'END PROGRAM'
- !CHECK: in the context: END PROGRAM statement
- !CHECK: in the context: main program
- !$omp section
- print *, "An orphaned section containing a single statement"
- !$omp section
- counter = counter + 1
- print *, "An orphaned section containing multiple statements"
-!$omp sections
- !$omp section
- print *, "Not an orphan structured block"
-!$omp end sections
-end program OmpOrphanedSections
diff --git a/flang/test/Semantics/OpenMP/simd-aligned.f90 b/flang/test/Semantics/OpenMP/simd-aligned.f90
index 3ffdc68..0a9f958 100644
--- a/flang/test/Semantics/OpenMP/simd-aligned.f90
+++ b/flang/test/Semantics/OpenMP/simd-aligned.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! OpenMP Version 4.5
diff --git a/flang/test/Semantics/OpenMP/simd-nontemporal.f90 b/flang/test/Semantics/OpenMP/simd-nontemporal.f90
index 074b0a2..a488edd 100644
--- a/flang/test/Semantics/OpenMP/simd-nontemporal.f90
+++ b/flang/test/Semantics/OpenMP/simd-nontemporal.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! OpenMP Version 4.5
diff --git a/flang/test/Semantics/OpenMP/simd01.f90 b/flang/test/Semantics/OpenMP/simd01.f90
index 1e24164..1aa2880 100644
--- a/flang/test/Semantics/OpenMP/simd01.f90
+++ b/flang/test/Semantics/OpenMP/simd01.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! OpenMP Version 5.0
! 2.9.3.1 simd Construct
diff --git a/flang/test/Semantics/OpenMP/simd02.f90 b/flang/test/Semantics/OpenMP/simd02.f90
index 24d6abd..a627e2a 100644
--- a/flang/test/Semantics/OpenMP/simd02.f90
+++ b/flang/test/Semantics/OpenMP/simd02.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! OpenMP Version 4.5
diff --git a/flang/test/Semantics/OpenMP/simd03.f90 b/flang/test/Semantics/OpenMP/simd03.f90
index 8df4836..8c90eba 100644
--- a/flang/test/Semantics/OpenMP/simd03.f90
+++ b/flang/test/Semantics/OpenMP/simd03.f90
@@ -1,6 +1,4 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
-! RUN: %S/test_errors.sh %s %t %flang -fopenmp
+! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
! XFAIL: *
! OpenMP Version 4.5
diff --git a/flang/test/Semantics/OpenMP/single01.f90 b/flang/test/Semantics/OpenMP/single01.f90
index 0468e69..2e40bec 100644
--- a/flang/test/Semantics/OpenMP/single01.f90
+++ b/flang/test/Semantics/OpenMP/single01.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp
! OpenMP Version 4.5
! 2.7.3 single Construct
diff --git a/flang/test/Semantics/OpenMP/single02.f90 b/flang/test/Semantics/OpenMP/single02.f90
index 9d9d306..03cf7fb 100644
--- a/flang/test/Semantics/OpenMP/single02.f90
+++ b/flang/test/Semantics/OpenMP/single02.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp
! OpenMP Version 4.5
! 2.7.3 single Construct
diff --git a/flang/test/Semantics/OpenMP/struct.f90 b/flang/test/Semantics/OpenMP/struct.f90
index 3d2000a..8ae1fbe 100644
--- a/flang/test/Semantics/OpenMP/struct.f90
+++ b/flang/test/Semantics/OpenMP/struct.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp
! Check OpenMP compatibility with the DEC STRUCTURE extension
diff --git a/flang/test/Semantics/OpenMP/symbol01.f90 b/flang/test/Semantics/OpenMP/symbol01.f90
index e2a9c01..0b435a9 100644
--- a/flang/test/Semantics/OpenMP/symbol01.f90
+++ b/flang/test/Semantics/OpenMP/symbol01.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_symbols.py %s %flang_fc1 -fopenmp
! Test clauses that accept list.
diff --git a/flang/test/Semantics/OpenMP/symbol02.f90 b/flang/test/Semantics/OpenMP/symbol02.f90
index 1b1dc448..f6ffc55 100644
--- a/flang/test/Semantics/OpenMP/symbol02.f90
+++ b/flang/test/Semantics/OpenMP/symbol02.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_symbols.py %s %flang_fc1 -fopenmp
! 1.4.1 Structure of the OpenMP Memory Model
diff --git a/flang/test/Semantics/OpenMP/symbol03.f90 b/flang/test/Semantics/OpenMP/symbol03.f90
index 76d9357..93e9b7a 100644
--- a/flang/test/Semantics/OpenMP/symbol03.f90
+++ b/flang/test/Semantics/OpenMP/symbol03.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_symbols.py %s %flang_fc1 -fopenmp
! 1.4.1 Structure of the OpenMP Memory Model
diff --git a/flang/test/Semantics/OpenMP/symbol04.f90 b/flang/test/Semantics/OpenMP/symbol04.f90
index 8ef154e..808d1e0 100644
--- a/flang/test/Semantics/OpenMP/symbol04.f90
+++ b/flang/test/Semantics/OpenMP/symbol04.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_symbols.py %s %flang_fc1 -fopenmp
! 2.15.3 Data-Sharing Attribute Clauses
diff --git a/flang/test/Semantics/OpenMP/symbol05.f90 b/flang/test/Semantics/OpenMP/symbol05.f90
index d08d852..fa0a8f6 100644
--- a/flang/test/Semantics/OpenMP/symbol05.f90
+++ b/flang/test/Semantics/OpenMP/symbol05.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_symbols.py %s %flang_fc1 -fopenmp
! 2.15.2 threadprivate Directive
diff --git a/flang/test/Semantics/OpenMP/symbol06.f90 b/flang/test/Semantics/OpenMP/symbol06.f90
index a2cd288..906264e 100644
--- a/flang/test/Semantics/OpenMP/symbol06.f90
+++ b/flang/test/Semantics/OpenMP/symbol06.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_symbols.py %s %flang_fc1 -fopenmp
! 2.15.3 Data-Sharing Attribute Clauses
diff --git a/flang/test/Semantics/OpenMP/symbol07.f90 b/flang/test/Semantics/OpenMP/symbol07.f90
index ee6cd2a..e2250f5 100644
--- a/flang/test/Semantics/OpenMP/symbol07.f90
+++ b/flang/test/Semantics/OpenMP/symbol07.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_symbols.py %s %flang_fc1 -fopenmp
! Generic tests
diff --git a/flang/test/Semantics/OpenMP/symbol08.f90 b/flang/test/Semantics/OpenMP/symbol08.f90
index 76db86c..3af85af 100644
--- a/flang/test/Semantics/OpenMP/symbol08.f90
+++ b/flang/test/Semantics/OpenMP/symbol08.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_symbols.py %s %flang_fc1 -fopenmp
! 2.15.1.1 Predetermined rules for associated do-loops index variable
diff --git a/flang/test/Semantics/OpenMP/symbol09.f90 b/flang/test/Semantics/OpenMP/symbol09.f90
index ee6cd2a..e2250f5 100644
--- a/flang/test/Semantics/OpenMP/symbol09.f90
+++ b/flang/test/Semantics/OpenMP/symbol09.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_symbols.py %s %flang_fc1 -fopenmp
! Generic tests
diff --git a/flang/test/Semantics/OpenMP/sync-critical01.f90 b/flang/test/Semantics/OpenMP/sync-critical01.f90
index ef377eb..b597eb1 100644
--- a/flang/test/Semantics/OpenMP/sync-critical01.f90
+++ b/flang/test/Semantics/OpenMP/sync-critical01.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! OpenMP Version 5.0
diff --git a/flang/test/Semantics/OpenMP/sync-critical02.f90 b/flang/test/Semantics/OpenMP/sync-critical02.f90
index 681aa79..1fa9d6a 100644
--- a/flang/test/Semantics/OpenMP/sync-critical02.f90
+++ b/flang/test/Semantics/OpenMP/sync-critical02.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/taskloop01.f90 b/flang/test/Semantics/OpenMP/taskloop01.f90
index 2c53759..6bef584 100644
--- a/flang/test/Semantics/OpenMP/taskloop01.f90
+++ b/flang/test/Semantics/OpenMP/taskloop01.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! OpenMP Version 4.5
! 2.9.2 taskloop Construct
diff --git a/flang/test/Semantics/OpenMP/taskloop02.f90 b/flang/test/Semantics/OpenMP/taskloop02.f90
index 275b079..867ef8a9 100644
--- a/flang/test/Semantics/OpenMP/taskloop02.f90
+++ b/flang/test/Semantics/OpenMP/taskloop02.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: not %flang -fsyntax-only -fopenmp %s 2>&1 | FileCheck %s
! OpenMP Version 4.5
! 2.9.2 taskloop Construct
diff --git a/flang/test/Semantics/OpenMP/taskloop03.f90 b/flang/test/Semantics/OpenMP/taskloop03.f90
new file mode 100644
index 0000000..3fe6a59
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/taskloop03.f90
@@ -0,0 +1,25 @@
+! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
+! XFAIL: *
+
+! OpenMP Version 4.5
+! 2.9.2 taskloop Construct
+! All loops associated with the taskloop construct must be perfectly nested;
+! there must be no intervening code or any OpenMP directive between
+! any two loops.
+
+program omp_taskloop
+ integer i, j
+
+ !$omp taskloop private(j) grainsize(500) nogroup
+ do i=1, 10000
+ do j=1, i
+ call loop_body(i, j)
+ end do
+ !ERROR: Loops associated with !$omp taskloop is not perfectly nested
+ !$omp single
+ print *, "omp single"
+ !$omp end single
+ end do
+ !$omp end taskloop
+
+end program omp_taskloop
diff --git a/flang/test/Semantics/OpenMP/taskwait.f90 b/flang/test/Semantics/OpenMP/taskwait.f90
index a3b15c7..e60051c 100644
--- a/flang/test/Semantics/OpenMP/taskwait.f90
+++ b/flang/test/Semantics/OpenMP/taskwait.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
!$omp taskwait
diff --git a/flang/test/Semantics/OpenMP/threadprivate01.f90 b/flang/test/Semantics/OpenMP/threadprivate01.f90
index 6597941..c2cf9ba 100644
--- a/flang/test/Semantics/OpenMP/threadprivate01.f90
+++ b/flang/test/Semantics/OpenMP/threadprivate01.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/threadprivate02.f90 b/flang/test/Semantics/OpenMP/threadprivate02.f90
index 862d1e8..7f6e8dc 100644
--- a/flang/test/Semantics/OpenMP/threadprivate02.f90
+++ b/flang/test/Semantics/OpenMP/threadprivate02.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp
! OpenMP Version 5.1
! Check OpenMP construct validity for the following directives:
diff --git a/flang/test/Semantics/OpenMP/threadprivate03.f90 b/flang/test/Semantics/OpenMP/threadprivate03.f90
index 57d3b92..b466a8e 100644
--- a/flang/test/Semantics/OpenMP/threadprivate03.f90
+++ b/flang/test/Semantics/OpenMP/threadprivate03.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp -pedantic
! OpenMP Version 5.1
! Check OpenMP construct validity for the following directives:
diff --git a/flang/test/Semantics/OpenMP/threadprivate04.f90 b/flang/test/Semantics/OpenMP/threadprivate04.f90
index 8199dba..3d8c7fb 100644
--- a/flang/test/Semantics/OpenMP/threadprivate04.f90
+++ b/flang/test/Semantics/OpenMP/threadprivate04.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp
! OpenMP Version 5.1
! Check OpenMP construct validity for the following directives:
diff --git a/flang/test/Semantics/OpenMP/threadprivate05.f90 b/flang/test/Semantics/OpenMP/threadprivate05.f90
index eecf9e7..cdbf370 100644
--- a/flang/test/Semantics/OpenMP/threadprivate05.f90
+++ b/flang/test/Semantics/OpenMP/threadprivate05.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp
! OpenMP Version 5.1
! Check OpenMP construct validity for the following directives:
diff --git a/flang/test/Semantics/OpenMP/threadprivate06.f90 b/flang/test/Semantics/OpenMP/threadprivate06.f90
index 5537a88..f31c38f 100644
--- a/flang/test/Semantics/OpenMP/threadprivate06.f90
+++ b/flang/test/Semantics/OpenMP/threadprivate06.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp
! OpenMP Version 5.1
! Check OpenMP construct validity for the following directives:
diff --git a/flang/test/Semantics/OpenMP/threadprivate07.f90 b/flang/test/Semantics/OpenMP/threadprivate07.f90
index 5302fdf..c9a006c 100644
--- a/flang/test/Semantics/OpenMP/threadprivate07.f90
+++ b/flang/test/Semantics/OpenMP/threadprivate07.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp
! Check Threadprivate Directive with local variable of a BLOCK construct.
diff --git a/flang/test/Semantics/OpenMP/use_device_addr.f90 b/flang/test/Semantics/OpenMP/use_device_addr.f90
index dda00d5..93a7643 100644
--- a/flang/test/Semantics/OpenMP/use_device_addr.f90
+++ b/flang/test/Semantics/OpenMP/use_device_addr.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %flang_fc1 -fopenmp -fdebug-dump-symbols %s | FileCheck %s
! OpenMP Version 5.1
! 2.14.2 use_device_addr clause
diff --git a/flang/test/Semantics/OpenMP/use_device_addr1.f90 b/flang/test/Semantics/OpenMP/use_device_addr1.f90
index c37e9a3..867e324 100644
--- a/flang/test/Semantics/OpenMP/use_device_addr1.f90
+++ b/flang/test/Semantics/OpenMP/use_device_addr1.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp
! OpenMP Version 5.0
! 2.10.1 use_device_ptr clause
diff --git a/flang/test/Semantics/OpenMP/use_device_ptr.f90 b/flang/test/Semantics/OpenMP/use_device_ptr.f90
index e9e7fbb..64b98cf 100644
--- a/flang/test/Semantics/OpenMP/use_device_ptr.f90
+++ b/flang/test/Semantics/OpenMP/use_device_ptr.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %flang_fc1 -fopenmp -fdebug-dump-symbols %s | FileCheck %s
! OpenMP Version 5.0
! 2.10.1 use_device_ptr clause
diff --git a/flang/test/Semantics/OpenMP/use_device_ptr1.f90 b/flang/test/Semantics/OpenMP/use_device_ptr1.f90
index f705c50..176fb5f 100644
--- a/flang/test/Semantics/OpenMP/use_device_ptr1.f90
+++ b/flang/test/Semantics/OpenMP/use_device_ptr1.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp
! OpenMP Version 5.0
! 2.10.1 use_device_ptr clause
diff --git a/flang/test/Semantics/OpenMP/workshare01.f90 b/flang/test/Semantics/OpenMP/workshare01.f90
index 615c340..9667a30 100644
--- a/flang/test/Semantics/OpenMP/workshare01.f90
+++ b/flang/test/Semantics/OpenMP/workshare01.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! OpenMP Version 4.5
! 2.7.4 workshare Construct
diff --git a/flang/test/Semantics/OpenMP/workshare02.f90 b/flang/test/Semantics/OpenMP/workshare02.f90
index b6faf19..e099ecb 100644
--- a/flang/test/Semantics/OpenMP/workshare02.f90
+++ b/flang/test/Semantics/OpenMP/workshare02.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! OpenMP Version 4.5
! 2.7.4 workshare Construct
diff --git a/flang/test/Semantics/OpenMP/workshare03.f90 b/flang/test/Semantics/OpenMP/workshare03.f90
index 2aea0cc..09d46ab 100644
--- a/flang/test/Semantics/OpenMP/workshare03.f90
+++ b/flang/test/Semantics/OpenMP/workshare03.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! OpenMP Version 4.5
! 2.7.4 workshare Construct
diff --git a/flang/test/Semantics/OpenMP/workshare04.f90 b/flang/test/Semantics/OpenMP/workshare04.f90
index e844599..0ec635e 100644
--- a/flang/test/Semantics/OpenMP/workshare04.f90
+++ b/flang/test/Semantics/OpenMP/workshare04.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! OpenMP Version 4.5
! 2.7.4 workshare Construct
diff --git a/flang/test/Semantics/OpenMP/workshare05.f90 b/flang/test/Semantics/OpenMP/workshare05.f90
index 30f3b98..b57053e 100644
--- a/flang/test/Semantics/OpenMP/workshare05.f90
+++ b/flang/test/Semantics/OpenMP/workshare05.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! OpenMP Version 4.5
! 2.7.4 workshare Construct
diff --git a/flang/test/Semantics/bind-c12.f90 b/flang/test/Semantics/bind-c12.f90
index 55af8a9..01a8d0c 100644
--- a/flang/test/Semantics/bind-c12.f90
+++ b/flang/test/Semantics/bind-c12.f90
@@ -26,8 +26,8 @@ end
subroutine subr5(p) bind(c)
interface
+ !WARNING: A dummy procedure of an interoperable procedure should be BIND(C)
subroutine p(c)
- !ERROR: An assumed-length dummy argument must not appear in a non-BIND(C) entry in a subprogram with an entry that must be interoperable
character(*), intent(in) :: c
end
end interface
@@ -52,8 +52,8 @@ end
subroutine subr8(p) bind(c)
interface
+ !WARNING: A dummy procedure of an interoperable procedure should be BIND(C)
subroutine p(n)
- !ERROR: A VALUE dummy argument must not appear in a non-BIND(C) entry of a subprogram with an entry that must be interoperable
integer, intent(in), value :: n
end
end interface
diff --git a/flang/test/Semantics/call05.f90 b/flang/test/Semantics/call05.f90
index 66d0a37..71f2197 100644
--- a/flang/test/Semantics/call05.f90
+++ b/flang/test/Semantics/call05.f90
@@ -1,4 +1,4 @@
-! RUN: %python %S/test_errors.py %s %flang_fc1
+! RUN: %python %S/test_errors.py %s %flang_fc1 -pedantic
! Test 15.5.2.5 constraints and restrictions for POINTER & ALLOCATABLE
! arguments when both sides of the call have the same attributes.
@@ -73,9 +73,9 @@ module m
call sma(ma) ! ok
call spp(pp) ! ok
call spa(pa) ! ok
- !ERROR: If a POINTER or ALLOCATABLE dummy or actual argument is polymorphic, both must be so
+ !PORTABILITY: If a POINTER or ALLOCATABLE actual argument is polymorphic, the corresponding dummy argument should also be so
call smp(pp)
- !ERROR: If a POINTER or ALLOCATABLE dummy or actual argument is polymorphic, both must be so
+ !PORTABILITY: If a POINTER or ALLOCATABLE actual argument is polymorphic, the corresponding dummy argument should also be so
call sma(pa)
!ERROR: If a POINTER or ALLOCATABLE dummy or actual argument is polymorphic, both must be so
call spp(mp)
diff --git a/flang/test/Semantics/call39.f90 b/flang/test/Semantics/call39.f90
index 41eeba1..724c9f9 100644
--- a/flang/test/Semantics/call39.f90
+++ b/flang/test/Semantics/call39.f90
@@ -1,4 +1,4 @@
-! RUN: %python %S/test_errors.py %s %flang_fc1 -pedantic -Werror
+! RUN: %python %S/test_errors.py %s %flang_fc1
! Tests actual/dummy pointer argument shape mismatches
module m
contains
@@ -11,6 +11,15 @@ module m
subroutine sa(p)
real, pointer, intent(in) :: p(..)
end
+ subroutine sao(p)
+ real, intent(in), optional, pointer :: p(..)
+ end
+ subroutine so(x)
+ real, intent(in), optional :: x(..)
+ end
+ subroutine soa(a)
+ real, intent(in), optional, allocatable :: a(..)
+ end
subroutine test
real, pointer :: a0, a1(:)
call s0(null(a0)) ! ok
@@ -23,9 +32,15 @@ module m
call s1(null(a1)) ! ok
call sa(null(a0)) ! ok
call sa(null(a1)) ! ok
- !ERROR: NULL() without MOLD= must not be associated with an assumed-rank dummy argument
- call sa(null())
- !ERROR: NULL() without MOLD= must not be associated with an assumed-rank dummy argument
+ !ERROR: NULL() without MOLD= must not be associated with an assumed-rank dummy argument that is ALLOCATABLE, POINTER, or non-OPTIONAL
call sa(null())
+ call sao ! ok
+ !ERROR: NULL() without MOLD= must not be associated with an assumed-rank dummy argument that is ALLOCATABLE, POINTER, or non-OPTIONAL
+ call sao(null())
+ call so ! ok
+ call so(null()) ! ok
+ call soa ! ok
+ !ERROR: NULL() without MOLD= must not be associated with an assumed-rank dummy argument that is ALLOCATABLE, POINTER, or non-OPTIONAL
+ call soa(null())
end
end
diff --git a/flang/test/Semantics/modfile03.f90 b/flang/test/Semantics/modfile03.f90
index db0caea..eb3136f0a 100644
--- a/flang/test/Semantics/modfile03.f90
+++ b/flang/test/Semantics/modfile03.f90
@@ -135,10 +135,8 @@ module m6d
end
!Expect: m6d.mod
!module m6d
-! use m6a,only:t1
! use m6a,only:t2=>t1
-! private::t1
-! type(t2),parameter::p=t1()
+! type(t2),parameter::p=t2()
!end
module m6e
@@ -178,3 +176,98 @@ end
! use m7a,only:x
! private::x
!end
+
+module m8a
+ private foo
+ type t
+ contains
+ procedure, nopass :: foo
+ end type
+ contains
+ pure integer function foo(n)
+ integer, intent(in) :: n
+ foo = n
+ end
+end
+!Expect: m8a.mod
+!module m8a
+!type::t
+!contains
+!procedure,nopass::foo
+!end type
+!private::foo
+!contains
+!pure function foo(n)
+!integer(4),intent(in)::n
+!integer(4)::foo
+!end
+!end
+
+module m8b
+ use m8a
+ contains
+ subroutine foo(x,a)
+ type(t), intent(in) :: x
+ real a(x%foo(10))
+ end
+end
+!Expect: m8b.mod
+!module m8b
+!use m8a,only:m8a$foo=>foo
+!use m8a,only:t
+!private::m8a$foo
+!contains
+!subroutine foo(x,a)
+!type(t),intent(in)::x
+!real(4)::a(1_8:int(m8a$foo(10_4),kind=8))
+!end
+!end
+
+module m9a
+ private
+ public t
+ type t
+ integer n
+ contains
+ procedure f
+ end type
+ contains
+ pure integer function f(x, k)
+ class(t), intent(in) :: x
+ integer, intent(in) :: k
+ f = x%n + k
+ end
+end
+!Expect: m9a.mod
+!module m9a
+!type::t
+!integer(4)::n
+!contains
+!procedure::f
+!end type
+!private::f
+!contains
+!pure function f(x,k)
+!class(t),intent(in)::x
+!integer(4),intent(in)::k
+!integer(4)::f
+!end
+!end
+
+module m9b
+ use m9a
+ contains
+ subroutine s(x, y)
+ class(t), intent(in) :: x
+ real y(x%f(x%n))
+ end
+end
+!Expect: m9b.mod
+!module m9b
+!use m9a,only:t
+!contains
+!subroutine s(x,y)
+!class(t),intent(in)::x
+!real(4)::y(1_8:int(x%f(x%n),kind=8))
+!end
+!end
diff --git a/flang/test/Semantics/procinterface05.f90 b/flang/test/Semantics/procinterface05.f90
new file mode 100644
index 0000000..8c3afbf
--- /dev/null
+++ b/flang/test/Semantics/procinterface05.f90
@@ -0,0 +1,14 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1
+interface a1
+ subroutine s1
+ interface a2
+ subroutine s2
+ !ERROR: Invalid specification expression: reference to local entity 'k'
+ real x(k)
+ end subroutine
+ end interface
+ !ERROR: Invalid specification expression: reference to local entity 'k'
+ real y(k)
+ end subroutine
+end interface
+end
diff --git a/flang/test/Semantics/shape.f90 b/flang/test/Semantics/shape.f90
index f43b81f..21e2930 100644
--- a/flang/test/Semantics/shape.f90
+++ b/flang/test/Semantics/shape.f90
@@ -2,10 +2,12 @@
! Test comparisons that use the intrinsic SHAPE() as an operand
program testShape
contains
- subroutine sub1(arrayDummy)
- integer :: arrayDummy(:)
+ subroutine sub1(arrayDummy, assumedRank)
+ integer :: arrayDummy(:), assumedRank(..)
integer, allocatable :: arrayDeferred(:)
integer :: arrayLocal(2) = [88, 99]
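+      ! Compile-time check: SHAPE of an assumed-rank dummy is a rank-1 array,
+      ! so aRrs must be 1; otherwise the kind expression below is invalid.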
+ integer, parameter :: aRrs = rank(shape(assumedRank))
+ integer(kind=merge(kind(1),-1,aRrs == 1)) :: test_aRrs
!ERROR: Dimension 1 of left operand has extent 1, but right operand has extent 0
!ERROR: Dimension 1 of left operand has extent 1, but right operand has extent 0
if (all(shape(arrayDummy)==shape(8))) then
@@ -45,5 +47,9 @@ contains
if (all(64==shape(arrayLocal))) then
print *, "hello"
end if
+ ! These can't be checked at compilation time
+ if (any(shape(assumedRank) == [1])) stop
+ if (any(lbound(assumedRank) == [1,2])) stop
+ if (any(ubound(assumedRank) == [1,2,3])) stop
end subroutine sub1
end program testShape
diff --git a/flang/test/Transforms/debug-complex-1.fir b/flang/test/Transforms/debug-complex-1.fir
new file mode 100644
index 0000000..a3cbd76
--- /dev/null
+++ b/flang/test/Transforms/debug-complex-1.fir
@@ -0,0 +1,39 @@
+// RUN: fir-opt --add-debug-info --mlir-print-debuginfo %s | FileCheck %s
+
+// Check conversion of complex types of different sizes. Both fir and mlir
+// variants are checked.
+
+module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.target_triple = "native"} {
+ func.func @test1(%x : !fir.complex<4>) -> !fir.complex<8> {
+ %1 = fir.convert %x : (!fir.complex<4>) -> !fir.complex<8>
+ return %1 : !fir.complex<8>
+ }loc(#loc1)
+ func.func @test2(%x : !fir.complex<4>) -> complex<f64> {
+ %1 = fir.convert %x : (!fir.complex<4>) -> complex<f64>
+ return %1 : complex<f64>
+ }loc(#loc2)
+ func.func @test3(%x : !fir.complex<4>) -> !fir.complex<16> {
+ %1 = fir.convert %x : (!fir.complex<4>) -> !fir.complex<16>
+ return %1 : !fir.complex<16>
+ }loc(#loc3)
+ func.func @test4(%x : !fir.complex<4>) -> complex<f128> {
+ %1 = fir.convert %x : (!fir.complex<4>) -> complex<f128>
+ return %1 : complex<f128>
+ }loc(#loc4)
+}
+#loc1 = loc("./simple.f90":2:1)
+#loc2 = loc("./simple.f90":5:1)
+#loc3 = loc("./simple.f90":8:1)
+#loc4 = loc("./simple.f90":11:1)
+
+// CHECK-DAG: #[[CMPX8:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "complex", sizeInBits = 128, encoding = DW_ATE_complex_float>
+// CHECK-DAG: #[[CMPX4:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "complex", sizeInBits = 64, encoding = DW_ATE_complex_float>
+// CHECK-DAG: #[[CMPX16:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "complex", sizeInBits = 256, encoding = DW_ATE_complex_float>
+
+// CHECK-DAG: #[[TY1:.*]] = #llvm.di_subroutine_type<{{.*}}types = #[[CMPX8]], #[[CMPX4]]>
+// CHECK-DAG: #[[TY2:.*]] = #llvm.di_subroutine_type<{{.*}}types = #[[CMPX16]], #[[CMPX4]]>
+
+// CHECK-DAG: #llvm.di_subprogram<{{.*}}name = "test1"{{.*}}type = #[[TY1]]>
+// CHECK-DAG: #llvm.di_subprogram<{{.*}}name = "test2"{{.*}}type = #[[TY1]]>
+// CHECK-DAG: #llvm.di_subprogram<{{.*}}name = "test3"{{.*}}type = #[[TY2]]>
+// CHECK-DAG: #llvm.di_subprogram<{{.*}}name = "test4"{{.*}}type = #[[TY2]]>
diff --git a/flang/test/Transforms/debug-fixed-array-type.fir b/flang/test/Transforms/debug-fixed-array-type.fir
new file mode 100644
index 0000000..401c725
--- /dev/null
+++ b/flang/test/Transforms/debug-fixed-array-type.fir
@@ -0,0 +1,34 @@
+// RUN: fir-opt --add-debug-info --mlir-print-debuginfo %s | FileCheck %s
+
+module attributes {} {
+ func.func @_QQmain() attributes {fir.bindc_name = "mn"} {
+ %c7 = arith.constant 7 : index
+ %c8 = arith.constant 8 : index
+ %c6 = arith.constant 6 : index
+ %c5 = arith.constant 5 : index
+ %c2 = arith.constant 2 : index
+ %c3 = arith.constant 3 : index
+ %0 = fir.alloca !fir.array<3xi32> {bindc_name = "d1", uniq_name = "_QFEd1"}
+ %1 = fircg.ext_declare %0(%c3) {uniq_name = "_QFEd1"} : (!fir.ref<!fir.array<3xi32>>, index) -> !fir.ref<!fir.array<3xi32>> loc(#loc1)
+ %2 = fir.address_of(@_QFEd2) : !fir.ref<!fir.array<2x5xi32>>
+ %3 = fircg.ext_declare %2(%c2, %c5) {uniq_name = "_QFEd2"} : (!fir.ref<!fir.array<2x5xi32>>, index, index) -> !fir.ref<!fir.array<2x5xi32>> loc(#loc2)
+ %4 = fir.address_of(@_QFEd3) : !fir.ref<!fir.array<6x8x7xf32>>
+ %5 = fircg.ext_declare %4(%c6, %c8, %c7) {uniq_name = "_QFEd3"} : (!fir.ref<!fir.array<6x8x7xf32>>, index, index, index) -> !fir.ref<!fir.array<6x8x7xf32>> loc(#loc3)
+ return
+ } loc(#loc4)
+}
+
+#loc1 = loc("test.f90":5:1)
+#loc2 = loc("test.f90":6:11)
+#loc3 = loc("test.f90":7:11)
+#loc4 = loc("test.f90":2:8)
+
+
+// CHECK-DAG: #[[INT:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "integer", sizeInBits = 32, encoding = DW_ATE_signed>
+// CHECK-DAG: #[[REAL:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "real", sizeInBits = 32, encoding = DW_ATE_float>
+// CHECK-DAG: #[[D1TY:.*]] = #llvm.di_composite_type<tag = DW_TAG_array_type{{.*}}baseType = #[[INT]], elements = #llvm.di_subrange<count = 3 : i64, lowerBound = 1 : i64>>
+// CHECK-DAG: #[[D2TY:.*]] = #llvm.di_composite_type<tag = DW_TAG_array_type{{.*}}baseType = #[[INT]], elements = #llvm.di_subrange<count = 2 : i64, lowerBound = 1 : i64>, #llvm.di_subrange<count = 5 : i64, lowerBound = 1 : i64>>
+// CHECK-DAG: #[[D3TY:.*]] = #llvm.di_composite_type<tag = DW_TAG_array_type{{.*}}baseType = #[[REAL]], elements = #llvm.di_subrange<count = 6 : i64, lowerBound = 1 : i64>, #llvm.di_subrange<count = 8 : i64, lowerBound = 1 : i64>, #llvm.di_subrange<count = 7 : i64, lowerBound = 1 : i64>>
+// CHECK-DAG: #llvm.di_local_variable<{{.*}}name = "d1"{{.*}}type = #[[D1TY]]>
+// CHECK-DAG: #llvm.di_local_variable<{{.*}}name = "d2"{{.*}}type = #[[D2TY]]>
+// CHECK-DAG: #llvm.di_local_variable<{{.*}}name = "d3"{{.*}}type = #[[D3TY]]>
diff --git a/flang/test/Transforms/debug-module-1.fir b/flang/test/Transforms/debug-module-1.fir
new file mode 100644
index 0000000..822ae01
--- /dev/null
+++ b/flang/test/Transforms/debug-module-1.fir
@@ -0,0 +1,40 @@
+// RUN: fir-opt --add-debug-info --mlir-print-debuginfo %s | FileCheck %s
+
+
+module attributes {} {
+ fir.global @_QMhelperEgli : i32 {
+ %0 = fir.zero_bits i32
+ fir.has_value %0 : i32
+ } loc(#loc1)
+ fir.global @_QMhelperEglr : f32 {
+ %0 = fir.zero_bits f32
+ fir.has_value %0 : f32
+ } loc(#loc2)
+ func.func @_QMhelperPtest() {
+ %c67_i32 = arith.constant 67 : i32
+ %cst = arith.constant 1.234000e+01 : f32
+ %0 = fir.address_of(@_QMhelperEgli) : !fir.ref<i32>
+ %1 = fir.address_of(@_QMhelperEglr) : !fir.ref<f32>
+ fir.store %cst to %1 : !fir.ref<f32>
+ fir.store %c67_i32 to %0 : !fir.ref<i32>
+ return
+ } loc(#loc3)
+}
+#loc1 = loc("test.f90":12:11)
+#loc2 = loc("test.f90":15:8)
+#loc3 = loc("test.f90":20:5)
+
+// CHECK-DAG: #[[I4:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "integer", sizeInBits = 32, encoding = DW_ATE_signed>
+// CHECK-DAG: #[[R4:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "real", sizeInBits = 32, encoding = DW_ATE_float>
+// CHECK-DAG: #[[CU:.*]] = #llvm.di_compile_unit<{{.*}}>
+// CHECK-DAG: #[[MOD:.*]] = #llvm.di_module<{{.*}}scope = #[[CU]], name = "helper"{{.*}}>
+// CHECK-DAG: #[[LOC1:.*]] = loc("{{.*}}test.f90":12{{.*}})
+// CHECK-DAG: #[[GLI:.*]] = #llvm.di_global_variable<scope = #[[MOD]], name = "gli", linkageName = "_QMhelperEgli"{{.*}}line = 12, type = #[[I4]], isDefined = true>
+// CHECK-DAG: #[[LOC2:.*]] = loc("{{.*}}test.f90":15{{.*}})
+// CHECK-DAG: #[[GLR:.*]] = #llvm.di_global_variable<scope = #[[MOD]], name = "glr", linkageName = "_QMhelperEglr"{{.*}}line = 15, type = #[[R4]], isDefined = true>
+// CHECK-DAG: #[[LOC3:.*]] = loc("{{.*}}test.f90":20{{.*}})
+// CHECK-DAG: #[[TEST:.*]] = #llvm.di_subprogram<{{.*}}compileUnit = #[[CU]], scope = #[[MOD]], name = "test", linkageName = "_QMhelperPtest"{{.*}}line = 20, scopeLine = 20{{.*}}>
+// CHECK-DAG: loc(fused<#[[GLI]]>[#[[LOC1]]])
+// CHECK-DAG: loc(fused<#[[GLR]]>[#[[LOC2]]])
+// CHECK-DAG: loc(fused<#[[TEST]]>[#[[LOC3]]])
+
diff --git a/flang/test/Transforms/debug-module-2.fir b/flang/test/Transforms/debug-module-2.fir
new file mode 100644
index 0000000..6acdc1d
--- /dev/null
+++ b/flang/test/Transforms/debug-module-2.fir
@@ -0,0 +1,35 @@
+// RUN: fir-opt --fir-to-llvm-ir="target=x86_64-unknown-linux-gnu" --mlir-print-debuginfo %s | FileCheck %s
+
+module {
+ fir.global @_QMhelperEgli : i32 {
+ %0 = fir.zero_bits i32
+ fir.has_value %0 : i32
+ } loc(#loc3)
+ fir.global @_QMhelperEglr : f32 {
+ %0 = fir.zero_bits f32
+ fir.has_value %0 : f32
+ } loc(#loc4)
+}
+#di_basic_type = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "integer", sizeInBits = 32, encoding = DW_ATE_signed>
+#di_basic_type1 = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "real", sizeInBits = 32, encoding = DW_ATE_float>
+
+#di_file = #llvm.di_file<"test.f90" in "">
+#di_subroutine_type = #llvm.di_subroutine_type<callingConvention = DW_CC_normal>
+
+#di_compile_unit = #llvm.di_compile_unit<id = distinct[0]<>, sourceLanguage = DW_LANG_Fortran95, file = #di_file, producer = "flang version 19.0.0 (/home/haqadeer/work/llvm-project/flang 5d5c73cad421bdca6e43e1cc10704ff160f1a33e)", isOptimized = false, emissionKind = Full>
+#di_module = #llvm.di_module<file = #di_file, scope = #di_compile_unit, name = "helper", line = 11>
+#di_global_variable = #llvm.di_global_variable<scope = #di_module, name = "gli", linkageName = "_QMhelperEgli", file = #di_file, line = 12, type = #di_basic_type, isDefined = true>
+#di_global_variable1 = #llvm.di_global_variable<scope = #di_module, name = "glr", linkageName = "_QMhelperEglr", file = #di_file, line = 15, type = #di_basic_type1, isDefined = true>
+
+#loc1 = loc("test.f90":12:11)
+#loc2 = loc("test.f90":15:8)
+#loc3 = loc(fused<#di_global_variable>[#loc1])
+#loc4 = loc(fused<#di_global_variable1>[#loc2])
+
+
+// CHECK-DAG: #[[GLI:.*]] = #llvm.di_global_variable<{{.*}}name = "gli", linkageName = "_QMhelperEgli"{{.*}}>
+// CHECK-DAG: #[[GLR:.*]] = #llvm.di_global_variable<{{.*}}name = "glr", linkageName = "_QMhelperEglr"{{.*}}>
+// CHECK-DAG: #[[GLIE:.*]] = #llvm.di_global_variable_expression<var = #[[GLI]]>
+// CHECK-DAG: #[[GLRE:.*]] = #llvm.di_global_variable_expression<var = #[[GLR]]>
+// CHECK-DAG: llvm.mlir.global{{.*}}@_QMhelperEgli() {{{.*}}dbg_expr = #[[GLIE]]}
+// CHECK-DAG: llvm.mlir.global{{.*}}@_QMhelperEglr() {{{.*}}dbg_expr = #[[GLRE]]}
diff --git a/libc/cmake/modules/LLVMLibCObjectRules.cmake b/libc/cmake/modules/LLVMLibCObjectRules.cmake
index 0649e9f..134c514 100644
--- a/libc/cmake/modules/LLVMLibCObjectRules.cmake
+++ b/libc/cmake/modules/LLVMLibCObjectRules.cmake
@@ -246,9 +246,6 @@ function(create_entrypoint_object fq_target_name)
if(NOT ADD_ENTRYPOINT_OBJ_SRCS)
message(FATAL_ERROR "`add_entrypoint_object` rule requires SRCS to be specified.")
endif()
- if(NOT ADD_ENTRYPOINT_OBJ_HDRS)
- message(FATAL_ERROR "`add_entrypoint_object` rule requires HDRS to be specified.")
- endif()
if(NOT ADD_ENTRYPOINT_OBJ_CXX_STANDARD)
set(ADD_ENTRYPOINT_OBJ_CXX_STANDARD ${CMAKE_CXX_STANDARD})
endif()
diff --git a/libc/config/baremetal/arm/entrypoints.txt b/libc/config/baremetal/arm/entrypoints.txt
index 4e3d1cb..7fb82c6 100644
--- a/libc/config/baremetal/arm/entrypoints.txt
+++ b/libc/config/baremetal/arm/entrypoints.txt
@@ -183,6 +183,10 @@ set(TARGET_LIBC_ENTRYPOINTS
# time.h entrypoints
libc.src.time.difftime
+
+ # internal entrypoints
+ libc.startup.baremetal.init
+ libc.startup.baremetal.fini
)
set(TARGET_LIBM_ENTRYPOINTS
diff --git a/libc/config/baremetal/riscv/entrypoints.txt b/libc/config/baremetal/riscv/entrypoints.txt
index 7efd9bc..b769b43 100644
--- a/libc/config/baremetal/riscv/entrypoints.txt
+++ b/libc/config/baremetal/riscv/entrypoints.txt
@@ -183,6 +183,10 @@ set(TARGET_LIBC_ENTRYPOINTS
# time.h entrypoints
libc.src.time.difftime
+
+ # internal entrypoints
+ libc.startup.baremetal.init
+ libc.startup.baremetal.fini
)
set(TARGET_LIBM_ENTRYPOINTS
diff --git a/libc/docs/ctype.rst b/libc/docs/ctype.rst
index 7d77dad..828785c 100644
--- a/libc/docs/ctype.rst
+++ b/libc/docs/ctype.rst
@@ -1,7 +1,11 @@
.. include:: check.rst
-ctype.h Functions
-=================
+=======
+ctype.h
+=======
+
+Functions
+=========
.. list-table::
:widths: auto
@@ -10,46 +14,61 @@ ctype.h Functions
* - Function
- Implemented
- - Standard
+ - C23 Standard Section
+ - POSIX.1-2017 Standard Section
* - isalnum
- |check|
- 7.4.1.1
+ -
* - isalpha
- |check|
- 7.4.1.2
+ -
* - isblank
- |check|
- 7.4.1.3
+ -
* - iscntrl
- |check|
- 7.4.1.4
+ -
* - isdigit
- |check|
- 7.4.1.5
+ -
* - isgraph
- |check|
- 7.4.1.6
+ -
* - islower
- |check|
- 7.4.1.7
+ -
* - isprint
- |check|
- 7.4.1.8
+ -
* - ispunct
- |check|
- 7.4.1.9
+ -
* - isspace
- |check|
- 7.4.1.10
+ -
* - isupper
- |check|
- 7.4.1.11
+ -
* - isxdigit
- |check|
- 7.4.1.12
+ -
* - tolower
- |check|
- 7.4.2.1
+ -
* - toupper
- |check|
- 7.4.2.2
+ -
diff --git a/libc/docs/fenv.rst b/libc/docs/fenv.rst
index 1dee5515..e7a5a3f 100644
--- a/libc/docs/fenv.rst
+++ b/libc/docs/fenv.rst
@@ -1,7 +1,11 @@
.. include:: check.rst
-fenv.h Functions
-================
+======
+fenv.h
+======
+
+Macros
+======
.. list-table::
:widths: auto
@@ -10,55 +14,162 @@ fenv.h Functions
* - Function
- Implemented
- - Standard
+ - C23 Standard Section
+ - POSIX.1-2017 Standard Section
+ * - FE_ALL_EXCEPT
+ - |check|
+ - 7.6.12
+ -
+ * - FE_DEC_DOWNWARD
+ -
+ - 7.6.14
+ -
+ * - FE_DEC_TONEAREST
+ -
+ - 7.6.14
+ -
+ * - FE_DEC_TONEARESTFROMZERO
+ -
+ - 7.6.14
+ -
+ * - FE_DEC_TOWARDZERO
+ -
+ - 7.6.14
+ -
+ * - FE_DEC_UPWARD
+ -
+ - 7.6.14
+ -
+ * - FE_DFL_ENV
+ - |check|
+ - 7.6.17
+ -
+ * - FE_DFL_MODE
+ -
+ - 7.6.11
+ -
+ * - FE_DIVBYZERO
+ - |check|
+ - 7.6.9
+ -
+  * - FE_DOWNWARD
+ -
+ - 7.6.13
+ -
+ * - FE_INEXACT
+ - |check|
+ - 7.6.9
+ -
+ * - FE_INVALID
+ - |check|
+ - 7.6.9
+ -
+ * - FE_OVERFLOW
+ - |check|
+ - 7.6.9
+ -
+ * - FE_TONEAREST
+ - |check|
+ - 7.6.13
+ -
+ * - FE_TONEARESTFROMZERO
+ -
+ - 7.6.13
+ -
+ * - FE_TOWARDZERO
+ - |check|
+ - 7.6.13
+ -
+ * - FE_UNDERFLOW
+ - |check|
+ - 7.6.9
+ -
+ * - FE_UPWARD
+ - |check|
+ - 7.6.13
+ -
+ * - __STDC_VERSION_FENV_H__
+ -
+ - 7.6.5
+ -
+
+Functions
+=========
+
+.. list-table::
+ :widths: auto
+ :align: center
+ :header-rows: 1
+
+ * - Function
+ - Implemented
+ - C23 Standard Section
+ - POSIX.1-2017 Standard Section
* - fe_dec_getround
-
- 7.6.5.3
+ -
* - fe_dec_setround
-
- 7.6.5.6
+ -
* - feclearexcept
- |check|
- 7.6.4.1
+ -
* - fegetenv
- |check|
- 7.6.6.1
+ -
* - fegetexceptflag
- |check|
- 7.6.4.2
+ -
* - fegetmode
-
- 7.6.5.1
+ -
* - fegetround
- |check|
- 7.6.5.2
+ -
* - feholdexcept
- |check|
- 7.6.6.2
+ -
* - feraiseexcept
- |check|
- 7.6.4.3
+ -
* - fesetenv
- |check|
- 7.6.6.3
+ -
* - fesetexcept
- |check|
- 7.6.4.4
+ -
* - fesetexceptflag
- |check|
- 7.6.4.5
+ -
* - fesetmode
-
- 7.6.5.4
+ -
* - fesetround
- |check|
- 7.6.5.5
+ -
* - fetestexcept
- |check|
- 7.6.4.7
+ -
* - fetestexceptflag
- |check|
- 7.6.4.6
+ -
* - feupdateenv
- |check|
- 7.6.6.4
+ -
diff --git a/libc/docs/signal.rst b/libc/docs/signal.rst
index 7903bb4..d1a7cb6 100644
--- a/libc/docs/signal.rst
+++ b/libc/docs/signal.rst
@@ -1,7 +1,160 @@
.. include:: check.rst
-signal.h Functions
-==================
+========
+signal.h
+========
+
+Macros
+======
+
+.. list-table::
+ :widths: auto
+ :align: center
+ :header-rows: 1
+
+  * - Macro
+ - Implemented
+ - C23 Standard Section
+ - POSIX.1-2017 Standard Section
+ * - SIGABRT
+ - |check|
+ - 7.14.3
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGALRM
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGBUS
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGCHLD
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGCONT
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGFPE
+ - |check|
+ - 7.14.3
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGHUP
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGILL
+ - |check|
+ - 7.14.3
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGINT
+ - |check|
+ - 7.14.3
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGKILL
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGPIPE
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGPOLL
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGPROF
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGQUIT
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGRTMAX
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGRTMIN
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGSEGV
+ - |check|
+ - 7.14.3
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGSTOP
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGSYS
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGTERM
+ - |check|
+ - 7.14.3
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGTRAP
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGTSTP
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGTTIN
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGTTOU
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGURG
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGUSR1
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGUSR2
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGVTALRM
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGXCPU
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGXFSZ
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIG_DFL
+ - |check|
+ - 7.14.3
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIG_ERR
+ - |check|
+ - 7.14.3
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIG_HOLD
+ -
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIG_IGN
+ - |check|
+ - 7.14.3
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+
+Functions
+=========
.. list-table::
:widths: auto
@@ -10,34 +163,45 @@ signal.h Functions
* - Function
- Implemented
- - Standard
+ - C23 Standard Section
+ - POSIX.1-2017 Standard Section
* - kill
- |check|
-
+ - https://pubs.opengroup.org/onlinepubs/9699919799/functions/kill.html
* - raise
- |check|
- 7.14.2.1
+ - https://pubs.opengroup.org/onlinepubs/9699919799/functions/raise.html
* - sigaction
- |check|
-
+ - https://pubs.opengroup.org/onlinepubs/9699919799/functions/sigaction.html
* - sigaddset
- |check|
-
+ - https://pubs.opengroup.org/onlinepubs/9699919799/functions/sigaddset.html
* - sigaltstack
- |check|
-
+ - https://pubs.opengroup.org/onlinepubs/9699919799/functions/sigaltstack.html
* - sigdelset
- |check|
-
+ - https://pubs.opengroup.org/onlinepubs/9699919799/functions/sigdelset.html
* - sigemptyset
- |check|
-
+ - https://pubs.opengroup.org/onlinepubs/9699919799/functions/sigemptyset.html
* - sigfillset
- |check|
-
+ - https://pubs.opengroup.org/onlinepubs/9699919799/functions/sigfillset.html
* - signal
- |check|
- 7.14.1.1
+ - https://pubs.opengroup.org/onlinepubs/9699919799/functions/signal.html
* - sigprocmask
- |check|
-
+ - https://pubs.opengroup.org/onlinepubs/9699919799/functions/sigprocmask.html
diff --git a/libc/docs/stdbit.rst b/libc/docs/stdbit.rst
index 0a12b2b..71f9bbf 100644
--- a/libc/docs/stdbit.rst
+++ b/libc/docs/stdbit.rst
@@ -1,7 +1,96 @@
.. include:: check.rst
-stdbit.h Functions
-==================
+========
+stdbit.h
+========
+
+Macros
+======
+
+.. list-table::
+ :widths: auto
+ :align: center
+ :header-rows: 1
+
+  * - Macro
+ - Implemented
+ - C23 Standard Section
+ - POSIX.1-2017 Standard Section
+ * - __STDC_ENDIAN_BIG__
+ - |check|
+ - 7.18.2.2
+ -
+ * - __STDC_ENDIAN_LITTLE__
+ - |check|
+ - 7.18.2.2
+ -
+ * - __STDC_ENDIAN_NATIVE__
+ - |check|
+ - 7.18.2.2
+ -
+ * - __STDC_VERSION_STDBIT_H__
+ - |check|
+ - 7.18.1.2
+ -
+ * - stdc_bit_ceil
+ - |check|
+ - 7.18.16.1
+ -
+ * - stdc_bit_floor
+ - |check|
+ - 7.18.15.1
+ -
+ * - stdc_bit_width
+ - |check|
+ - 7.18.14.1
+ -
+ * - stdc_count_ones
+ - |check|
+ - 7.18.12.1
+ -
+ * - stdc_count_zeros
+ - |check|
+ - 7.18.11.1
+ -
+ * - stdc_first_leading_one
+ - |check|
+ - 7.18.8.1
+ -
+ * - stdc_first_leading_zero
+ - |check|
+ - 7.18.7.1
+ -
+ * - stdc_first_trailing_one
+ - |check|
+ - 7.18.10.1
+ -
+ * - stdc_first_trailing_zero
+ - |check|
+ - 7.18.9.1
+ -
+ * - stdc_has_single_bit
+ - |check|
+ - 7.18.13.1
+ -
+ * - stdc_leading_ones
+ - |check|
+ - 7.18.4.1
+ -
+ * - stdc_leading_zeros
+ - |check|
+ - 7.18.3.1
+ -
+ * - stdc_trailing_ones
+ - |check|
+ - 7.18.6.1
+ -
+ * - stdc_trailing_zeros
+ - |check|
+ - 7.18.5.1
+ -
+
+Functions
+=========
.. list-table::
:widths: auto
@@ -10,214 +99,285 @@ stdbit.h Functions
* - Function
- Implemented
- - Standard
+ - C23 Standard Section
+ - POSIX.1-2017 Standard Section
* - stdc_bit_ceil_uc
- |check|
- 7.18.16
+ -
* - stdc_bit_ceil_ui
- |check|
- 7.18.16
+ -
* - stdc_bit_ceil_ul
- |check|
- 7.18.16
+ -
* - stdc_bit_ceil_ull
- |check|
- 7.18.16
+ -
* - stdc_bit_ceil_us
- |check|
- 7.18.16
+ -
* - stdc_bit_floor_uc
- |check|
- 7.18.15
+ -
* - stdc_bit_floor_ui
- |check|
- 7.18.15
+ -
* - stdc_bit_floor_ul
- |check|
- 7.18.15
+ -
* - stdc_bit_floor_ull
- |check|
- 7.18.15
+ -
* - stdc_bit_floor_us
- |check|
- 7.18.15
+ -
* - stdc_bit_width_uc
- |check|
- 7.18.14
+ -
* - stdc_bit_width_ui
- |check|
- 7.18.14
+ -
* - stdc_bit_width_ul
- |check|
- 7.18.14
+ -
* - stdc_bit_width_ull
- |check|
- 7.18.14
+ -
* - stdc_bit_width_us
- |check|
- 7.18.14
+ -
* - stdc_count_ones_uc
- |check|
- 7.18.12
+ -
* - stdc_count_ones_ui
- |check|
- 7.18.12
+ -
* - stdc_count_ones_ul
- |check|
- 7.18.12
+ -
* - stdc_count_ones_ull
- |check|
- 7.18.12
+ -
* - stdc_count_ones_us
- |check|
- 7.18.12
+ -
* - stdc_count_zeros_uc
- |check|
- 7.18.11
+ -
* - stdc_count_zeros_ui
- |check|
- 7.18.11
+ -
* - stdc_count_zeros_ul
- |check|
- 7.18.11
+ -
* - stdc_count_zeros_ull
- |check|
- 7.18.11
+ -
* - stdc_count_zeros_us
- |check|
- 7.18.11
+ -
* - stdc_first_leading_one_uc
- |check|
- 7.18.8
+ -
* - stdc_first_leading_one_ui
- |check|
- 7.18.8
+ -
* - stdc_first_leading_one_ul
- |check|
- 7.18.8
+ -
* - stdc_first_leading_one_ull
- |check|
- 7.18.8
+ -
* - stdc_first_leading_one_us
- |check|
- 7.18.8
+ -
* - stdc_first_leading_zero_uc
- |check|
- 7.18.7
+ -
* - stdc_first_leading_zero_ui
- |check|
- 7.18.7
+ -
* - stdc_first_leading_zero_ul
- |check|
- 7.18.7
+ -
* - stdc_first_leading_zero_ull
- |check|
- 7.18.7
+ -
* - stdc_first_leading_zero_us
- |check|
- 7.18.7
+ -
* - stdc_first_trailing_one_uc
- |check|
- 7.18.10
+ -
* - stdc_first_trailing_one_ui
- |check|
- 7.18.10
+ -
* - stdc_first_trailing_one_ul
- |check|
- 7.18.10
+ -
* - stdc_first_trailing_one_ull
- |check|
- 7.18.10
+ -
* - stdc_first_trailing_one_us
- |check|
- 7.18.10
+ -
* - stdc_first_trailing_zero_uc
- |check|
- 7.18.9
+ -
* - stdc_first_trailing_zero_ui
- |check|
- 7.18.9
+ -
* - stdc_first_trailing_zero_ul
- |check|
- 7.18.9
+ -
* - stdc_first_trailing_zero_ull
- |check|
- 7.18.9
+ -
* - stdc_first_trailing_zero_us
- |check|
- 7.18.9
+ -
* - stdc_has_single_bit_uc
- |check|
- 7.18.13
+ -
* - stdc_has_single_bit_ui
- |check|
- 7.18.13
+ -
* - stdc_has_single_bit_ul
- |check|
- 7.18.13
+ -
* - stdc_has_single_bit_ull
- |check|
- 7.18.13
+ -
* - stdc_has_single_bit_us
- |check|
- 7.18.13
+ -
* - stdc_leading_ones_uc
- |check|
- 7.18.4
+ -
* - stdc_leading_ones_ui
- |check|
- 7.18.4
+ -
* - stdc_leading_ones_ul
- |check|
- 7.18.4
+ -
* - stdc_leading_ones_ull
- |check|
- 7.18.4
+ -
* - stdc_leading_ones_us
- |check|
- 7.18.4
+ -
* - stdc_leading_zeros_uc
- |check|
- 7.18.3
+ -
* - stdc_leading_zeros_ui
- |check|
- 7.18.3
+ -
* - stdc_leading_zeros_ul
- |check|
- 7.18.3
+ -
* - stdc_leading_zeros_ull
- |check|
- 7.18.3
+ -
* - stdc_leading_zeros_us
- |check|
- 7.18.3
+ -
* - stdc_trailing_ones_uc
- |check|
- 7.18.6
+ -
* - stdc_trailing_ones_ui
- |check|
- 7.18.6
+ -
* - stdc_trailing_ones_ul
- |check|
- 7.18.6
+ -
* - stdc_trailing_ones_ull
- |check|
- 7.18.6
+ -
* - stdc_trailing_ones_us
- |check|
- 7.18.6
+ -
* - stdc_trailing_zeros_uc
- |check|
- 7.18.5
+ -
* - stdc_trailing_zeros_ui
- |check|
- 7.18.5
+ -
* - stdc_trailing_zeros_ul
- |check|
- 7.18.5
+ -
* - stdc_trailing_zeros_ull
- |check|
- 7.18.5
+ -
* - stdc_trailing_zeros_us
- |check|
- 7.18.5
+ -
diff --git a/libc/docs/threads.rst b/libc/docs/threads.rst
index 78e17e9..63cd6c4 100644
--- a/libc/docs/threads.rst
+++ b/libc/docs/threads.rst
@@ -1,7 +1,32 @@
.. include:: check.rst
-threads.h Functions
-===================
+=========
+threads.h
+=========
+
+Macros
+======
+
+.. list-table::
+ :widths: auto
+ :align: center
+ :header-rows: 1
+
+  * - Macro
+ - Implemented
+ - C23 Standard Section
+ - POSIX.1-2017 Standard Section
+ * - ONCE_FLAG_INIT
+ -
+ - 7.28.1.3
+ -
+ * - TSS_DTOR_ITERATIONS
+ -
+ - 7.28.1.3
+ -
+
+Functions
+=========
.. list-table::
:widths: auto
@@ -10,79 +35,105 @@ threads.h Functions
* - Function
- Implemented
- - Standard
+ - C23 Standard Section
+ - POSIX.1-2017 Standard Section
* - call_once
- |check|
- 7.28.2.1
+ -
* - cnd_broadcast
- |check|
- 7.28.3.1
+ -
* - cnd_destroy
- |check|
- 7.28.3.2
+ -
* - cnd_init
- |check|
- 7.28.3.3
+ -
* - cnd_signal
- |check|
- 7.28.3.4
+ -
* - cnd_timedwait
-
- 7.28.3.5
+ -
* - cnd_wait
- |check|
- 7.28.3.6
+ -
* - mtx_destroy
- |check|
- 7.28.4.1
+ -
* - mtx_init
- |check|
- 7.28.4.2
+ -
* - mtx_lock
- |check|
- 7.28.4.3
+ -
* - mtx_timedlock
-
- 7.28.4.4
+ -
* - mtx_trylock
-
- 7.28.4.5
+ -
* - mtx_unlock
- |check|
- 7.28.4.6
+ -
* - thrd_create
- |check|
- 7.28.5.1
+ -
* - thrd_current
- |check|
- 7.28.5.2
+ -
* - thrd_detach
- |check|
- 7.28.5.3
+ -
* - thrd_equal
- |check|
- 7.28.5.4
+ -
* - thrd_exit
- |check|
- 7.28.5.5
+ -
* - thrd_join
- |check|
- 7.28.5.6
+ -
* - thrd_sleep
-
- 7.28.5.7
+ -
* - thrd_yield
-
- 7.28.5.8
+ -
* - tss_create
- |check|
- 7.28.6.1
+ -
* - tss_delete
- |check|
- 7.28.6.2
+ -
* - tss_get
- |check|
- 7.28.6.3
+ -
* - tss_set
- |check|
- 7.28.6.4
+ -
diff --git a/libc/src/__support/threads/CMakeLists.txt b/libc/src/__support/threads/CMakeLists.txt
index 34412be..9ea0b59 100644
--- a/libc/src/__support/threads/CMakeLists.txt
+++ b/libc/src/__support/threads/CMakeLists.txt
@@ -71,3 +71,12 @@ if(TARGET libc.src.__support.threads.${LIBC_TARGET_OS}.callonce)
.${LIBC_TARGET_OS}.callonce
)
endif()
+
+if(TARGET libc.src.__support.threads.${LIBC_TARGET_OS}.CndVar)
+ add_object_library(
+ CndVar
+ ALIAS
+ DEPENDS
+ .${LIBC_TARGET_OS}.CndVar
+ )
+endif()
diff --git a/libc/src/__support/threads/CndVar.h b/libc/src/__support/threads/CndVar.h
new file mode 100644
index 0000000..baa2a68
--- /dev/null
+++ b/libc/src/__support/threads/CndVar.h
@@ -0,0 +1,52 @@
+//===-- A platform independent abstraction layer for cond vars --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC___SUPPORT_SRC_THREADS_LINUX_CNDVAR_H
+#define LLVM_LIBC___SUPPORT_SRC_THREADS_LINUX_CNDVAR_H
+
+#include "src/__support/threads/linux/futex_utils.h" // Futex
+#include "src/__support/threads/mutex.h" // Mutex
+
+#include <stdint.h> // uint32_t
+
+namespace LIBC_NAMESPACE {
+
+struct CndVar {
+ enum CndWaiterStatus : uint32_t {
+ WS_Waiting = 0xE,
+ WS_Signalled = 0x5,
+ };
+
+ struct CndWaiter {
+ Futex futex_word = WS_Waiting;
+ CndWaiter *next = nullptr;
+ };
+
+ CndWaiter *waitq_front;
+ CndWaiter *waitq_back;
+ Mutex qmtx;
+
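+  // Resets the waiter queue and initializes the queue mutex.
+  // Returns 0 on success, -1 on error.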
+ static int init(CndVar *cv) {
+ cv->waitq_front = cv->waitq_back = nullptr;
+ auto err = Mutex::init(&cv->qmtx, false, false, false);
+ return err == MutexError::NONE ? 0 : -1;
+ }
+
+ static void destroy(CndVar *cv) {
+ cv->waitq_front = cv->waitq_back = nullptr;
+ }
+
+ // Returns 0 on success, -1 on error.
+ int wait(Mutex *m);
+ void notify_one();
+ void broadcast();
+};
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC___SUPPORT_SRC_THREADS_LINUX_CNDVAR_H
diff --git a/libc/src/__support/threads/linux/CMakeLists.txt b/libc/src/__support/threads/linux/CMakeLists.txt
index d3353f6..39c4ad2 100644
--- a/libc/src/__support/threads/linux/CMakeLists.txt
+++ b/libc/src/__support/threads/linux/CMakeLists.txt
@@ -63,3 +63,16 @@ add_object_library(
DEPENDS
.futex_utils
)
+
+add_object_library(
+ CndVar
+ SRCS
+ CndVar.cpp
+ HDRS
+ ../CndVar.h
+ DEPENDS
+ libc.include.sys_syscall
+ libc.src.__support.OSUtil.osutil
+ libc.src.__support.threads.linux.futex_word_type
+ libc.src.__support.threads.mutex
+)
diff --git a/libc/src/__support/threads/linux/CndVar.cpp b/libc/src/__support/threads/linux/CndVar.cpp
new file mode 100644
index 0000000..daf56bc
--- /dev/null
+++ b/libc/src/__support/threads/linux/CndVar.cpp
@@ -0,0 +1,103 @@
+//===-- Utility condition variable class ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/threads/CndVar.h"
+#include "src/__support/OSUtil/syscall.h" // syscall_impl
+#include "src/__support/threads/linux/futex_word.h" // FutexWordType
+#include "src/__support/threads/mutex.h" // Mutex, MutexLock
+
+#include <sys/syscall.h> // For syscall numbers.
+
+namespace LIBC_NAMESPACE {
+
+int CndVar::wait(Mutex *m) {
+ // The goal is to perform "unlock |m| and wait" in an
+ // atomic operation. However, it is not possible to do it
+ // in the true sense so we do it in spirit. Before unlocking
+ // |m|, a new waiter object is added to the waiter queue with
+ // the waiter queue locked. Iff a signalling thread signals
+ // the waiter before the waiter actually starts waiting, the
+ // wait operation will not begin at all and the waiter immediately
+ // returns.
+
+ CndWaiter waiter;
+ {
+ MutexLock ml(&qmtx);
+ CndWaiter *old_back = nullptr;
+ if (waitq_front == nullptr) {
+ waitq_front = waitq_back = &waiter;
+ } else {
+ old_back = waitq_back;
+ waitq_back->next = &waiter;
+ waitq_back = &waiter;
+ }
+
+ if (m->unlock() != MutexError::NONE) {
+ // If we do not remove the queued up waiter before returning,
+ // then another thread can potentially signal a non-existing
+ // waiter. Note also that we do this with |qmtx| locked. This
+ // ensures that another thread will not signal the withdrawing
+ // waiter.
+ waitq_back = old_back;
+ if (waitq_back == nullptr)
+ waitq_front = nullptr;
+ else
+ waitq_back->next = nullptr;
+
+ return -1;
+ }
+ }
+
+ waiter.futex_word.wait(WS_Waiting, cpp::nullopt, true);
+
+ // At this point, if locking |m| fails, we can simply return as the
+ // queued up waiter would have been removed from the queue.
+ auto err = m->lock();
+ return err == MutexError::NONE ? 0 : -1;
+}
+
+void CndVar::notify_one() {
+ // We don't use an RAII locker in this method as we want to unlock
+ // |qmtx| and signal the waiter using a single FUTEX_WAKE_OP signal.
+ qmtx.lock();
+  if (waitq_front == nullptr) {
+    qmtx.unlock();
+    return;
+  }
+
+ CndWaiter *first = waitq_front;
+ waitq_front = waitq_front->next;
+ if (waitq_front == nullptr)
+ waitq_back = nullptr;
+
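+  // Release |qmtx| directly; the FUTEX_WAKE_OP below both wakes one waiter
+  // on |qmtx| (if any) and signals |first| in a single syscall.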
+ qmtx.futex_word = FutexWordType(Mutex::LockState::Free);
+
+ // this is a special WAKE_OP, so we use syscall directly
+ LIBC_NAMESPACE::syscall_impl<long>(
+ FUTEX_SYSCALL_ID, &qmtx.futex_word.val, FUTEX_WAKE_OP, 1, 1,
+ &first->futex_word.val,
+ FUTEX_OP(FUTEX_OP_SET, WS_Signalled, FUTEX_OP_CMP_EQ, WS_Waiting));
+}
+
+void CndVar::broadcast() {
+ MutexLock ml(&qmtx);
+ uint32_t dummy_futex_word;
+ CndWaiter *waiter = waitq_front;
+ waitq_front = waitq_back = nullptr;
+ while (waiter != nullptr) {
+ // FUTEX_WAKE_OP is used instead of just FUTEX_WAKE as it allows us to
+ // atomically update the waiter status to WS_Signalled before waking
+ // up the waiter. A dummy location is used for the other futex of
+ // FUTEX_WAKE_OP.
+ LIBC_NAMESPACE::syscall_impl<long>(
+ FUTEX_SYSCALL_ID, &dummy_futex_word, FUTEX_WAKE_OP, 1, 1,
+ &waiter->futex_word.val,
+ FUTEX_OP(FUTEX_OP_SET, WS_Signalled, FUTEX_OP_CMP_EQ, WS_Waiting));
+ waiter = waiter->next;
+ }
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/threads/linux/CMakeLists.txt b/libc/src/threads/linux/CMakeLists.txt
index 68b7106..a5a02e4 100644
--- a/libc/src/threads/linux/CMakeLists.txt
+++ b/libc/src/threads/linux/CMakeLists.txt
@@ -1,7 +1,6 @@
add_header_library(
threads_utils
HDRS
- CndVar.h
Futex.h
DEPENDS
libc.include.sys_syscall
@@ -20,8 +19,8 @@ add_entrypoint_object(
HDRS
../cnd_init.h
DEPENDS
- .threads_utils
libc.include.threads
+ libc.src.__support.threads.CndVar
)
add_entrypoint_object(
@@ -31,8 +30,8 @@ add_entrypoint_object(
HDRS
../cnd_destroy.h
DEPENDS
- .threads_utils
libc.include.threads
+ libc.src.__support.threads.CndVar
)
add_entrypoint_object(
@@ -42,9 +41,9 @@ add_entrypoint_object(
HDRS
../cnd_wait.h
DEPENDS
- .threads_utils
libc.include.threads
libc.src.__support.threads.mutex
+ libc.src.__support.threads.CndVar
)
add_entrypoint_object(
@@ -54,8 +53,8 @@ add_entrypoint_object(
HDRS
../cnd_signal.h
DEPENDS
- .threads_utils
libc.include.threads
+ libc.src.__support.threads.CndVar
)
add_entrypoint_object(
@@ -65,6 +64,6 @@ add_entrypoint_object(
HDRS
../cnd_broadcast.h
DEPENDS
- .threads_utils
libc.include.threads
+ libc.src.__support.threads.CndVar
)
diff --git a/libc/src/threads/linux/CndVar.h b/libc/src/threads/linux/CndVar.h
deleted file mode 100644
index c08ffa3..0000000
--- a/libc/src/threads/linux/CndVar.h
+++ /dev/null
@@ -1,148 +0,0 @@
-//===-- Utility condition variable class ------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC_THREADS_LINUX_CNDVAR_H
-#define LLVM_LIBC_SRC_THREADS_LINUX_CNDVAR_H
-
-#include "src/__support/CPP/atomic.h"
-#include "src/__support/CPP/mutex.h" // lock_guard
-#include "src/__support/CPP/optional.h"
-#include "src/__support/OSUtil/syscall.h" // For syscall functions.
-#include "src/__support/threads/linux/futex_utils.h"
-#include "src/__support/threads/mutex.h"
-
-#include <linux/futex.h> // For futex operations.
-#include <stdint.h>
-#include <sys/syscall.h> // For syscall numbers.
-#include <threads.h> // For values like thrd_success etc.
-
-namespace LIBC_NAMESPACE {
-
-struct CndVar {
- enum CndWaiterStatus : uint32_t {
- WS_Waiting = 0xE,
- WS_Signalled = 0x5,
- };
-
- struct CndWaiter {
- Futex futex_word = WS_Waiting;
- CndWaiter *next = nullptr;
- };
-
- CndWaiter *waitq_front;
- CndWaiter *waitq_back;
- Mutex qmtx;
-
- static int init(CndVar *cv) {
- cv->waitq_front = cv->waitq_back = nullptr;
- auto err = Mutex::init(&cv->qmtx, false, false, false);
- return err == MutexError::NONE ? thrd_success : thrd_error;
- }
-
- static void destroy(CndVar *cv) {
- cv->waitq_front = cv->waitq_back = nullptr;
- }
-
- int wait(Mutex *m) {
- // The goal is to perform "unlock |m| and wait" in an
- // atomic operation. However, it is not possible to do it
- // in the true sense so we do it in spirit. Before unlocking
- // |m|, a new waiter object is added to the waiter queue with
- // the waiter queue locked. Iff a signalling thread signals
- // the waiter before the waiter actually starts waiting, the
- // wait operation will not begin at all and the waiter immediately
- // returns.
-
- CndWaiter waiter;
- {
- cpp::lock_guard ml(qmtx);
- CndWaiter *old_back = nullptr;
- if (waitq_front == nullptr) {
- waitq_front = waitq_back = &waiter;
- } else {
- old_back = waitq_back;
- waitq_back->next = &waiter;
- waitq_back = &waiter;
- }
-
- if (m->unlock() != MutexError::NONE) {
- // If we do not remove the queued up waiter before returning,
- // then another thread can potentially signal a non-existing
- // waiter. Note also that we do this with |qmtx| locked. This
- // ensures that another thread will not signal the withdrawing
- // waiter.
- waitq_back = old_back;
- if (waitq_back == nullptr)
- waitq_front = nullptr;
- else
- waitq_back->next = nullptr;
-
- return thrd_error;
- }
- }
-
- waiter.futex_word.wait(WS_Waiting, cpp::nullopt, true);
-
- // At this point, if locking |m| fails, we can simply return as the
- // queued up waiter would have been removed from the queue.
- auto err = m->lock();
- return err == MutexError::NONE ? thrd_success : thrd_error;
- }
-
- int notify_one() {
- // We don't use an RAII locker in this method as we want to unlock
- // |qmtx| and signal the waiter using a single FUTEX_WAKE_OP signal.
- qmtx.lock();
- if (waitq_front == nullptr) {
- qmtx.unlock();
- return thrd_success;
- }
-
- CndWaiter *first = waitq_front;
- waitq_front = waitq_front->next;
- if (waitq_front == nullptr)
- waitq_back = nullptr;
-
- qmtx.futex_word = FutexWordType(Mutex::LockState::Free);
-
- // this is a special WAKE_OP, so we use syscall directly
- LIBC_NAMESPACE::syscall_impl<long>(
- FUTEX_SYSCALL_ID, &qmtx.futex_word.val, FUTEX_WAKE_OP, 1, 1,
- &first->futex_word.val,
- FUTEX_OP(FUTEX_OP_SET, WS_Signalled, FUTEX_OP_CMP_EQ, WS_Waiting));
- return thrd_success;
- }
-
- int broadcast() {
- cpp::lock_guard ml(qmtx);
- uint32_t dummy_futex_word;
- CndWaiter *waiter = waitq_front;
- waitq_front = waitq_back = nullptr;
- while (waiter != nullptr) {
- // FUTEX_WAKE_OP is used instead of just FUTEX_WAKE as it allows us to
- // atomically update the waiter status to WS_Signalled before waking
- // up the waiter. A dummy location is used for the other futex of
- // FUTEX_WAKE_OP.
- LIBC_NAMESPACE::syscall_impl<long>(
- FUTEX_SYSCALL_ID, &dummy_futex_word, FUTEX_WAKE_OP, 1, 1,
- &waiter->futex_word.val,
- FUTEX_OP(FUTEX_OP_SET, WS_Signalled, FUTEX_OP_CMP_EQ, WS_Waiting));
- waiter = waiter->next;
- }
- return thrd_success;
- }
-};
-
-static_assert(sizeof(CndVar) == sizeof(cnd_t),
- "Mismatch in the size of the "
- "internal representation of condition variable and the public "
- "cnd_t type.");
-
-} // namespace LIBC_NAMESPACE
-
-#endif // LLVM_LIBC_SRC_THREADS_LINUX_CNDVAR_H
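
The condition variable implementation removed here is re-homed under src/__support/threads/CndVar.h, as the include changes in the files below show. Its notify_one() folds "release the queue lock and signal one waiter" into a single FUTEX_WAKE_OP syscall. A minimal Linux-only C++ sketch of that same raw syscall, with illustrative names (queue_word, waiter_word) that are not part of the llvm-libc sources:

    #include <cstdint>
    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    // Waiter-status values mirroring the header above.
    constexpr uint32_t WS_Waiting = 0xE;
    constexpr uint32_t WS_Signalled = 0x5;

    // Release the queue word and signal one waiter in a single kernel entry.
    long wake_one(uint32_t *queue_word, uint32_t *waiter_word) {
      // FUTEX_WAKE_OP: atomically store WS_Signalled into *waiter_word, wake up
      // to 1 thread sleeping on queue_word, and, if the old value of
      // *waiter_word was WS_Waiting, also wake up to 1 thread sleeping on
      // waiter_word.
      return syscall(SYS_futex, queue_word, FUTEX_WAKE_OP, /*nr_wake=*/1,
                     /*nr_wake2=*/1, waiter_word,
                     FUTEX_OP(FUTEX_OP_SET, WS_Signalled, FUTEX_OP_CMP_EQ,
                              WS_Waiting));
    }

Compared with unlocking qmtx and then issuing a separate FUTEX_WAKE, this saves one kernel round trip.
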
diff --git a/libc/src/threads/linux/cnd_broadcast.cpp b/libc/src/threads/linux/cnd_broadcast.cpp
index 180ac6d..a56aaa2 100644
--- a/libc/src/threads/linux/cnd_broadcast.cpp
+++ b/libc/src/threads/linux/cnd_broadcast.cpp
@@ -6,16 +6,21 @@
//
//===----------------------------------------------------------------------===//
-#include "CndVar.h"
-
#include "src/threads/cnd_broadcast.h"
#include "src/__support/common.h"
+#include "src/__support/threads/CndVar.h"
+
+// TODO: https://github.com/llvm/llvm-project/issues/92968
+#include <threads.h> // cnd_t, thrd_error, thrd_success
namespace LIBC_NAMESPACE {
+static_assert(sizeof(CndVar) == sizeof(cnd_t));
+
LLVM_LIBC_FUNCTION(int, cnd_broadcast, (cnd_t * cond)) {
CndVar *cndvar = reinterpret_cast<CndVar *>(cond);
- return cndvar->broadcast();
+ cndvar->broadcast();
+ return thrd_success;
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/threads/linux/cnd_destroy.cpp b/libc/src/threads/linux/cnd_destroy.cpp
index 08eb3a1..2b03b18 100644
--- a/libc/src/threads/linux/cnd_destroy.cpp
+++ b/libc/src/threads/linux/cnd_destroy.cpp
@@ -6,13 +6,16 @@
//
//===----------------------------------------------------------------------===//
-#include "CndVar.h"
-
#include "src/threads/cnd_destroy.h"
#include "src/__support/common.h"
+#include "src/__support/threads/CndVar.h"
+
+#include <threads.h> // cnd_t
namespace LIBC_NAMESPACE {
+static_assert(sizeof(CndVar) == sizeof(cnd_t));
+
LLVM_LIBC_FUNCTION(void, cnd_destroy, (cnd_t * cond)) {
CndVar *cndvar = reinterpret_cast<CndVar *>(cond);
CndVar::destroy(cndvar);
diff --git a/libc/src/threads/linux/cnd_init.cpp b/libc/src/threads/linux/cnd_init.cpp
index 5e3f360..d3d2c8a 100644
--- a/libc/src/threads/linux/cnd_init.cpp
+++ b/libc/src/threads/linux/cnd_init.cpp
@@ -6,16 +6,19 @@
//
//===----------------------------------------------------------------------===//
-#include "CndVar.h"
-
#include "src/threads/cnd_init.h"
#include "src/__support/common.h"
+#include "src/__support/threads/CndVar.h"
+
+#include <threads.h> // cnd_t, thrd_error, thrd_success
namespace LIBC_NAMESPACE {
+static_assert(sizeof(CndVar) == sizeof(cnd_t));
+
LLVM_LIBC_FUNCTION(int, cnd_init, (cnd_t * cond)) {
CndVar *cndvar = reinterpret_cast<CndVar *>(cond);
- return CndVar::init(cndvar);
+ return CndVar::init(cndvar) ? thrd_error : thrd_success;
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/threads/linux/cnd_signal.cpp b/libc/src/threads/linux/cnd_signal.cpp
index dba01ab..f144013 100644
--- a/libc/src/threads/linux/cnd_signal.cpp
+++ b/libc/src/threads/linux/cnd_signal.cpp
@@ -6,16 +6,20 @@
//
//===----------------------------------------------------------------------===//
-#include "CndVar.h"
-
#include "src/threads/cnd_signal.h"
#include "src/__support/common.h"
+#include "src/__support/threads/CndVar.h"
+
+#include <threads.h> // cnd_t, thrd_error, thrd_success
namespace LIBC_NAMESPACE {
+static_assert(sizeof(CndVar) == sizeof(cnd_t));
+
LLVM_LIBC_FUNCTION(int, cnd_signal, (cnd_t * cond)) {
CndVar *cndvar = reinterpret_cast<CndVar *>(cond);
- return cndvar->notify_one();
+ cndvar->notify_one();
+ return thrd_success;
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/threads/linux/cnd_wait.cpp b/libc/src/threads/linux/cnd_wait.cpp
index db3d7f1..97cade3 100644
--- a/libc/src/threads/linux/cnd_wait.cpp
+++ b/libc/src/threads/linux/cnd_wait.cpp
@@ -6,18 +6,21 @@
//
//===----------------------------------------------------------------------===//
-#include "CndVar.h"
-
+#include "src/threads/cnd_wait.h"
#include "src/__support/common.h"
+#include "src/__support/threads/CndVar.h"
#include "src/__support/threads/mutex.h"
-#include "src/threads/cnd_wait.h"
+
+#include <threads.h> // cnd_t, mtx_t, thrd_error, thrd_success
namespace LIBC_NAMESPACE {
+static_assert(sizeof(CndVar) == sizeof(cnd_t));
+
LLVM_LIBC_FUNCTION(int, cnd_wait, (cnd_t * cond, mtx_t *mtx)) {
CndVar *cndvar = reinterpret_cast<CndVar *>(cond);
Mutex *mutex = reinterpret_cast<Mutex *>(mtx);
- return cndvar->wait(mutex);
+ return cndvar->wait(mutex) ? thrd_error : thrd_success;
}
} // namespace LIBC_NAMESPACE
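
For reference, a minimal usage sketch of the <threads.h> API these wrappers implement (assuming a hosted Linux toolchain; names such as worker and ready are illustrative, and error checking is elided):

    #include <threads.h>

    static mtx_t lock;
    static cnd_t cv;
    static bool ready = false;

    static int worker(void *) {
      mtx_lock(&lock);
      while (!ready)           // re-check the predicate: wakeups can be spurious
        cnd_wait(&cv, &lock);  // releases `lock` while sleeping, re-locks on return
      mtx_unlock(&lock);
      return 0;
    }

    int main() {
      mtx_init(&lock, mtx_plain);
      cnd_init(&cv);
      thrd_t t;
      thrd_create(&t, worker, nullptr);

      mtx_lock(&lock);
      ready = true;
      mtx_unlock(&lock);
      cnd_signal(&cv);         // cnd_broadcast would wake every waiter instead

      thrd_join(t, nullptr);
      cnd_destroy(&cv);
      mtx_destroy(&lock);
    }

The predicate loop on the wait side guards against spurious wakeups, which the underlying futex-based implementation permits.
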
diff --git a/libc/startup/baremetal/CMakeLists.txt b/libc/startup/baremetal/CMakeLists.txt
new file mode 100644
index 0000000..4faced9
--- /dev/null
+++ b/libc/startup/baremetal/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_entrypoint_object(
+ init
+ SRCS
+ init.cpp
+)
+
+add_entrypoint_object(
+ fini
+ SRCS
+ fini.cpp
+)
diff --git a/libc/startup/baremetal/fini.cpp b/libc/startup/baremetal/fini.cpp
new file mode 100644
index 0000000..84997fb
--- /dev/null
+++ b/libc/startup/baremetal/fini.cpp
@@ -0,0 +1,27 @@
+//===-- Implementation file of __libc_fini_array --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <stddef.h>
+#include <stdint.h>
+
+extern "C" {
+extern uintptr_t __fini_array_start[];
+extern uintptr_t __fini_array_end[];
+}
+
+namespace LIBC_NAMESPACE {
+
+using FiniCallback = void(void);
+
+extern "C" void __libc_fini_array(void) {
+ size_t fini_array_size = __fini_array_end - __fini_array_start;
+ for (size_t i = fini_array_size; i > 0; --i)
+ reinterpret_cast<FiniCallback *>(__fini_array_start[i - 1])();
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/startup/baremetal/init.cpp b/libc/startup/baremetal/init.cpp
new file mode 100644
index 0000000..08dff74
--- /dev/null
+++ b/libc/startup/baremetal/init.cpp
@@ -0,0 +1,32 @@
+//===-- Implementation file of __libc_init_array --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <stddef.h>
+#include <stdint.h>
+
+extern "C" {
+extern uintptr_t __preinit_array_start[];
+extern uintptr_t __preinit_array_end[];
+extern uintptr_t __init_array_start[];
+extern uintptr_t __init_array_end[];
+}
+
+namespace LIBC_NAMESPACE {
+
+using InitCallback = void(void);
+
+extern "C" void __libc_init_array(void) {
+ size_t preinit_array_size = __preinit_array_end - __preinit_array_start;
+ for (size_t i = 0; i < preinit_array_size; ++i)
+ reinterpret_cast<InitCallback *>(__preinit_array_start[i])();
+ size_t init_array_size = __init_array_end - __init_array_start;
+ for (size_t i = 0; i < init_array_size; ++i)
+ reinterpret_cast<InitCallback *>(__init_array_start[i])();
+}
+
+} // namespace LIBC_NAMESPACE
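
These two new files walk arrays that the toolchain and the user's linker script provide: the compiler emits pointers to constructor/destructor functions into .preinit_array/.init_array/.fini_array, and the baremetal linker script is expected to define the __*_array_start/__*_array_end boundary symbols used above. A hedged C++ sketch of how entries land in those arrays (the linker-script fragment in the trailing comment is an assumption about the user's script, not something this patch provides):

    // With GCC/Clang these attributes place the functions' addresses into
    // .init_array / .fini_array; __libc_init_array() then calls on_boot() in
    // array order and __libc_fini_array() calls on_shutdown() in reverse order.
    __attribute__((constructor)) static void on_boot() {
      // e.g. bring up a UART or set up the heap before main()
    }

    __attribute__((destructor)) static void on_shutdown() {
      // e.g. flush buffered output after main() returns
    }

    // Rough linker-script shape expected by init.cpp/fini.cpp:
    //   __init_array_start = .; KEEP(*(.init_array*)); __init_array_end = .;
    //   __fini_array_start = .; KEEP(*(.fini_array*)); __fini_array_end = .;
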
diff --git a/libc/test/integration/scudo/CMakeLists.txt b/libc/test/integration/scudo/CMakeLists.txt
index 8a085b6..a5f7e3b 100644
--- a/libc/test/integration/scudo/CMakeLists.txt
+++ b/libc/test/integration/scudo/CMakeLists.txt
@@ -9,6 +9,7 @@ endif()
# test will have to link to the LLVM libc startup system. LLVM libc's startup
# system is not complete enough to allow this. It is also desirable to
# keep the dependencies as minimal as possible.
+
add_entrypoint_library(
libc_for_scudo_integration_test
DEPENDS
@@ -17,6 +18,9 @@ add_entrypoint_library(
libc.src.stdlib.realloc
libc.src.stdlib.aligned_alloc
libc.src.stdlib.free
+ libc.src.errno.errno
+ libc.src.unistd.__llvm_libc_syscall
+ libc.src.sched.__sched_getcpucount
)
add_executable(
diff --git a/libc/utils/docgen/ctype.json b/libc/utils/docgen/ctype.json
index 25eeb68..af97e4b 100644
--- a/libc/utils/docgen/ctype.json
+++ b/libc/utils/docgen/ctype.json
@@ -1,46 +1,46 @@
{
"functions": {
"isalnum": {
- "defined": "7.4.1.1"
+ "c-definition": "7.4.1.1"
},
"isalpha": {
- "defined": "7.4.1.2"
+ "c-definition": "7.4.1.2"
},
"isblank": {
- "defined": "7.4.1.3"
+ "c-definition": "7.4.1.3"
},
"iscntrl": {
- "defined": "7.4.1.4"
+ "c-definition": "7.4.1.4"
},
"isdigit": {
- "defined": "7.4.1.5"
+ "c-definition": "7.4.1.5"
},
"isgraph": {
- "defined": "7.4.1.6"
+ "c-definition": "7.4.1.6"
},
"islower": {
- "defined": "7.4.1.7"
+ "c-definition": "7.4.1.7"
},
"isprint": {
- "defined": "7.4.1.8"
+ "c-definition": "7.4.1.8"
},
"ispunct": {
- "defined": "7.4.1.9"
+ "c-definition": "7.4.1.9"
},
"isspace": {
- "defined": "7.4.1.10"
+ "c-definition": "7.4.1.10"
},
"isupper": {
- "defined": "7.4.1.11"
+ "c-definition": "7.4.1.11"
},
"isxdigit": {
- "defined": "7.4.1.12"
+ "c-definition": "7.4.1.12"
},
"tolower" : {
- "defined": "7.4.2.1"
+ "c-definition": "7.4.2.1"
},
"toupper": {
- "defined": "7.4.2.2"
+ "c-definition": "7.4.2.2"
}
}
}
diff --git a/libc/utils/docgen/docgen.py b/libc/utils/docgen/docgen.py
index 23d4530..25e22d4 100755
--- a/libc/utils/docgen/docgen.py
+++ b/libc/utils/docgen/docgen.py
@@ -13,70 +13,167 @@ from typing import Dict
import sys
import json
-
-def load_api(hname: str) -> Dict:
- p = Path(__file__).parent / Path(hname).with_suffix(".json")
- api = p.read_text(encoding="utf-8")
+from header import Header
+
+
+class DocgenAPIFormatError(Exception):
+ """Raised on fatal formatting errors with a description of a formatting error"""
+
+
+def check_api(header: Header, api: Dict):
+ """
+ Checks that docgen json files are properly formatted. If there are any
+ fatal formatting errors, raises exceptions with error messages useful for
+ fixing formatting. Warnings are printed to stderr on non-fatal formatting
+ errors. The code that runs after ``check_api(api)`` is called expects that
+ ``check_api`` executed without raising formatting exceptions so the json
+ matches the formatting specified here.
+
+ The json file may contain:
+ * an optional macros object
+ * an optional functions object
+
+ Formatting of ``macros`` and ``functions`` objects
+ ==================================================
+
+ If a macros or functions object is present, then it may contain nested
+ objects. Each of these nested objects should have a name matching a macro
+ or function's name, and each nested object must have the property:
+ ``"c-definition"`` or ``"posix-definition"``.
+
+ Description of properties
+ =========================
+    Each definition property is intended to be a reference to a part of the
+ standard that defines the function or macro. For the ``"c-definition"`` property,
+ this should be a C standard section number. For the ``"posix-definition"`` property,
+ this should be a link to the definition.
+
+ :param api: docgen json file contents parsed into a dict
+ """
+ errors = []
+ cdef = "c-definition"
+ pdef = "posix-definition"
+
+ # Validate macros
+ if "macros" in api:
+ if not header.macro_file_exists():
+ print(
+ f"warning: Macro definitions are listed for {header.name}, but no macro file can be found in the directory tree rooted at {header.macros_dir}. All macros will be listed as not implemented.",
+ file=sys.stderr,
+ )
+
+ macros = api["macros"]
+
+ for name, obj in macros.items():
+ if not (cdef in obj or pdef in obj):
+ err = f'error: Macro {name} does not contain at least one required property: "{cdef}" or "{pdef}"'
+ errors.append(err)
+
+ # Validate functions
+ if "functions" in api:
+ if not header.fns_dir_exists():
+ print(
+ f"warning: Function definitions are listed for {header.name}, but no function implementation directory exists at {header.fns_dir}. All functions will be listed as not implemented.",
+ file=sys.stderr,
+ )
+
+ fns = api["functions"]
+ for name, obj in fns.items():
+ if not (cdef in obj or pdef in obj):
+ err = f'error: function {name} does not contain at least one required property: "{cdef}" or "{pdef}"'
+ errors.append(err)
+
+ if errors:
+ raise DocgenAPIFormatError("\n".join(errors))
+
+
+def load_api(header: Header) -> Dict:
+ api = header.docgen_json.read_text(encoding="utf-8")
return json.loads(api)
-# TODO: we may need to get more sophisticated for less generic implementations.
-# Does libc/src/{hname minus .h suffix}/{fname}.cpp exist?
-def is_implemented(hname: str, fname: str) -> bool:
- path = Path(
- Path(__file__).parent.parent.parent,
- "src",
- hname.rstrip(".h")
+def print_tbl_dir():
+ print(
+ f"""
+.. list-table::
+ :widths: auto
+ :align: center
+ :header-rows: 1
+
+ * - Function
+ - Implemented
+ - C23 Standard Section
+ - POSIX.1-2017 Standard Section"""
)
- if not path.exists():
- raise FileNotFoundError(f"implementation dir does not exist: {path}")
- if not path.is_dir():
- raise NotADirectoryError(f"implementation dir is not a dir: {path}")
+def print_functions_rst(header: Header, functions: Dict):
+ tbl_hdr = "Functions"
+ print(tbl_hdr)
+ print("=" * len(tbl_hdr))
+
+ print_tbl_dir()
+
+ for name in sorted(functions.keys()):
+ print(f" * - {name}")
+
+ if header.fns_dir_exists() and header.implements_fn(name):
+ print(" - |check|")
+ else:
+ print(" -")
+
+ if "c-definition" in functions[name]:
+ print(f' - {functions[name]["c-definition"]}')
+ else:
+ print(" -")
+
+ if "posix-definition" in functions[name]:
+ print(f' - {functions[name]["posix-definition"]}')
+ else:
+ print(" -")
- # Recursively search for the target source file in the subdirectories under
- # libc/src/{hname}.
- for _ in path.glob("**/" + fname + ".cpp"):
- return True
- return False
+def print_macros_rst(header: Header, macros: Dict):
+ tbl_hdr = "Macros"
+ print(tbl_hdr)
+ print("=" * len(tbl_hdr))
+ print_tbl_dir()
-def print_functions(header: str, functions: Dict):
- for key in sorted(functions.keys()):
- print(f" * - {key}")
+ for name in sorted(macros.keys()):
+ print(f" * - {name}")
- if is_implemented(header, key):
+ if header.macro_file_exists() and header.implements_macro(name):
print(" - |check|")
else:
print(" -")
- # defined is optional. Having any content is optional.
- if functions[key] is not None and "defined" in functions[key]:
- print(f' - {functions[key]["defined"]}')
+ if "c-definition" in macros[name]:
+ print(f' - {macros[name]["c-definition"]}')
else:
print(" -")
+ if "posix-definition" in macros[name]:
+ print(f' - {macros[name]["posix-definition"]}')
+ else:
+ print(" -")
+ print()
+
-def print_header(header: str, api: Dict):
+def print_impl_status_rst(header: Header, api: Dict):
print(".. include:: check.rst\n")
- fns = f"{header} Functions"
- print(fns)
- print("=" * (len(fns)))
- print(
- f"""
-.. list-table::
- :widths: auto
- :align: center
- :header-rows: 1
- * - Function
- - Implemented
- - Standard"""
- )
- # TODO: how do we want to signal implementation of macros?
- print_functions(header, api["functions"])
+ print("=" * len(header.name))
+ print(header.name)
+ print("=" * len(header.name))
+ print()
+
+ # the macro and function sections are both optional
+ if "macros" in api:
+ print_macros_rst(header, api["macros"])
+
+ if "functions" in api:
+ print_functions_rst(header, api["functions"])
def parse_args() -> Namespace:
@@ -88,6 +185,8 @@ def parse_args() -> Namespace:
if __name__ == "__main__":
args = parse_args()
- api = load_api(args.header_name)
+ header = Header(args.header_name)
+ api = load_api(header)
+ check_api(header, api)
- print_header(args.header_name, api)
+ print_impl_status_rst(header, api)
diff --git a/libc/utils/docgen/fenv.json b/libc/utils/docgen/fenv.json
index 9aa3f64..788b196 100644
--- a/libc/utils/docgen/fenv.json
+++ b/libc/utils/docgen/fenv.json
@@ -1,114 +1,114 @@
{
"macros": {
"__STDC_VERSION_FENV_H__": {
- "defined": "7.6.5"
+ "c-definition": "7.6.5"
},
"FE_DIVBYZERO": {
- "defined": "7.6.9"
+ "c-definition": "7.6.9"
},
"FE_INEXACT": {
- "defined": "7.6.9"
+ "c-definition": "7.6.9"
},
"FE_INVALID": {
- "defined": "7.6.9"
+ "c-definition": "7.6.9"
},
"FE_OVERFLOW": {
- "defined": "7.6.9"
+ "c-definition": "7.6.9"
},
"FE_UNDERFLOW": {
- "defined": "7.6.9"
+ "c-definition": "7.6.9"
},
"FE_ALL_EXCEPT": {
- "defined": "7.6.12"
+ "c-definition": "7.6.12"
},
"FE_DFL_MODE": {
- "defined": "7.6.11"
+ "c-definition": "7.6.11"
},
"FE_DOWNARD": {
- "defined": "7.6.13"
+ "c-definition": "7.6.13"
},
"FE_TONEAREST": {
- "defined": "7.6.13"
+ "c-definition": "7.6.13"
},
"FE_TONEARESTFROMZERO": {
- "defined": "7.6.13"
+ "c-definition": "7.6.13"
},
"FE_TOWARDZERO": {
- "defined": "7.6.13"
+ "c-definition": "7.6.13"
},
"FE_UPWARD": {
- "defined": "7.6.13"
+ "c-definition": "7.6.13"
},
"FE_DEC_DOWNWARD": {
- "defined": "7.6.14"
+ "c-definition": "7.6.14"
},
"FE_DEC_TONEAREST": {
- "defined": "7.6.14"
+ "c-definition": "7.6.14"
},
"FE_DEC_TONEARESTFROMZERO": {
- "defined": "7.6.14"
+ "c-definition": "7.6.14"
},
"FE_DEC_TOWARDZERO": {
- "defined": "7.6.14"
+ "c-definition": "7.6.14"
},
"FE_DEC_UPWARD": {
- "defined": "7.6.14"
+ "c-definition": "7.6.14"
},
"FE_DFL_ENV": {
- "defined": "7.6.17"
+ "c-definition": "7.6.17"
}
},
"functions": {
"feclearexcept": {
- "defined": "7.6.4.1"
+ "c-definition": "7.6.4.1"
},
"fegetexceptflag": {
- "defined": "7.6.4.2"
+ "c-definition": "7.6.4.2"
},
"feraiseexcept": {
- "defined": "7.6.4.3"
+ "c-definition": "7.6.4.3"
},
"fesetexcept": {
- "defined": "7.6.4.4"
+ "c-definition": "7.6.4.4"
},
"fesetexceptflag": {
- "defined": "7.6.4.5"
+ "c-definition": "7.6.4.5"
},
"fetestexceptflag": {
- "defined": "7.6.4.6"
+ "c-definition": "7.6.4.6"
},
"fetestexcept": {
- "defined": "7.6.4.7"
+ "c-definition": "7.6.4.7"
},
"fegetmode": {
- "defined": "7.6.5.1"
+ "c-definition": "7.6.5.1"
},
"fegetround": {
- "defined": "7.6.5.2"
+ "c-definition": "7.6.5.2"
},
"fe_dec_getround": {
- "defined": "7.6.5.3"
+ "c-definition": "7.6.5.3"
},
"fesetmode": {
- "defined": "7.6.5.4"
+ "c-definition": "7.6.5.4"
},
"fesetround": {
- "defined": "7.6.5.5"
+ "c-definition": "7.6.5.5"
},
"fe_dec_setround": {
- "defined": "7.6.5.6"
+ "c-definition": "7.6.5.6"
},
"fegetenv": {
- "defined": "7.6.6.1"
+ "c-definition": "7.6.6.1"
},
"feholdexcept": {
- "defined": "7.6.6.2"
+ "c-definition": "7.6.6.2"
},
"fesetenv": {
- "defined": "7.6.6.3"
+ "c-definition": "7.6.6.3"
},
"feupdateenv": {
- "defined": "7.6.6.4"
+ "c-definition": "7.6.6.4"
}
}
}
diff --git a/libc/utils/docgen/header.py b/libc/utils/docgen/header.py
new file mode 100644
index 0000000..dde2100
--- /dev/null
+++ b/libc/utils/docgen/header.py
@@ -0,0 +1,87 @@
+# ====- Information about standard headers used by docgen ----*- python -*--==#
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# ==-------------------------------------------------------------------------==#
+from pathlib import Path
+from typing import Generator
+
+
+class Header:
+ """
+ Maintains implementation information about a standard header file:
+ * where does its implementation dir live
+ * where is its macros file
+ * where is its docgen json file
+
+ By convention, the macro-only part of a header file is in a header-specific
+ file somewhere in the directory tree with root at
+ ``$LLVM_PROJECT_ROOT/libc/include/llvm-libc-macros``. Docgen expects that
+ if a macro is implemented, that it appears in a string
+ ``#define MACRO_NAME`` in some ``*-macros.h`` file in the directory tree.
+ Docgen searches for this string in the file to set the implementation status
+ shown in the generated rst docs rendered as html for display at
+ <libc.llvm.org>.
+
+ By convention, each function for a header is implemented in a function-specific
+ cpp file somewhere in the directory tree with root at, e.g,
+ ``$LLVM_PROJECT_ROOT/libc/src/fenv``. Some headers have architecture-specific
+ implementations, like ``math``, and some don't, like ``fenv``. Docgen uses the
+ presence of this function-specific cpp file to set the implementation status
+ shown in the generated rst docs rendered as html for display at
+ <libc.llvm.org>.
+ """
+
+ def __init__(self, header_name: str):
+ """
+ :param header_name: e.g., ``"threads.h"`` or ``"signal.h"``
+ """
+ self.name = header_name
+ self.stem = header_name.rstrip(".h")
+ self.docgen_root = Path(__file__).parent
+ self.libc_root = self.docgen_root.parent.parent
+ self.docgen_json = self.docgen_root / Path(header_name).with_suffix(".json")
+ self.fns_dir = Path(self.libc_root, "src", self.stem)
+ self.macros_dir = Path(self.libc_root, "include", "llvm-libc-macros")
+
+ def macro_file_exists(self) -> bool:
+ for _ in self.__get_macro_files():
+ return True
+
+ return False
+
+ def fns_dir_exists(self) -> bool:
+ return self.fns_dir.exists() and self.fns_dir.is_dir()
+
+ def implements_fn(self, fn_name: str) -> bool:
+ for _ in self.fns_dir.glob(f"**/{fn_name}.cpp"):
+ return True
+
+ return False
+
+ def implements_macro(self, m_name: str) -> bool:
+ """
+ Some macro files are in, e.g.,
+ ``$LLVM_PROJECT_ROOT/libc/include/llvm-libc-macros/fenv-macros.h``,
+ but others are in subdirectories, e.g., ``signal.h`` has the macro
+ definitions in
+ ``$LLVM_PROJECT_ROOT/libc/include/llvm-libc-macros/linux/signal-macros.h``.
+
+ :param m_name: name of macro, e.g., ``FE_ALL_EXCEPT``
+ """
+ for f in self.__get_macro_files():
+ if f"#define {m_name}" in f.read_text():
+ return True
+
+ return False
+
+ def __get_macro_files(self) -> Generator[Path, None, None]:
+ """
+        This function uses a glob on, e.g., ``"**/fcntl-macros.h"`` because the
+ macro file might be located in a subdirectory:
+ libc/include/llvm-libc-macros/fcntl-macros.h
+ libc/include/llvm-libc-macros/linux/fcntl-macros.h
+ """
+ return self.macros_dir.glob(f"**/{self.stem}-macros.h")
diff --git a/libc/utils/docgen/signal.json b/libc/utils/docgen/signal.json
index d5380d3..337b0c1 100644
--- a/libc/utils/docgen/signal.json
+++ b/libc/utils/docgen/signal.json
@@ -1,47 +1,152 @@
{
"macros": {
"SIG_DFL": {
- "defined": "7.14.3"
+ "c-definition": "7.14.3",
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
},
"SIG_ERR": {
- "defined": "7.14.3"
+ "c-definition": "7.14.3",
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIG_HOLD": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
},
"SIG_IGN": {
- "defined": "7.14.3"
+ "c-definition": "7.14.3",
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGRTMIN": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGRTMAX": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
},
"SIGABRT": {
- "defined": "7.14.3"
+ "c-definition": "7.14.3",
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGALRM": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGBUS": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGCHLD": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGCONT": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
},
"SIGFPE": {
- "defined": "7.14.3"
+ "c-definition": "7.14.3",
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGHUP": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
},
"SIGILL": {
- "defined": "7.14.3"
+ "c-definition": "7.14.3",
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
},
"SIGINT": {
- "defined": "7.14.3"
+ "c-definition": "7.14.3",
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGKILL": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGPIPE": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGPIPE": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGQUIT": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
},
"SIGSEGV": {
- "defined": "7.14.3"
+ "c-definition": "7.14.3",
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGSTOP": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
},
"SIGTERM": {
- "defined": "7.14.3"
+ "c-definition": "7.14.3",
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGTSTP": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGTTIN": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGTTOU": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGUSR1": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGUSR2": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGPOLL": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGPROF": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGSYS": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGTRAP": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGURG": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGVTALRM": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGXCPU": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGXFSZ": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
}
},
"functions": {
"signal": {
- "defined": "7.14.1.1"
+ "c-definition": "7.14.1.1",
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/functions/signal.html"
},
"raise": {
- "defined": "7.14.2.1"
- },
- "kill": null,
- "sigaction": null,
- "sigaddset": null,
- "sigaltstack": null,
- "sigdelset": null,
- "sigemptyset": null,
- "sigfillset": null,
- "sigprocmask": null
+ "c-definition": "7.14.2.1",
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/functions/raise.html"
+ },
+ "kill": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/functions/kill.html"
+ },
+ "sigaction": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/functions/sigaction.html"
+ },
+ "sigaddset": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/functions/sigaddset.html"
+ },
+ "sigaltstack": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/functions/sigaltstack.html"
+ },
+ "sigdelset": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/functions/sigdelset.html"
+ },
+ "sigemptyset": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/functions/sigemptyset.html"
+ },
+ "sigfillset": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/functions/sigfillset.html"
+ },
+ "sigprocmask": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/functions/sigprocmask.html"
+ }
}
}
diff --git a/libc/utils/docgen/stdbit.json b/libc/utils/docgen/stdbit.json
index 88106cf..25060c1 100644
--- a/libc/utils/docgen/stdbit.json
+++ b/libc/utils/docgen/stdbit.json
@@ -1,270 +1,270 @@
{
"macros": {
"__STDC_VERSION_STDBIT_H__": {
- "defined": "7.18.1.2"
+ "c-definition": "7.18.1.2"
},
"__STDC_ENDIAN_LITTLE__": {
- "defined": "7.18.2.2"
+ "c-definition": "7.18.2.2"
},
"__STDC_ENDIAN_BIG__": {
- "defined": "7.18.2.2"
+ "c-definition": "7.18.2.2"
},
"__STDC_ENDIAN_NATIVE__": {
- "defined": "7.18.2.2"
+ "c-definition": "7.18.2.2"
},
"stdc_leading_zeros": {
- "defined": "7.18.3.1"
+ "c-definition": "7.18.3.1"
},
"stdc_leading_ones": {
- "defined": "7.18.4.1"
+ "c-definition": "7.18.4.1"
},
"stdc_trailing_zeros": {
- "defined": "7.18.5.1"
+ "c-definition": "7.18.5.1"
},
"stdc_trailing_ones": {
- "defined": "7.18.6.1"
+ "c-definition": "7.18.6.1"
},
"stdc_first_leading_zero": {
- "defined": "7.18.7.1"
+ "c-definition": "7.18.7.1"
},
"stdc_first_leading_one": {
- "defined": "7.18.8.1"
+ "c-definition": "7.18.8.1"
},
"stdc_first_trailing_zero": {
- "defined": "7.18.9.1"
+ "c-definition": "7.18.9.1"
},
"stdc_first_trailing_one": {
- "defined": "7.18.10.1"
+ "c-definition": "7.18.10.1"
},
"stdc_count_zeros": {
- "defined": "7.18.11.1"
+ "c-definition": "7.18.11.1"
},
"stdc_count_ones": {
- "defined": "7.18.12.1"
+ "c-definition": "7.18.12.1"
},
"stdc_has_single_bit": {
- "defined": "7.18.13.1"
+ "c-definition": "7.18.13.1"
},
"stdc_bit_width": {
- "defined": "7.18.14.1"
+ "c-definition": "7.18.14.1"
},
"stdc_bit_floor": {
- "defined": "7.18.15.1"
+ "c-definition": "7.18.15.1"
},
"stdc_bit_ceil": {
- "defined": "7.18.16.1"
+ "c-definition": "7.18.16.1"
}
},
"functions": {
"stdc_leading_zeros_uc": {
- "defined": "7.18.3"
+ "c-definition": "7.18.3"
},
"stdc_leading_zeros_us": {
- "defined": "7.18.3"
+ "c-definition": "7.18.3"
},
"stdc_leading_zeros_ui": {
- "defined": "7.18.3"
+ "c-definition": "7.18.3"
},
"stdc_leading_zeros_ul": {
- "defined": "7.18.3"
+ "c-definition": "7.18.3"
},
"stdc_leading_zeros_ull": {
- "defined": "7.18.3"
+ "c-definition": "7.18.3"
},
"stdc_leading_ones_uc": {
- "defined": "7.18.4"
+ "c-definition": "7.18.4"
},
"stdc_leading_ones_us": {
- "defined": "7.18.4"
+ "c-definition": "7.18.4"
},
"stdc_leading_ones_ui": {
- "defined": "7.18.4"
+ "c-definition": "7.18.4"
},
"stdc_leading_ones_ul": {
- "defined": "7.18.4"
+ "c-definition": "7.18.4"
},
"stdc_leading_ones_ull": {
- "defined": "7.18.4"
+ "c-definition": "7.18.4"
},
"stdc_trailing_zeros_uc": {
- "defined": "7.18.5"
+ "c-definition": "7.18.5"
},
"stdc_trailing_zeros_us": {
- "defined": "7.18.5"
+ "c-definition": "7.18.5"
},
"stdc_trailing_zeros_ui": {
- "defined": "7.18.5"
+ "c-definition": "7.18.5"
},
"stdc_trailing_zeros_ul": {
- "defined": "7.18.5"
+ "c-definition": "7.18.5"
},
"stdc_trailing_zeros_ull": {
- "defined": "7.18.5"
+ "c-definition": "7.18.5"
},
"stdc_trailing_ones_uc": {
- "defined": "7.18.6"
+ "c-definition": "7.18.6"
},
"stdc_trailing_ones_us": {
- "defined": "7.18.6"
+ "c-definition": "7.18.6"
},
"stdc_trailing_ones_ui": {
- "defined": "7.18.6"
+ "c-definition": "7.18.6"
},
"stdc_trailing_ones_ul": {
- "defined": "7.18.6"
+ "c-definition": "7.18.6"
},
"stdc_trailing_ones_ull": {
- "defined": "7.18.6"
+ "c-definition": "7.18.6"
},
"stdc_first_leading_zero_uc": {
- "defined": "7.18.7"
+ "c-definition": "7.18.7"
},
"stdc_first_leading_zero_us": {
- "defined": "7.18.7"
+ "c-definition": "7.18.7"
},
"stdc_first_leading_zero_ui": {
- "defined": "7.18.7"
+ "c-definition": "7.18.7"
},
"stdc_first_leading_zero_ul": {
- "defined": "7.18.7"
+ "c-definition": "7.18.7"
},
"stdc_first_leading_zero_ull": {
- "defined": "7.18.7"
+ "c-definition": "7.18.7"
},
"stdc_first_leading_one_uc": {
- "defined": "7.18.8"
+ "c-definition": "7.18.8"
},
"stdc_first_leading_one_us": {
- "defined": "7.18.8"
+ "c-definition": "7.18.8"
},
"stdc_first_leading_one_ui": {
- "defined": "7.18.8"
+ "c-definition": "7.18.8"
},
"stdc_first_leading_one_ul": {
- "defined": "7.18.8"
+ "c-definition": "7.18.8"
},
"stdc_first_leading_one_ull": {
- "defined": "7.18.8"
+ "c-definition": "7.18.8"
},
"stdc_first_trailing_zero_uc": {
- "defined": "7.18.9"
+ "c-definition": "7.18.9"
},
"stdc_first_trailing_zero_us": {
- "defined": "7.18.9"
+ "c-definition": "7.18.9"
},
"stdc_first_trailing_zero_ui": {
- "defined": "7.18.9"
+ "c-definition": "7.18.9"
},
"stdc_first_trailing_zero_ul": {
- "defined": "7.18.9"
+ "c-definition": "7.18.9"
},
"stdc_first_trailing_zero_ull": {
- "defined": "7.18.9"
+ "c-definition": "7.18.9"
},
"stdc_first_trailing_one_uc": {
- "defined": "7.18.10"
+ "c-definition": "7.18.10"
},
"stdc_first_trailing_one_us": {
- "defined": "7.18.10"
+ "c-definition": "7.18.10"
},
"stdc_first_trailing_one_ui": {
- "defined": "7.18.10"
+ "c-definition": "7.18.10"
},
"stdc_first_trailing_one_ul": {
- "defined": "7.18.10"
+ "c-definition": "7.18.10"
},
"stdc_first_trailing_one_ull": {
- "defined": "7.18.10"
+ "c-definition": "7.18.10"
},
"stdc_count_zeros_uc": {
- "defined": "7.18.11"
+ "c-definition": "7.18.11"
},
"stdc_count_zeros_us": {
- "defined": "7.18.11"
+ "c-definition": "7.18.11"
},
"stdc_count_zeros_ui": {
- "defined": "7.18.11"
+ "c-definition": "7.18.11"
},
"stdc_count_zeros_ul": {
- "defined": "7.18.11"
+ "c-definition": "7.18.11"
},
"stdc_count_zeros_ull": {
- "defined": "7.18.11"
+ "c-definition": "7.18.11"
},
"stdc_count_ones_uc": {
- "defined": "7.18.12"
+ "c-definition": "7.18.12"
},
"stdc_count_ones_us": {
- "defined": "7.18.12"
+ "c-definition": "7.18.12"
},
"stdc_count_ones_ui": {
- "defined": "7.18.12"
+ "c-definition": "7.18.12"
},
"stdc_count_ones_ul": {
- "defined": "7.18.12"
+ "c-definition": "7.18.12"
},
"stdc_count_ones_ull": {
- "defined": "7.18.12"
+ "c-definition": "7.18.12"
},
"stdc_has_single_bit_uc": {
- "defined": "7.18.13"
+ "c-definition": "7.18.13"
},
"stdc_has_single_bit_us": {
- "defined": "7.18.13"
+ "c-definition": "7.18.13"
},
"stdc_has_single_bit_ui": {
- "defined": "7.18.13"
+ "c-definition": "7.18.13"
},
"stdc_has_single_bit_ul": {
- "defined": "7.18.13"
+ "c-definition": "7.18.13"
},
"stdc_has_single_bit_ull": {
- "defined": "7.18.13"
+ "c-definition": "7.18.13"
},
"stdc_bit_width_uc": {
- "defined": "7.18.14"
+ "c-definition": "7.18.14"
},
"stdc_bit_width_us": {
- "defined": "7.18.14"
+ "c-definition": "7.18.14"
},
"stdc_bit_width_ui": {
- "defined": "7.18.14"
+ "c-definition": "7.18.14"
},
"stdc_bit_width_ul": {
- "defined": "7.18.14"
+ "c-definition": "7.18.14"
},
"stdc_bit_width_ull": {
- "defined": "7.18.14"
+ "c-definition": "7.18.14"
},
"stdc_bit_floor_uc": {
- "defined": "7.18.15"
+ "c-definition": "7.18.15"
},
"stdc_bit_floor_us": {
- "defined": "7.18.15"
+ "c-definition": "7.18.15"
},
"stdc_bit_floor_ui": {
- "defined": "7.18.15"
+ "c-definition": "7.18.15"
},
"stdc_bit_floor_ul": {
- "defined": "7.18.15"
+ "c-definition": "7.18.15"
},
"stdc_bit_floor_ull": {
- "defined": "7.18.15"
+ "c-definition": "7.18.15"
},
"stdc_bit_ceil_uc": {
- "defined": "7.18.16"
+ "c-definition": "7.18.16"
},
"stdc_bit_ceil_us": {
- "defined": "7.18.16"
+ "c-definition": "7.18.16"
},
"stdc_bit_ceil_ui": {
- "defined": "7.18.16"
+ "c-definition": "7.18.16"
},
"stdc_bit_ceil_ul": {
- "defined": "7.18.16"
+ "c-definition": "7.18.16"
},
"stdc_bit_ceil_ull": {
- "defined": "7.18.16"
+ "c-definition": "7.18.16"
}
}
}
diff --git a/libc/utils/docgen/threads.json b/libc/utils/docgen/threads.json
index aef6ffa..8591cbd 100644
--- a/libc/utils/docgen/threads.json
+++ b/libc/utils/docgen/threads.json
@@ -1,87 +1,87 @@
{
"macros": {
"ONCE_FLAG_INIT": {
- "defined": "7.28.1.3"
+ "c-definition": "7.28.1.3"
},
"TSS_DTOR_ITERATIONS": {
- "defined": "7.28.1.3"
+ "c-definition": "7.28.1.3"
}
},
"functions": {
"call_once": {
- "defined": "7.28.2.1"
+ "c-definition": "7.28.2.1"
},
"cnd_broadcast": {
- "defined": "7.28.3.1"
+ "c-definition": "7.28.3.1"
},
"cnd_destroy": {
- "defined": "7.28.3.2"
+ "c-definition": "7.28.3.2"
},
"cnd_init": {
- "defined": "7.28.3.3"
+ "c-definition": "7.28.3.3"
},
"cnd_signal": {
- "defined": "7.28.3.4"
+ "c-definition": "7.28.3.4"
},
"cnd_timedwait": {
- "defined": "7.28.3.5"
+ "c-definition": "7.28.3.5"
},
"cnd_wait": {
- "defined": "7.28.3.6"
+ "c-definition": "7.28.3.6"
},
"mtx_destroy": {
- "defined": "7.28.4.1"
+ "c-definition": "7.28.4.1"
},
"mtx_init": {
- "defined": "7.28.4.2"
+ "c-definition": "7.28.4.2"
},
"mtx_lock": {
- "defined": "7.28.4.3"
+ "c-definition": "7.28.4.3"
},
"mtx_timedlock": {
- "defined": "7.28.4.4"
+ "c-definition": "7.28.4.4"
},
"mtx_trylock": {
- "defined": "7.28.4.5"
+ "c-definition": "7.28.4.5"
},
"mtx_unlock": {
- "defined": "7.28.4.6"
+ "c-definition": "7.28.4.6"
},
"thrd_create": {
- "defined": "7.28.5.1"
+ "c-definition": "7.28.5.1"
},
"thrd_current": {
- "defined": "7.28.5.2"
+ "c-definition": "7.28.5.2"
},
"thrd_detach": {
- "defined": "7.28.5.3"
+ "c-definition": "7.28.5.3"
},
"thrd_equal": {
- "defined": "7.28.5.4"
+ "c-definition": "7.28.5.4"
},
"thrd_exit": {
- "defined": "7.28.5.5"
+ "c-definition": "7.28.5.5"
},
"thrd_join": {
- "defined": "7.28.5.6"
+ "c-definition": "7.28.5.6"
},
"thrd_sleep": {
- "defined": "7.28.5.7"
+ "c-definition": "7.28.5.7"
},
"thrd_yield": {
- "defined": "7.28.5.8"
+ "c-definition": "7.28.5.8"
},
"tss_create": {
- "defined": "7.28.6.1"
+ "c-definition": "7.28.6.1"
},
"tss_delete": {
- "defined": "7.28.6.2"
+ "c-definition": "7.28.6.2"
},
"tss_get": {
- "defined": "7.28.6.3"
+ "c-definition": "7.28.6.3"
},
"tss_set": {
- "defined": "7.28.6.4"
+ "c-definition": "7.28.6.4"
}
}
}
diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst
index 83fcd40..0bc343a 100644
--- a/libcxx/docs/ReleaseNotes/19.rst
+++ b/libcxx/docs/ReleaseNotes/19.rst
@@ -53,6 +53,7 @@ Implemented Papers
- P2387R3 - Pipe support for user-defined range adaptors
- P2713R1 - Escaping improvements in ``std::format``
- P2231R1 - Missing ``constexpr`` in ``std::optional`` and ``std::variant``
+- P0019R8 - ``std::atomic_ref``
Improvements and New Features
-----------------------------
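
P0019R8's std::atomic_ref allows atomic operations on an object that was not declared std::atomic, provided all concurrent access goes through atomic_ref instances. A brief C++20 sketch (names are illustrative):

    #include <atomic>

    // Atomically bump a plain int, e.g. a counter shared across threads.
    void bump(int &counter) {
      std::atomic_ref<int> ref(counter);
      ref.fetch_add(1, std::memory_order_relaxed);
    }

    int read(int &counter) {
      return std::atomic_ref<int>(counter).load(std::memory_order_acquire);
    }

P1643R1, marked complete further down, additionally gives atomic_ref the wait()/notify_one()/notify_all() members.
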
diff --git a/libcxx/docs/Status/Cxx20Issues.csv b/libcxx/docs/Status/Cxx20Issues.csv
index db57b15..5f83fa3 100644
--- a/libcxx/docs/Status/Cxx20Issues.csv
+++ b/libcxx/docs/Status/Cxx20Issues.csv
@@ -70,7 +70,7 @@
"`3041 <https://wg21.link/LWG3041>`__","Unnecessary ``decay``\ in ``reference_wrapper``\ ","Jacksonville","|Complete|",""
"`3042 <https://wg21.link/LWG3042>`__","``is_literal_type_v``\ should be inline","Jacksonville","|Complete|",""
"`3043 <https://wg21.link/LWG3043>`__","Bogus postcondition for ``filesystem_error``\ constructor","Jacksonville","|Complete|",""
-"`3045 <https://wg21.link/LWG3045>`__","``atomic<floating-point>``\ doesn't have ``value_type``\ or ``difference_type``\ ","Jacksonville","",""
+"`3045 <https://wg21.link/LWG3045>`__","``atomic<floating-point>``\ doesn't have ``value_type``\ or ``difference_type``\ ","Jacksonville","|Complete|","18.0"
"`3048 <https://wg21.link/LWG3048>`__","``transform_reduce(exec, first1, last1, first2, init)``\ discards execution policy","Jacksonville","|Complete|","17.0"
"`3051 <https://wg21.link/LWG3051>`__","Floating point classifications were inadvertently changed in P0175","Jacksonville","|Nothing To Do|",""
"`3075 <https://wg21.link/LWG3075>`__","``basic_string``\ needs deduction guides from ``basic_string_view``\ ","Jacksonville","|Complete|",""
diff --git a/libcxx/docs/Status/Cxx20Papers.csv b/libcxx/docs/Status/Cxx20Papers.csv
index 955aa5f..6598cd18 100644
--- a/libcxx/docs/Status/Cxx20Papers.csv
+++ b/libcxx/docs/Status/Cxx20Papers.csv
@@ -26,7 +26,7 @@
"`P0905R1 <https://wg21.link/P0905R1>`__","CWG","Symmetry for spaceship","Jacksonville","|Complete|","7.0","|spaceship|"
"`P0966R1 <https://wg21.link/P0966R1>`__","LWG","``string::reserve``\ Should Not Shrink","Jacksonville","|Complete| [#note-P0966]_","12.0"
"","","","","","",""
-"`P0019R8 <https://wg21.link/P0019R8>`__","LWG","Atomic Ref","Rapperswil","",""
+"`P0019R8 <https://wg21.link/P0019R8>`__","LWG","Atomic Ref","Rapperswil","|Complete|","19.0"
"`P0458R2 <https://wg21.link/P0458R2>`__","LWG","Checking for Existence of an Element in Associative Containers","Rapperswil","|Complete|","13.0"
"`P0475R1 <https://wg21.link/P0475R1>`__","LWG","LWG 2511: guaranteed copy elision for piecewise construction","Rapperswil","|Complete|",""
"`P0476R2 <https://wg21.link/P0476R2>`__","LWG","Bit-casting object representations","Rapperswil","|Complete|","14.0"
@@ -125,7 +125,7 @@
"`P1612R1 <https://wg21.link/P1612R1>`__","LWG","Relocate Endian's Specification","Cologne","|Complete|","10.0"
"`P1614R2 <https://wg21.link/P1614R2>`__","LWG","The Mothership has Landed","Cologne","|In Progress|",""
"`P1638R1 <https://wg21.link/P1638R1>`__","LWG","basic_istream_view::iterator should not be copyable","Cologne","|Complete|","16.0","|ranges|"
-"`P1643R1 <https://wg21.link/P1643R1>`__","LWG","Add wait/notify to atomic_ref","Cologne","",""
+"`P1643R1 <https://wg21.link/P1643R1>`__","LWG","Add wait/notify to atomic_ref","Cologne","|Complete|","19.0"
"`P1644R0 <https://wg21.link/P1644R0>`__","LWG","Add wait/notify to atomic<shared_ptr>","Cologne","",""
"`P1650R0 <https://wg21.link/P1650R0>`__","LWG","Output std::chrono::days with 'd' suffix","Cologne","|Complete|","16.0"
"`P1651R0 <https://wg21.link/P1651R0>`__","LWG","bind_front should not unwrap reference_wrapper","Cologne","|Complete|","13.0"
diff --git a/libcxx/docs/Status/Cxx23Issues.csv b/libcxx/docs/Status/Cxx23Issues.csv
index d421fee..cc601b3 100644
--- a/libcxx/docs/Status/Cxx23Issues.csv
+++ b/libcxx/docs/Status/Cxx23Issues.csv
@@ -98,7 +98,7 @@
`3555 <https://wg21.link/LWG3555>`__,"``{transform,elements}_view::iterator::iterator_concept`` should consider const-qualification of the underlying range","June 2021","","","|ranges|"
"","","","","",""
`2191 <https://wg21.link/LWG2191>`__,"Incorrect specification of ``match_results(match_results&&)``","October 2021","|Nothing To Do|",""
-`2381 <https://wg21.link/LWG2381>`__,"Inconsistency in parsing floating point numbers","October 2021","",""
+`2381 <https://wg21.link/LWG2381>`__,"Inconsistency in parsing floating point numbers","October 2021","|Complete|","19.0"
`2762 <https://wg21.link/LWG2762>`__,"``unique_ptr operator*()`` should be ``noexcept``","October 2021","",""
`3121 <https://wg21.link/LWG3121>`__,"``tuple`` constructor constraints for ``UTypes&&...`` overloads","October 2021","",""
`3123 <https://wg21.link/LWG3123>`__,"``duration`` constructor from representation shouldn't be effectively non-throwing","October 2021","","","|chrono|"
diff --git a/libcxx/docs/Status/ParallelismProjects.csv b/libcxx/docs/Status/ParallelismProjects.csv
index 06da008..2ddac1e 100644
--- a/libcxx/docs/Status/ParallelismProjects.csv
+++ b/libcxx/docs/Status/ParallelismProjects.csv
@@ -24,6 +24,7 @@ Section,Description,Dependencies,Assignee,Complete
| `[parallel.simd.class] <https://wg21.link/N4808>`_, "`simd generate constructor <https://reviews.llvm.org/D159442>`_", None, Yin Zhang, |Complete|
| `[parallel.simd.class] <https://wg21.link/N4808>`_, "`simd load constructor <https://github.com/llvm/llvm-project/pull/76610>`_", None, Yin Zhang, |Complete|
| `[parallel.simd.class] <https://wg21.link/N4808>`_, "`simd subscript operators <https://github.com/llvm/llvm-project/pull/68960>`_", None, Yin Zhang, |Complete|
+| `[parallel.simd.class] <https://wg21.link/N4808>`_, "`simd copy functions <https://github.com/llvm/llvm-project/pull/78935>`_", None, Yin Zhang, |Complete|
| `[parallel.simd.class] <https://wg21.link/N4808>`_, "Class template simd implementation", None, Yin Zhang, |In Progress|
| `[parallel.simd.nonmembers] <https://wg21.link/N4808>`_, "simd non-member operations", None, Yin Zhang, |In Progress|
| `[parallel.simd.mask.class] <https://wg21.link/N4808>`_, "`Class template simd_mask declaration and alias <https://reviews.llvm.org/D144362>`_", [parallel.simd.abi], Yin Zhang, |Complete|
@@ -33,5 +34,6 @@ Section,Description,Dependencies,Assignee,Complete
| `[parallel.simd.mask.class] <https://wg21.link/N4808>`_, "`simd_mask implicit type conversion constructor <https://github.com/llvm/llvm-project/pull/71132>`_", None, Yin Zhang, |Complete|
| `[parallel.simd.mask.class] <https://wg21.link/N4808>`_, "`simd_mask load constructor <https://github.com/llvm/llvm-project/pull/76610>`_", None, Yin Zhang, |Complete|
| `[parallel.simd.mask.class] <https://wg21.link/N4808>`_, "`simd_mask subscript operators <https://github.com/llvm/llvm-project/pull/68960>`_", None, Yin Zhang, |Complete|
+| `[parallel.simd.mask.class] <https://wg21.link/N4808>`_, "`simd_mask copy functions <https://github.com/llvm/llvm-project/pull/78935>`_", None, Yin Zhang, |Complete|
| `[parallel.simd.mask.class] <https://wg21.link/N4808>`_, "Class template simd_mask implementation", None, Yin Zhang, |In Progress|
| `[parallel.simd.mask.nonmembers] <https://wg21.link/N4808>`_, "simd_mask non-member operations", None, Yin Zhang, |In Progress|
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 01e9c24..954e0c0 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -224,6 +224,7 @@ set(files
__atomic/atomic_flag.h
__atomic/atomic_init.h
__atomic/atomic_lock_free.h
+ __atomic/atomic_ref.h
__atomic/atomic_sync.h
__atomic/check_memory_order.h
__atomic/contention_t.h
@@ -232,6 +233,7 @@ set(files
__atomic/is_always_lock_free.h
__atomic/kill_dependency.h
__atomic/memory_order.h
+ __atomic/to_gcc_order.h
__availability
__bit/bit_cast.h
__bit/bit_ceil.h
diff --git a/libcxx/include/__algorithm/copy_move_common.h b/libcxx/include/__algorithm/copy_move_common.h
index 12a26c6..8a98451 100644
--- a/libcxx/include/__algorithm/copy_move_common.h
+++ b/libcxx/include/__algorithm/copy_move_common.h
@@ -21,7 +21,6 @@
#include <__type_traits/is_constant_evaluated.h>
#include <__type_traits/is_constructible.h>
#include <__type_traits/is_trivially_assignable.h>
-#include <__type_traits/is_trivially_copyable.h>
#include <__type_traits/is_volatile.h>
#include <__utility/move.h>
#include <__utility/pair.h>
diff --git a/libcxx/include/__algorithm/pstl_copy.h b/libcxx/include/__algorithm/pstl_copy.h
index 0fcea33..3e17131 100644
--- a/libcxx/include/__algorithm/pstl_copy.h
+++ b/libcxx/include/__algorithm/pstl_copy.h
@@ -20,7 +20,6 @@
#include <__type_traits/enable_if.h>
#include <__type_traits/is_constant_evaluated.h>
#include <__type_traits/is_execution_policy.h>
-#include <__type_traits/is_trivially_copyable.h>
#include <__type_traits/remove_cvref.h>
#include <__utility/move.h>
#include <optional>
@@ -95,10 +94,12 @@ template <class _ExecutionPolicy,
_LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_copy_n, _RawPolicy),
[&__policy](
_ForwardIterator __g_first, _Size __g_n, _ForwardOutIterator __g_result) -> optional<_ForwardIterator> {
- if constexpr (__has_random_access_iterator_category_or_concept<_ForwardIterator>::value)
+ if constexpr (__has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
return std::__copy(__policy, std::move(__g_first), std::move(__g_first + __g_n), std::move(__g_result));
- else
+ } else {
+ (void)__policy;
return std::copy_n(__g_first, __g_n, __g_result);
+ }
},
std::move(__first),
std::move(__n),
diff --git a/libcxx/include/__algorithm/pstl_count.h b/libcxx/include/__algorithm/pstl_count.h
index 64c84d8..65c96b2 100644
--- a/libcxx/include/__algorithm/pstl_count.h
+++ b/libcxx/include/__algorithm/pstl_count.h
@@ -87,8 +87,8 @@ template <class _ExecutionPolicy,
class _Tp,
class _RawPolicy = __remove_cvref_t<_ExecutionPolicy>,
enable_if_t<is_execution_policy_v<_RawPolicy>, int> = 0>
-[[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<__iter_diff_t<_ForwardIterator>>
-__count(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<__iter_diff_t<_ForwardIterator>> __count(
+ _ExecutionPolicy&& __policy, _ForwardIterator&& __first, _ForwardIterator&& __last, const _Tp& __value) noexcept {
return std::__pstl_frontend_dispatch(
_LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_count, _RawPolicy),
[&](_ForwardIterator __g_first, _ForwardIterator __g_last, const _Tp& __g_value)
@@ -97,8 +97,8 @@ __count(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator
return __v == __g_value;
});
},
- std::move(__first),
- std::move(__last),
+ std::forward<_ForwardIterator>(__first),
+ std::forward<_ForwardIterator>(__last),
__value);
}
diff --git a/libcxx/include/__algorithm/pstl_equal.h b/libcxx/include/__algorithm/pstl_equal.h
index 0b38197..47333da 100644
--- a/libcxx/include/__algorithm/pstl_equal.h
+++ b/libcxx/include/__algorithm/pstl_equal.h
@@ -91,7 +91,10 @@ _LIBCPP_HIDE_FROM_ABI bool
equal(_ExecutionPolicy&& __policy, _ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2) {
_LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator1, "equal requires ForwardIterators");
_LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator2, "equal requires ForwardIterators");
- return std::equal(__policy, std::move(__first1), std::move(__last1), std::move(__first2), std::equal_to{});
+ auto __res = std::__equal(__policy, std::move(__first1), std::move(__last1), std::move(__first2), std::equal_to{});
+ if (!__res)
+ std::__throw_bad_alloc();
+ return *__res;
}
template <class _ExecutionPolicy,
@@ -171,8 +174,11 @@ equal(_ExecutionPolicy&& __policy,
_ForwardIterator2 __last2) {
_LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator1, "equal requires ForwardIterators");
_LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator2, "equal requires ForwardIterators");
- return std::equal(
+ auto __res = std::__equal(
__policy, std::move(__first1), std::move(__last1), std::move(__first2), std::move(__last2), std::equal_to{});
+ if (!__res)
+ std::__throw_bad_alloc();
+ return *__res;
}
_LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_fill.h b/libcxx/include/__algorithm/pstl_fill.h
index fd24850..1032d77 100644
--- a/libcxx/include/__algorithm/pstl_fill.h
+++ b/libcxx/include/__algorithm/pstl_fill.h
@@ -41,8 +41,8 @@ template <class _ExecutionPolicy,
class _Tp,
class _RawPolicy = __remove_cvref_t<_ExecutionPolicy>,
enable_if_t<is_execution_policy_v<_RawPolicy>, int> = 0>
-_LIBCPP_HIDE_FROM_ABI optional<__empty>
-__fill(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) noexcept {
+_LIBCPP_HIDE_FROM_ABI optional<__empty> __fill(
+ _ExecutionPolicy&& __policy, _ForwardIterator&& __first, _ForwardIterator&& __last, const _Tp& __value) noexcept {
return std::__pstl_frontend_dispatch(
_LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_fill, _RawPolicy),
[&](_ForwardIterator __g_first, _ForwardIterator __g_last, const _Tp& __g_value) {
@@ -50,8 +50,8 @@ __fill(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator _
__element = __g_value;
});
},
- std::move(__first),
- std::move(__last),
+ std::forward<_ForwardIterator>(__first),
+ std::forward<_ForwardIterator>(__last),
__value);
}
diff --git a/libcxx/include/__algorithm/pstl_find.h b/libcxx/include/__algorithm/pstl_find.h
index b4c4dfb..998db70 100644
--- a/libcxx/include/__algorithm/pstl_find.h
+++ b/libcxx/include/__algorithm/pstl_find.h
@@ -65,8 +65,8 @@ template <class _ExecutionPolicy,
class _Predicate,
class _RawPolicy = __remove_cvref_t<_ExecutionPolicy>,
enable_if_t<is_execution_policy_v<_RawPolicy>, int> = 0>
-[[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<__remove_cvref_t<_ForwardIterator>>
-__find_if_not(_ExecutionPolicy&& __policy, _ForwardIterator&& __first, _ForwardIterator&& __last, _Predicate&& __pred) {
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<__remove_cvref_t<_ForwardIterator>> __find_if_not(
+ _ExecutionPolicy&& __policy, _ForwardIterator&& __first, _ForwardIterator&& __last, _Predicate&& __pred) noexcept {
return std::__pstl_frontend_dispatch(
_LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_find_if_not, _RawPolicy),
[&](_ForwardIterator&& __g_first, _ForwardIterator&& __g_last, _Predicate&& __g_pred)
@@ -76,9 +76,9 @@ __find_if_not(_ExecutionPolicy&& __policy, _ForwardIterator&& __first, _ForwardI
return !__g_pred(__value);
});
},
- std::move(__first),
- std::move(__last),
- std::move(__pred));
+ std::forward<_ForwardIterator>(__first),
+ std::forward<_ForwardIterator>(__last),
+ std::forward<_Predicate>(__pred));
}
template <class _ExecutionPolicy,
@@ -103,8 +103,8 @@ template <class _ExecutionPolicy,
class _Tp,
class _RawPolicy = __remove_cvref_t<_ExecutionPolicy>,
enable_if_t<is_execution_policy_v<_RawPolicy>, int> = 0>
-[[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<__remove_cvref_t<_ForwardIterator>>
-__find(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) noexcept {
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<__remove_cvref_t<_ForwardIterator>> __find(
+ _ExecutionPolicy&& __policy, _ForwardIterator&& __first, _ForwardIterator&& __last, const _Tp& __value) noexcept {
return std::__pstl_frontend_dispatch(
_LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_find, _RawPolicy),
[&](_ForwardIterator __g_first, _ForwardIterator __g_last, const _Tp& __g_value) -> optional<_ForwardIterator> {
@@ -113,8 +113,8 @@ __find(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator _
return __element == __g_value;
});
},
- std::move(__first),
- std::move(__last),
+ std::forward<_ForwardIterator>(__first),
+ std::forward<_ForwardIterator>(__last),
__value);
}
diff --git a/libcxx/include/__algorithm/pstl_generate.h b/libcxx/include/__algorithm/pstl_generate.h
index 350c0e4..78e4dd8 100644
--- a/libcxx/include/__algorithm/pstl_generate.h
+++ b/libcxx/include/__algorithm/pstl_generate.h
@@ -40,8 +40,8 @@ template <class _ExecutionPolicy,
class _Generator,
class _RawPolicy = __remove_cvref_t<_ExecutionPolicy>,
enable_if_t<is_execution_policy_v<_RawPolicy>, int> = 0>
-[[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<__empty>
-__generate(_ExecutionPolicy&& __policy, _ForwardIterator&& __first, _ForwardIterator&& __last, _Generator&& __gen) {
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<__empty> __generate(
+ _ExecutionPolicy&& __policy, _ForwardIterator&& __first, _ForwardIterator&& __last, _Generator&& __gen) noexcept {
return std::__pstl_frontend_dispatch(
_LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_generate, _RawPolicy),
[&__policy](_ForwardIterator __g_first, _ForwardIterator __g_last, _Generator __g_gen) {
@@ -77,7 +77,7 @@ template <class _ExecutionPolicy,
class _RawPolicy = __remove_cvref_t<_ExecutionPolicy>,
enable_if_t<is_execution_policy_v<_RawPolicy>, int> = 0>
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<__empty>
-__generate_n(_ExecutionPolicy&& __policy, _ForwardIterator&& __first, _Size&& __n, _Generator&& __gen) {
+__generate_n(_ExecutionPolicy&& __policy, _ForwardIterator&& __first, _Size&& __n, _Generator&& __gen) noexcept {
return std::__pstl_frontend_dispatch(
_LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_generate_n, _RawPolicy),
[&__policy](_ForwardIterator __g_first, _Size __g_n, _Generator __g_gen) {
diff --git a/libcxx/include/__algorithm/pstl_is_partitioned.h b/libcxx/include/__algorithm/pstl_is_partitioned.h
index 2dd5cf3..068502e 100644
--- a/libcxx/include/__algorithm/pstl_is_partitioned.h
+++ b/libcxx/include/__algorithm/pstl_is_partitioned.h
@@ -41,7 +41,7 @@ template <class _ExecutionPolicy,
class _RawPolicy = __remove_cvref_t<_ExecutionPolicy>,
enable_if_t<is_execution_policy_v<_RawPolicy>, int> = 0>
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<bool> __is_partitioned(
- _ExecutionPolicy&& __policy, _ForwardIterator&& __first, _ForwardIterator&& __last, _Predicate&& __pred) {
+ _ExecutionPolicy&& __policy, _ForwardIterator&& __first, _ForwardIterator&& __last, _Predicate&& __pred) noexcept {
return std::__pstl_frontend_dispatch(
_LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_is_partitioned, _RawPolicy),
[&__policy](_ForwardIterator __g_first, _ForwardIterator __g_last, _Predicate __g_pred) {
diff --git a/libcxx/include/__algorithm/pstl_merge.h b/libcxx/include/__algorithm/pstl_merge.h
index 87f634a..f76a281 100644
--- a/libcxx/include/__algorithm/pstl_merge.h
+++ b/libcxx/include/__algorithm/pstl_merge.h
@@ -16,6 +16,7 @@
#include <__type_traits/enable_if.h>
#include <__type_traits/is_execution_policy.h>
#include <__type_traits/remove_cvref.h>
+#include <__utility/forward.h>
#include <__utility/move.h>
#include <optional>
@@ -34,26 +35,26 @@ template <class _ExecutionPolicy,
class _ForwardIterator1,
class _ForwardIterator2,
class _ForwardOutIterator,
- class _Comp = std::less<>,
+ class _Comp,
class _RawPolicy = __remove_cvref_t<_ExecutionPolicy>,
enable_if_t<is_execution_policy_v<_RawPolicy>, int> = 0>
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator>
__merge(_ExecutionPolicy&&,
- _ForwardIterator1 __first1,
- _ForwardIterator1 __last1,
- _ForwardIterator2 __first2,
- _ForwardIterator2 __last2,
- _ForwardOutIterator __result,
- _Comp __comp = {}) noexcept {
+ _ForwardIterator1&& __first1,
+ _ForwardIterator1&& __last1,
+ _ForwardIterator2&& __first2,
+ _ForwardIterator2&& __last2,
+ _ForwardOutIterator&& __result,
+ _Comp&& __comp) noexcept {
using _Backend = typename __select_backend<_RawPolicy>::type;
return std::__pstl_merge<_RawPolicy>(
_Backend{},
- std::move(__first1),
- std::move(__last1),
- std::move(__first2),
- std::move(__last2),
- std::move(__result),
- std::move(__comp));
+ std::forward<_ForwardIterator1>(__first1),
+ std::forward<_ForwardIterator1>(__last1),
+ std::forward<_ForwardIterator2>(__first2),
+ std::forward<_ForwardIterator2>(__last2),
+ std::forward<_ForwardOutIterator>(__result),
+ std::forward<_Comp>(__comp));
}
template <class _ExecutionPolicy,
diff --git a/libcxx/include/__algorithm/pstl_move.h b/libcxx/include/__algorithm/pstl_move.h
index 3155dde..745fdef 100644
--- a/libcxx/include/__algorithm/pstl_move.h
+++ b/libcxx/include/__algorithm/pstl_move.h
@@ -20,7 +20,6 @@
#include <__type_traits/enable_if.h>
#include <__type_traits/is_constant_evaluated.h>
#include <__type_traits/is_execution_policy.h>
-#include <__type_traits/is_trivially_copyable.h>
#include <__type_traits/remove_cvref.h>
#include <optional>
diff --git a/libcxx/include/__algorithm/pstl_replace.h b/libcxx/include/__algorithm/pstl_replace.h
index b2ded54..456df21 100644
--- a/libcxx/include/__algorithm/pstl_replace.h
+++ b/libcxx/include/__algorithm/pstl_replace.h
@@ -91,8 +91,8 @@ template <class _ExecutionPolicy,
enable_if_t<is_execution_policy_v<_RawPolicy>, int> = 0>
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<__empty>
__replace(_ExecutionPolicy&& __policy,
- _ForwardIterator __first,
- _ForwardIterator __last,
+ _ForwardIterator&& __first,
+ _ForwardIterator&& __last,
const _Tp& __old_value,
const _Tp& __new_value) noexcept {
return std::__pstl_frontend_dispatch(
@@ -106,8 +106,8 @@ __replace(_ExecutionPolicy&& __policy,
[&](__iter_reference<_ForwardIterator> __element) { return __element == __g_old_value; },
__g_new_value);
},
- std::move(__first),
- std::move(__last),
+ std::forward<_ForwardIterator>(__first),
+ std::forward<_ForwardIterator>(__last),
__old_value,
__new_value);
}
@@ -144,7 +144,7 @@ template <class _ExecutionPolicy,
_ForwardIterator&& __last,
_ForwardOutIterator&& __result,
_Pred&& __pred,
- const _Tp& __new_value) {
+ const _Tp& __new_value) noexcept {
return std::__pstl_frontend_dispatch(
_LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_replace_copy_if, _RawPolicy),
[&__policy](_ForwardIterator __g_first,
diff --git a/libcxx/include/__algorithm/pstl_sort.h b/libcxx/include/__algorithm/pstl_sort.h
index 769dd81..1b978b2 100644
--- a/libcxx/include/__algorithm/pstl_sort.h
+++ b/libcxx/include/__algorithm/pstl_sort.h
@@ -41,17 +41,20 @@ template <class _ExecutionPolicy,
class _Comp,
class _RawPolicy = __remove_cvref_t<_ExecutionPolicy>,
enable_if_t<is_execution_policy_v<_RawPolicy>, int> = 0>
-[[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<__empty> __sort(
- _ExecutionPolicy&& __policy, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) noexcept {
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<__empty>
+__sort(_ExecutionPolicy&& __policy,
+ _RandomAccessIterator&& __first,
+ _RandomAccessIterator&& __last,
+ _Comp&& __comp) noexcept {
return std::__pstl_frontend_dispatch(
_LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_sort, _RawPolicy),
[&__policy](_RandomAccessIterator __g_first, _RandomAccessIterator __g_last, _Comp __g_comp) {
std::stable_sort(__policy, std::move(__g_first), std::move(__g_last), std::move(__g_comp));
return optional<__empty>{__empty{}};
},
- std::move(__first),
- std::move(__last),
- std::move(__comp));
+ std::forward<_RandomAccessIterator>(__first),
+ std::forward<_RandomAccessIterator>(__last),
+ std::forward<_Comp>(__comp));
}
template <class _ExecutionPolicy,
@@ -73,7 +76,8 @@ template <class _ExecutionPolicy,
_LIBCPP_HIDE_FROM_ABI void
sort(_ExecutionPolicy&& __policy, _RandomAccessIterator __first, _RandomAccessIterator __last) {
_LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(_RandomAccessIterator, "sort requires RandomAccessIterators");
- std::sort(std::forward<_ExecutionPolicy>(__policy), std::move(__first), std::move(__last), less{});
+ if (!std::__sort(__policy, std::move(__first), std::move(__last), less{}))
+ std::__throw_bad_alloc();
}
_LIBCPP_END_NAMESPACE_STD
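
For context, a minimal user-side sketch of the std::sort overload touched above, assuming a libc++ build with the experimental PSTL enabled (otherwise the <execution> policies are unavailable). The behavioral point — the frontend reports backend allocation failure by throwing std::bad_alloc — follows from the __sort dispatch shown in this hunk.

#include <algorithm>
#include <execution>
#include <vector>

int main() {
  std::vector<int> v = {5, 3, 1, 4, 2};
  // Dispatches through the PSTL frontend above; if the parallel backend cannot
  // allocate, std::sort reports it by throwing std::bad_alloc.
  std::sort(std::execution::par, v.begin(), v.end());
  return v.front() == 1 ? 0 : 1;
}
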
diff --git a/libcxx/include/__atomic/atomic_ref.h b/libcxx/include/__atomic/atomic_ref.h
new file mode 100644
index 0000000..156f196
--- /dev/null
+++ b/libcxx/include/__atomic/atomic_ref.h
@@ -0,0 +1,360 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Kokkos v. 4.0
+// Copyright (2022) National Technology & Engineering
+// Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+//===---------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ATOMIC_ATOMIC_REF_H
+#define _LIBCPP___ATOMIC_ATOMIC_REF_H
+
+#include <__assert>
+#include <__atomic/atomic_sync.h>
+#include <__atomic/check_memory_order.h>
+#include <__atomic/to_gcc_order.h>
+#include <__concepts/arithmetic.h>
+#include <__concepts/same_as.h>
+#include <__config>
+#include <__memory/addressof.h>
+#include <__type_traits/has_unique_object_representation.h>
+#include <__type_traits/is_trivially_copyable.h>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+# pragma GCC system_header
+#endif
+
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+#if _LIBCPP_STD_VER >= 20
+
+template <class _Tp>
+struct __atomic_ref_base {
+protected:
+ _Tp* __ptr_;
+
+ _LIBCPP_HIDE_FROM_ABI __atomic_ref_base(_Tp& __obj) : __ptr_(std::addressof(__obj)) {}
+
+private:
+ _LIBCPP_HIDE_FROM_ABI static _Tp* __clear_padding(_Tp& __val) noexcept {
+ _Tp* __ptr = std::addressof(__val);
+# if __has_builtin(__builtin_clear_padding)
+ __builtin_clear_padding(__ptr);
+# endif
+ return __ptr;
+ }
+
+ _LIBCPP_HIDE_FROM_ABI static bool __compare_exchange(
+ _Tp* __ptr, _Tp* __expected, _Tp* __desired, bool __is_weak, int __success, int __failure) noexcept {
+ if constexpr (
+# if __has_builtin(__builtin_clear_padding)
+ has_unique_object_representations_v<_Tp> || floating_point<_Tp>
+# else
+ true // NOLINT(readability-simplify-boolean-expr)
+# endif
+ ) {
+ return __atomic_compare_exchange(__ptr, __expected, __desired, __is_weak, __success, __failure);
+ } else { // _Tp has padding bits and __builtin_clear_padding is available
+ __clear_padding(*__desired);
+ _Tp __copy = *__expected;
+ __clear_padding(__copy);
+ // The algorithm we use here is basically to perform `__atomic_compare_exchange` on the
+ // values until it has either succeeded, or failed because the value representation of the
+ // objects involved was different. This is why we loop around __atomic_compare_exchange:
+ // we basically loop until its failure is caused by the value representation of the objects
+ // being different, not only their object representation.
+ while (true) {
+ _Tp __prev = __copy;
+ if (__atomic_compare_exchange(__ptr, std::addressof(__copy), __desired, __is_weak, __success, __failure)) {
+ return true;
+ }
+ _Tp __curr = __copy;
+ if (std::memcmp(__clear_padding(__prev), __clear_padding(__curr), sizeof(_Tp)) != 0) {
+          // The value representations (ignoring padding bits) do not compare equal ->
+          // write the current content of *__ptr into *__expected
+ std::memcpy(__expected, std::addressof(__copy), sizeof(_Tp));
+ return false;
+ }
+ }
+ }
+ }
+
+ friend struct __atomic_waitable_traits<__atomic_ref_base<_Tp>>;
+
+public:
+ using value_type = _Tp;
+
+ static constexpr size_t required_alignment = alignof(_Tp);
+
+ // The __atomic_always_lock_free builtin takes into account the alignment of the pointer if provided,
+ // so we create a fake pointer with a suitable alignment when querying it. Note that we are guaranteed
+ // that the pointer is going to be aligned properly at runtime because that is a (checked) precondition
+ // of atomic_ref's constructor.
+ static constexpr bool is_always_lock_free =
+ __atomic_always_lock_free(sizeof(_Tp), reinterpret_cast<void*>(-required_alignment));
+
+ _LIBCPP_HIDE_FROM_ABI bool is_lock_free() const noexcept { return __atomic_is_lock_free(sizeof(_Tp), __ptr_); }
+
+ _LIBCPP_HIDE_FROM_ABI void store(_Tp __desired, memory_order __order = memory_order::seq_cst) const noexcept
+ _LIBCPP_CHECK_STORE_MEMORY_ORDER(__order) {
+ _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(
+ __order == memory_order::relaxed || __order == memory_order::release || __order == memory_order::seq_cst,
+ "atomic_ref: memory order argument to atomic store operation is invalid");
+ __atomic_store(__ptr_, __clear_padding(__desired), std::__to_gcc_order(__order));
+ }
+
+ _LIBCPP_HIDE_FROM_ABI _Tp operator=(_Tp __desired) const noexcept {
+ store(__desired);
+ return __desired;
+ }
+
+ _LIBCPP_HIDE_FROM_ABI _Tp load(memory_order __order = memory_order::seq_cst) const noexcept
+ _LIBCPP_CHECK_LOAD_MEMORY_ORDER(__order) {
+ _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(
+ __order == memory_order::relaxed || __order == memory_order::consume || __order == memory_order::acquire ||
+ __order == memory_order::seq_cst,
+ "atomic_ref: memory order argument to atomic load operation is invalid");
+ alignas(_Tp) byte __mem[sizeof(_Tp)];
+ auto* __ret = reinterpret_cast<_Tp*>(__mem);
+ __atomic_load(__ptr_, __ret, std::__to_gcc_order(__order));
+ return *__ret;
+ }
+
+ _LIBCPP_HIDE_FROM_ABI operator _Tp() const noexcept { return load(); }
+
+ _LIBCPP_HIDE_FROM_ABI _Tp exchange(_Tp __desired, memory_order __order = memory_order::seq_cst) const noexcept {
+ alignas(_Tp) byte __mem[sizeof(_Tp)];
+ auto* __ret = reinterpret_cast<_Tp*>(__mem);
+ __atomic_exchange(__ptr_, __clear_padding(__desired), __ret, std::__to_gcc_order(__order));
+ return *__ret;
+ }
+
+ _LIBCPP_HIDE_FROM_ABI bool
+ compare_exchange_weak(_Tp& __expected, _Tp __desired, memory_order __success, memory_order __failure) const noexcept
+ _LIBCPP_CHECK_EXCHANGE_MEMORY_ORDER(__success, __failure) {
+ _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(
+ __failure == memory_order::relaxed || __failure == memory_order::consume ||
+ __failure == memory_order::acquire || __failure == memory_order::seq_cst,
+ "atomic_ref: failure memory order argument to weak atomic compare-and-exchange operation is invalid");
+ return __compare_exchange(
+ __ptr_,
+ std::addressof(__expected),
+ std::addressof(__desired),
+ true,
+ std::__to_gcc_order(__success),
+ std::__to_gcc_order(__failure));
+ }
+ _LIBCPP_HIDE_FROM_ABI bool
+ compare_exchange_strong(_Tp& __expected, _Tp __desired, memory_order __success, memory_order __failure) const noexcept
+ _LIBCPP_CHECK_EXCHANGE_MEMORY_ORDER(__success, __failure) {
+ _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(
+ __failure == memory_order::relaxed || __failure == memory_order::consume ||
+ __failure == memory_order::acquire || __failure == memory_order::seq_cst,
+ "atomic_ref: failure memory order argument to strong atomic compare-and-exchange operation is invalid");
+ return __compare_exchange(
+ __ptr_,
+ std::addressof(__expected),
+ std::addressof(__desired),
+ false,
+ std::__to_gcc_order(__success),
+ std::__to_gcc_order(__failure));
+ }
+
+ _LIBCPP_HIDE_FROM_ABI bool
+ compare_exchange_weak(_Tp& __expected, _Tp __desired, memory_order __order = memory_order::seq_cst) const noexcept {
+ return __compare_exchange(
+ __ptr_,
+ std::addressof(__expected),
+ std::addressof(__desired),
+ true,
+ std::__to_gcc_order(__order),
+ std::__to_gcc_failure_order(__order));
+ }
+ _LIBCPP_HIDE_FROM_ABI bool
+ compare_exchange_strong(_Tp& __expected, _Tp __desired, memory_order __order = memory_order::seq_cst) const noexcept {
+ return __compare_exchange(
+ __ptr_,
+ std::addressof(__expected),
+ std::addressof(__desired),
+ false,
+ std::__to_gcc_order(__order),
+ std::__to_gcc_failure_order(__order));
+ }
+
+ _LIBCPP_HIDE_FROM_ABI void wait(_Tp __old, memory_order __order = memory_order::seq_cst) const noexcept
+ _LIBCPP_CHECK_WAIT_MEMORY_ORDER(__order) {
+ _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(
+ __order == memory_order::relaxed || __order == memory_order::consume || __order == memory_order::acquire ||
+ __order == memory_order::seq_cst,
+ "atomic_ref: memory order argument to atomic wait operation is invalid");
+ std::__atomic_wait(*this, __old, __order);
+ }
+ _LIBCPP_HIDE_FROM_ABI void notify_one() const noexcept { std::__atomic_notify_one(*this); }
+ _LIBCPP_HIDE_FROM_ABI void notify_all() const noexcept { std::__atomic_notify_all(*this); }
+};
+
+template <class _Tp>
+struct __atomic_waitable_traits<__atomic_ref_base<_Tp>> {
+ static _LIBCPP_HIDE_FROM_ABI _Tp __atomic_load(const __atomic_ref_base<_Tp>& __a, memory_order __order) {
+ return __a.load(__order);
+ }
+ static _LIBCPP_HIDE_FROM_ABI const _Tp* __atomic_contention_address(const __atomic_ref_base<_Tp>& __a) {
+ return __a.__ptr_;
+ }
+};
+
+template <class _Tp>
+struct atomic_ref : public __atomic_ref_base<_Tp> {
+ static_assert(is_trivially_copyable_v<_Tp>, "std::atomic_ref<T> requires that 'T' be a trivially copyable type");
+
+ using __base = __atomic_ref_base<_Tp>;
+
+ _LIBCPP_HIDE_FROM_ABI explicit atomic_ref(_Tp& __obj) : __base(__obj) {
+ _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(
+ reinterpret_cast<uintptr_t>(std::addressof(__obj)) % __base::required_alignment == 0,
+ "atomic_ref ctor: referenced object must be aligned to required_alignment");
+ }
+
+ _LIBCPP_HIDE_FROM_ABI atomic_ref(const atomic_ref&) noexcept = default;
+
+ _LIBCPP_HIDE_FROM_ABI _Tp operator=(_Tp __desired) const noexcept { return __base::operator=(__desired); }
+
+ atomic_ref& operator=(const atomic_ref&) = delete;
+};
+
+template <class _Tp>
+ requires(std::integral<_Tp> && !std::same_as<bool, _Tp>)
+struct atomic_ref<_Tp> : public __atomic_ref_base<_Tp> {
+ using __base = __atomic_ref_base<_Tp>;
+
+ using difference_type = __base::value_type;
+
+ _LIBCPP_HIDE_FROM_ABI explicit atomic_ref(_Tp& __obj) : __base(__obj) {
+ _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(
+ reinterpret_cast<uintptr_t>(std::addressof(__obj)) % __base::required_alignment == 0,
+ "atomic_ref ctor: referenced object must be aligned to required_alignment");
+ }
+
+ _LIBCPP_HIDE_FROM_ABI atomic_ref(const atomic_ref&) noexcept = default;
+
+ _LIBCPP_HIDE_FROM_ABI _Tp operator=(_Tp __desired) const noexcept { return __base::operator=(__desired); }
+
+ atomic_ref& operator=(const atomic_ref&) = delete;
+
+ _LIBCPP_HIDE_FROM_ABI _Tp fetch_add(_Tp __arg, memory_order __order = memory_order_seq_cst) const noexcept {
+ return __atomic_fetch_add(this->__ptr_, __arg, std::__to_gcc_order(__order));
+ }
+ _LIBCPP_HIDE_FROM_ABI _Tp fetch_sub(_Tp __arg, memory_order __order = memory_order_seq_cst) const noexcept {
+ return __atomic_fetch_sub(this->__ptr_, __arg, std::__to_gcc_order(__order));
+ }
+ _LIBCPP_HIDE_FROM_ABI _Tp fetch_and(_Tp __arg, memory_order __order = memory_order_seq_cst) const noexcept {
+ return __atomic_fetch_and(this->__ptr_, __arg, std::__to_gcc_order(__order));
+ }
+ _LIBCPP_HIDE_FROM_ABI _Tp fetch_or(_Tp __arg, memory_order __order = memory_order_seq_cst) const noexcept {
+ return __atomic_fetch_or(this->__ptr_, __arg, std::__to_gcc_order(__order));
+ }
+ _LIBCPP_HIDE_FROM_ABI _Tp fetch_xor(_Tp __arg, memory_order __order = memory_order_seq_cst) const noexcept {
+ return __atomic_fetch_xor(this->__ptr_, __arg, std::__to_gcc_order(__order));
+ }
+
+ _LIBCPP_HIDE_FROM_ABI _Tp operator++(int) const noexcept { return fetch_add(_Tp(1)); }
+ _LIBCPP_HIDE_FROM_ABI _Tp operator--(int) const noexcept { return fetch_sub(_Tp(1)); }
+ _LIBCPP_HIDE_FROM_ABI _Tp operator++() const noexcept { return fetch_add(_Tp(1)) + _Tp(1); }
+ _LIBCPP_HIDE_FROM_ABI _Tp operator--() const noexcept { return fetch_sub(_Tp(1)) - _Tp(1); }
+ _LIBCPP_HIDE_FROM_ABI _Tp operator+=(_Tp __arg) const noexcept { return fetch_add(__arg) + __arg; }
+ _LIBCPP_HIDE_FROM_ABI _Tp operator-=(_Tp __arg) const noexcept { return fetch_sub(__arg) - __arg; }
+ _LIBCPP_HIDE_FROM_ABI _Tp operator&=(_Tp __arg) const noexcept { return fetch_and(__arg) & __arg; }
+ _LIBCPP_HIDE_FROM_ABI _Tp operator|=(_Tp __arg) const noexcept { return fetch_or(__arg) | __arg; }
+ _LIBCPP_HIDE_FROM_ABI _Tp operator^=(_Tp __arg) const noexcept { return fetch_xor(__arg) ^ __arg; }
+};
+
+template <class _Tp>
+ requires std::floating_point<_Tp>
+struct atomic_ref<_Tp> : public __atomic_ref_base<_Tp> {
+ using __base = __atomic_ref_base<_Tp>;
+
+ using difference_type = __base::value_type;
+
+ _LIBCPP_HIDE_FROM_ABI explicit atomic_ref(_Tp& __obj) : __base(__obj) {
+ _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(
+ reinterpret_cast<uintptr_t>(std::addressof(__obj)) % __base::required_alignment == 0,
+ "atomic_ref ctor: referenced object must be aligned to required_alignment");
+ }
+
+ _LIBCPP_HIDE_FROM_ABI atomic_ref(const atomic_ref&) noexcept = default;
+
+ _LIBCPP_HIDE_FROM_ABI _Tp operator=(_Tp __desired) const noexcept { return __base::operator=(__desired); }
+
+ atomic_ref& operator=(const atomic_ref&) = delete;
+
+ _LIBCPP_HIDE_FROM_ABI _Tp fetch_add(_Tp __arg, memory_order __order = memory_order_seq_cst) const noexcept {
+ _Tp __old = this->load(memory_order_relaxed);
+ _Tp __new = __old + __arg;
+ while (!this->compare_exchange_weak(__old, __new, __order, memory_order_relaxed)) {
+ __new = __old + __arg;
+ }
+ return __old;
+ }
+ _LIBCPP_HIDE_FROM_ABI _Tp fetch_sub(_Tp __arg, memory_order __order = memory_order_seq_cst) const noexcept {
+ _Tp __old = this->load(memory_order_relaxed);
+ _Tp __new = __old - __arg;
+ while (!this->compare_exchange_weak(__old, __new, __order, memory_order_relaxed)) {
+ __new = __old - __arg;
+ }
+ return __old;
+ }
+
+ _LIBCPP_HIDE_FROM_ABI _Tp operator+=(_Tp __arg) const noexcept { return fetch_add(__arg) + __arg; }
+ _LIBCPP_HIDE_FROM_ABI _Tp operator-=(_Tp __arg) const noexcept { return fetch_sub(__arg) - __arg; }
+};
+
+template <class _Tp>
+struct atomic_ref<_Tp*> : public __atomic_ref_base<_Tp*> {
+ using __base = __atomic_ref_base<_Tp*>;
+
+ using difference_type = ptrdiff_t;
+
+ _LIBCPP_HIDE_FROM_ABI explicit atomic_ref(_Tp*& __ptr) : __base(__ptr) {}
+
+ _LIBCPP_HIDE_FROM_ABI _Tp* operator=(_Tp* __desired) const noexcept { return __base::operator=(__desired); }
+
+ atomic_ref& operator=(const atomic_ref&) = delete;
+
+ _LIBCPP_HIDE_FROM_ABI _Tp* fetch_add(ptrdiff_t __arg, memory_order __order = memory_order_seq_cst) const noexcept {
+ return __atomic_fetch_add(this->__ptr_, __arg * sizeof(_Tp), std::__to_gcc_order(__order));
+ }
+ _LIBCPP_HIDE_FROM_ABI _Tp* fetch_sub(ptrdiff_t __arg, memory_order __order = memory_order_seq_cst) const noexcept {
+ return __atomic_fetch_sub(this->__ptr_, __arg * sizeof(_Tp), std::__to_gcc_order(__order));
+ }
+
+ _LIBCPP_HIDE_FROM_ABI _Tp* operator++(int) const noexcept { return fetch_add(1); }
+ _LIBCPP_HIDE_FROM_ABI _Tp* operator--(int) const noexcept { return fetch_sub(1); }
+ _LIBCPP_HIDE_FROM_ABI _Tp* operator++() const noexcept { return fetch_add(1) + 1; }
+ _LIBCPP_HIDE_FROM_ABI _Tp* operator--() const noexcept { return fetch_sub(1) - 1; }
+ _LIBCPP_HIDE_FROM_ABI _Tp* operator+=(ptrdiff_t __arg) const noexcept { return fetch_add(__arg) + __arg; }
+ _LIBCPP_HIDE_FROM_ABI _Tp* operator-=(ptrdiff_t __arg) const noexcept { return fetch_sub(__arg) - __arg; }
+};
+
+_LIBCPP_CTAD_SUPPORTED_FOR_TYPE(atomic_ref);
+
+#endif // _LIBCPP_STD_VER >= 20
+
+_LIBCPP_END_NAMESPACE_STD
+
+_LIBCPP_POP_MACROS
+
+#endif // _LIBCPP___ATOMIC_ATOMIC_REF_H
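
A minimal usage sketch of the C++20 std::atomic_ref interface this new header provides, assuming a toolchain whose libc++ ships it; the members used (required_alignment, fetch_add) are the ones defined above.

#include <atomic>
#include <cassert>
#include <thread>
#include <vector>

int main() {
  // A plain int; atomic_ref provides atomic access to it without changing its type.
  alignas(std::atomic_ref<int>::required_alignment) int counter = 0;

  std::vector<std::thread> threads;
  for (int i = 0; i < 4; ++i)
    threads.emplace_back([&counter] {
      std::atomic_ref<int> ref(counter); // each thread constructs its own reference
      for (int j = 0; j < 1000; ++j)
        ref.fetch_add(1, std::memory_order_relaxed);
    });
  for (auto& t : threads)
    t.join();

  // All increments went through atomic read-modify-write operations on the same object.
  assert(counter == 4000);
}
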
diff --git a/libcxx/include/__atomic/atomic_sync.h b/libcxx/include/__atomic/atomic_sync.h
index e583dca..175700be 100644
--- a/libcxx/include/__atomic/atomic_sync.h
+++ b/libcxx/include/__atomic/atomic_sync.h
@@ -12,6 +12,7 @@
#include <__atomic/contention_t.h>
#include <__atomic/cxx_atomic_impl.h>
#include <__atomic/memory_order.h>
+#include <__atomic/to_gcc_order.h>
#include <__availability>
#include <__chrono/duration.h>
#include <__config>
diff --git a/libcxx/include/__atomic/check_memory_order.h b/libcxx/include/__atomic/check_memory_order.h
index 3012aec0..536f764 100644
--- a/libcxx/include/__atomic/check_memory_order.h
+++ b/libcxx/include/__atomic/check_memory_order.h
@@ -27,4 +27,8 @@
_LIBCPP_DIAGNOSE_WARNING(__f == memory_order_release || __f == memory_order_acq_rel, \
"memory order argument to atomic operation is invalid")
+#define _LIBCPP_CHECK_WAIT_MEMORY_ORDER(__m) \
+ _LIBCPP_DIAGNOSE_WARNING(__m == memory_order_release || __m == memory_order_acq_rel, \
+ "memory order argument to atomic operation is invalid")
+
#endif // _LIBCPP___ATOMIC_CHECK_MEMORY_ORDER_H
diff --git a/libcxx/include/__atomic/cxx_atomic_impl.h b/libcxx/include/__atomic/cxx_atomic_impl.h
index b900cc1..18e88aa 100644
--- a/libcxx/include/__atomic/cxx_atomic_impl.h
+++ b/libcxx/include/__atomic/cxx_atomic_impl.h
@@ -10,6 +10,7 @@
#define _LIBCPP___ATOMIC_CXX_ATOMIC_IMPL_H
#include <__atomic/memory_order.h>
+#include <__atomic/to_gcc_order.h>
#include <__config>
#include <__memory/addressof.h>
#include <__type_traits/is_assignable.h>
@@ -54,32 +55,6 @@ struct __cxx_atomic_base_impl {
_Tp __a_value;
};
-_LIBCPP_HIDE_FROM_ABI inline _LIBCPP_CONSTEXPR int __to_gcc_order(memory_order __order) {
- // Avoid switch statement to make this a constexpr.
- return __order == memory_order_relaxed
- ? __ATOMIC_RELAXED
- : (__order == memory_order_acquire
- ? __ATOMIC_ACQUIRE
- : (__order == memory_order_release
- ? __ATOMIC_RELEASE
- : (__order == memory_order_seq_cst
- ? __ATOMIC_SEQ_CST
- : (__order == memory_order_acq_rel ? __ATOMIC_ACQ_REL : __ATOMIC_CONSUME))));
-}
-
-_LIBCPP_HIDE_FROM_ABI inline _LIBCPP_CONSTEXPR int __to_gcc_failure_order(memory_order __order) {
- // Avoid switch statement to make this a constexpr.
- return __order == memory_order_relaxed
- ? __ATOMIC_RELAXED
- : (__order == memory_order_acquire
- ? __ATOMIC_ACQUIRE
- : (__order == memory_order_release
- ? __ATOMIC_RELAXED
- : (__order == memory_order_seq_cst
- ? __ATOMIC_SEQ_CST
- : (__order == memory_order_acq_rel ? __ATOMIC_ACQUIRE : __ATOMIC_CONSUME))));
-}
-
template <typename _Tp>
_LIBCPP_HIDE_FROM_ABI void __cxx_atomic_init(volatile __cxx_atomic_base_impl<_Tp>* __a, _Tp __val) {
__cxx_atomic_assign_volatile(__a->__a_value, __val);
diff --git a/libcxx/include/__atomic/to_gcc_order.h b/libcxx/include/__atomic/to_gcc_order.h
new file mode 100644
index 0000000..d04c111
--- /dev/null
+++ b/libcxx/include/__atomic/to_gcc_order.h
@@ -0,0 +1,54 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ATOMIC_TO_GCC_ORDER_H
+#define _LIBCPP___ATOMIC_TO_GCC_ORDER_H
+
+#include <__atomic/memory_order.h>
+#include <__config>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+# pragma GCC system_header
+#endif
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+#if defined(__ATOMIC_RELAXED) && defined(__ATOMIC_CONSUME) && defined(__ATOMIC_ACQUIRE) && \
+ defined(__ATOMIC_RELEASE) && defined(__ATOMIC_ACQ_REL) && defined(__ATOMIC_SEQ_CST)
+
+_LIBCPP_HIDE_FROM_ABI inline _LIBCPP_CONSTEXPR int __to_gcc_order(memory_order __order) {
+ // Avoid switch statement to make this a constexpr.
+ return __order == memory_order_relaxed
+ ? __ATOMIC_RELAXED
+ : (__order == memory_order_acquire
+ ? __ATOMIC_ACQUIRE
+ : (__order == memory_order_release
+ ? __ATOMIC_RELEASE
+ : (__order == memory_order_seq_cst
+ ? __ATOMIC_SEQ_CST
+ : (__order == memory_order_acq_rel ? __ATOMIC_ACQ_REL : __ATOMIC_CONSUME))));
+}
+
+_LIBCPP_HIDE_FROM_ABI inline _LIBCPP_CONSTEXPR int __to_gcc_failure_order(memory_order __order) {
+ // Avoid switch statement to make this a constexpr.
+ return __order == memory_order_relaxed
+ ? __ATOMIC_RELAXED
+ : (__order == memory_order_acquire
+ ? __ATOMIC_ACQUIRE
+ : (__order == memory_order_release
+ ? __ATOMIC_RELAXED
+ : (__order == memory_order_seq_cst
+ ? __ATOMIC_SEQ_CST
+ : (__order == memory_order_acq_rel ? __ATOMIC_ACQUIRE : __ATOMIC_CONSUME))));
+}
+
+#endif
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP___ATOMIC_TO_GCC_ORDER_H
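
For readability, the chained conditionals above are equivalent to the switch below; the header keeps the conditional form because a switch is not allowed in a C++11 constexpr function. This is only an illustrative sketch using the compiler-provided __ATOMIC_* macros, not part of the header; the function name is hypothetical.

#include <atomic>

// Same mapping as __to_gcc_order, written as a switch (needs C++14 constexpr rules).
constexpr int to_gcc_order_sketch(std::memory_order order) {
  switch (order) {
  case std::memory_order_relaxed: return __ATOMIC_RELAXED;
  case std::memory_order_consume: return __ATOMIC_CONSUME;
  case std::memory_order_acquire: return __ATOMIC_ACQUIRE;
  case std::memory_order_release: return __ATOMIC_RELEASE;
  case std::memory_order_acq_rel: return __ATOMIC_ACQ_REL;
  case std::memory_order_seq_cst: return __ATOMIC_SEQ_CST;
  }
  return __ATOMIC_SEQ_CST; // unreachable for a valid memory_order
}

static_assert(to_gcc_order_sketch(std::memory_order_release) == __ATOMIC_RELEASE);
static_assert(to_gcc_order_sketch(std::memory_order_relaxed) == __ATOMIC_RELAXED);
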
diff --git a/libcxx/include/__exception/exception_ptr.h b/libcxx/include/__exception/exception_ptr.h
index c9027de..868fd7c 100644
--- a/libcxx/include/__exception/exception_ptr.h
+++ b/libcxx/include/__exception/exception_ptr.h
@@ -38,11 +38,14 @@ struct __cxa_exception;
_LIBCPP_OVERRIDABLE_FUNC_VIS __cxa_exception* __cxa_init_primary_exception(
void*,
std::type_info*,
- void(
# if defined(_WIN32)
- __thiscall
+ void(__thiscall*)(void*)) throw();
+# elif defined(__wasm__)
+ // In Wasm, a destructor returns its argument
+ void* (*)(void*)) throw();
+# else
+ void (*)(void*)) throw();
# endif
- *)(void*)) throw();
}
} // namespace __cxxabiv1
@@ -92,8 +95,16 @@ _LIBCPP_HIDE_FROM_ABI exception_ptr make_exception_ptr(_Ep __e) _NOEXCEPT {
using _Ep2 = __decay_t<_Ep>;
void* __ex = __cxxabiv1::__cxa_allocate_exception(sizeof(_Ep));
+# ifdef __wasm__
+ // In Wasm, a destructor returns its argument
+ (void)__cxxabiv1::__cxa_init_primary_exception(__ex, const_cast<std::type_info*>(&typeid(_Ep)), [](void* __p) -> void* {
+# else
(void)__cxxabiv1::__cxa_init_primary_exception(__ex, const_cast<std::type_info*>(&typeid(_Ep)), [](void* __p) {
+# endif
std::__destroy_at(static_cast<_Ep2*>(__p));
+# ifdef __wasm__
+ return __p;
+# endif
});
try {
diff --git a/libcxx/include/__locale b/libcxx/include/__locale
index 36ac099..1e97c75 100644
--- a/libcxx/include/__locale
+++ b/libcxx/include/__locale
@@ -343,12 +343,12 @@ public:
static const mask __regex_word = 0x4000; // 0x8000 and 0x0100 and 0x00ff are used
# define _LIBCPP_CTYPE_MASK_IS_COMPOSITE_PRINT
# define _LIBCPP_CTYPE_MASK_IS_COMPOSITE_ALPHA
-#elif defined(__APPLE__) || defined(__FreeBSD__) || defined(__EMSCRIPTEN__) || defined(__NetBSD__)
+#elif defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__)
# ifdef __APPLE__
typedef __uint32_t mask;
# elif defined(__FreeBSD__)
typedef unsigned long mask;
-# elif defined(__EMSCRIPTEN__) || defined(__NetBSD__)
+# elif defined(__NetBSD__)
typedef unsigned short mask;
# endif
static const mask space = _CTYPE_S;
diff --git a/libcxx/include/atomic b/libcxx/include/atomic
index cb142b0..80a0f9e 100644
--- a/libcxx/include/atomic
+++ b/libcxx/include/atomic
@@ -599,6 +599,7 @@ template <class T>
#include <__atomic/atomic_flag.h>
#include <__atomic/atomic_init.h>
#include <__atomic/atomic_lock_free.h>
+#include <__atomic/atomic_ref.h>
#include <__atomic/atomic_sync.h>
#include <__atomic/check_memory_order.h>
#include <__atomic/contention_t.h>
diff --git a/libcxx/include/experimental/__simd/scalar.h b/libcxx/include/experimental/__simd/scalar.h
index aff2cd1..1add465 100644
--- a/libcxx/include/experimental/__simd/scalar.h
+++ b/libcxx/include/experimental/__simd/scalar.h
@@ -62,6 +62,11 @@ struct __simd_operations<_Tp, simd_abi::__scalar> {
static _LIBCPP_HIDE_FROM_ABI void __load(_SimdStorage& __s, const _Up* __mem) noexcept {
__s.__data = static_cast<_Tp>(__mem[0]);
}
+
+ template <class _Up>
+ static _LIBCPP_HIDE_FROM_ABI void __store(_SimdStorage __s, _Up* __mem) noexcept {
+ *__mem = static_cast<_Up>(__s.__data);
+ }
};
template <class _Tp>
@@ -71,6 +76,8 @@ struct __mask_operations<_Tp, simd_abi::__scalar> {
static _LIBCPP_HIDE_FROM_ABI _MaskStorage __broadcast(bool __v) noexcept { return {__v}; }
static _LIBCPP_HIDE_FROM_ABI void __load(_MaskStorage& __s, const bool* __mem) noexcept { __s.__data = __mem[0]; }
+
+ static _LIBCPP_HIDE_FROM_ABI void __store(_MaskStorage __s, bool* __mem) noexcept { __mem[0] = __s.__data; }
};
} // namespace parallelism_v2
diff --git a/libcxx/include/experimental/__simd/simd.h b/libcxx/include/experimental/__simd/simd.h
index db4ebb8..37e334a 100644
--- a/libcxx/include/experimental/__simd/simd.h
+++ b/libcxx/include/experimental/__simd/simd.h
@@ -70,6 +70,17 @@ public:
_Impl::__load(__s_, _Flags::template __apply<simd>(__mem));
}
+ // copy functions
+ template <class _Up, class _Flags, enable_if_t<__is_vectorizable_v<_Up> && is_simd_flag_type_v<_Flags>, int> = 0>
+ _LIBCPP_HIDE_FROM_ABI void copy_from(const _Up* __mem, _Flags) {
+ _Impl::__load(__s_, _Flags::template __apply<simd>(__mem));
+ }
+
+ template <class _Up, class _Flags, enable_if_t<__is_vectorizable_v<_Up> && is_simd_flag_type_v<_Flags>, int> = 0>
+ _LIBCPP_HIDE_FROM_ABI void copy_to(_Up* __mem, _Flags) const {
+ _Impl::__store(__s_, _Flags::template __apply<simd>(__mem));
+ }
+
// scalar access [simd.subscr]
_LIBCPP_HIDE_FROM_ABI reference operator[](size_t __i) noexcept { return reference(__s_, __i); }
_LIBCPP_HIDE_FROM_ABI value_type operator[](size_t __i) const noexcept { return __s_.__get(__i); }
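
A small sketch of the copy_from/copy_to members added above, written against the Parallelism TS v2 interface. It assumes <experimental/simd> with native_simd and the element_aligned flag available; libc++'s implementation of the TS is still partial, so treat this as illustrative only.

#include <experimental/simd>
#include <array>
#include <cstddef>
#include <cstdio>

namespace stdx = std::experimental;

int main() {
  using simd_t = stdx::native_simd<int>;
  std::array<int, simd_t::size()> in{}, out{};
  for (std::size_t i = 0; i < in.size(); ++i)
    in[i] = static_cast<int>(i);

  simd_t v;
  v.copy_from(in.data(), stdx::element_aligned); // bulk load from memory
  v.copy_to(out.data(), stdx::element_aligned);  // bulk store back to memory

  for (int x : out)
    std::printf("%d ", x);
  std::printf("\n");
}
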
diff --git a/libcxx/include/experimental/__simd/simd_mask.h b/libcxx/include/experimental/__simd/simd_mask.h
index 754db79..fd6dee2 100644
--- a/libcxx/include/experimental/__simd/simd_mask.h
+++ b/libcxx/include/experimental/__simd/simd_mask.h
@@ -58,6 +58,17 @@ public:
_Impl::__load(__s_, _Flags::template __apply<simd_mask>(__mem));
}
+ // copy functions
+ template <class _Flags, enable_if_t<is_simd_flag_type_v<_Flags>, int> = 0>
+ _LIBCPP_HIDE_FROM_ABI void copy_from(const value_type* __mem, _Flags) {
+ _Impl::__load(__s_, _Flags::template __apply<simd_mask>(__mem));
+ }
+
+ template <class _Flags, enable_if_t<is_simd_flag_type_v<_Flags>, int> = 0>
+ _LIBCPP_HIDE_FROM_ABI void copy_to(value_type* __mem, _Flags) const {
+ _Impl::__store(__s_, _Flags::template __apply<simd_mask>(__mem));
+ }
+
// scalar access [simd.mask.subscr]
_LIBCPP_HIDE_FROM_ABI reference operator[](size_t __i) noexcept { return reference(__s_, __i); }
_LIBCPP_HIDE_FROM_ABI value_type operator[](size_t __i) const noexcept { return __s_.__get(__i); }
diff --git a/libcxx/include/experimental/__simd/vec_ext.h b/libcxx/include/experimental/__simd/vec_ext.h
index c9423df..316866b 100644
--- a/libcxx/include/experimental/__simd/vec_ext.h
+++ b/libcxx/include/experimental/__simd/vec_ext.h
@@ -80,6 +80,12 @@ struct __simd_operations<_Tp, simd_abi::__vec_ext<_Np>> {
for (size_t __i = 0; __i < _Np; __i++)
__s.__data[__i] = static_cast<_Tp>(__mem[__i]);
}
+
+ template <class _Up>
+ static _LIBCPP_HIDE_FROM_ABI void __store(_SimdStorage __s, _Up* __mem) noexcept {
+ for (size_t __i = 0; __i < _Np; __i++)
+ __mem[__i] = static_cast<_Up>(__s.__data[__i]);
+ }
};
template <class _Tp, int _Np>
@@ -99,6 +105,11 @@ struct __mask_operations<_Tp, simd_abi::__vec_ext<_Np>> {
for (size_t __i = 0; __i < _Np; __i++)
__s.__data[__i] = experimental::__set_all_bits<_Tp>(__mem[__i]);
}
+
+ static _LIBCPP_HIDE_FROM_ABI void __store(_MaskStorage __s, bool* __mem) noexcept {
+ for (size_t __i = 0; __i < _Np; __i++)
+ __mem[__i] = static_cast<bool>(__s.__data[__i]);
+ }
};
} // namespace parallelism_v2
diff --git a/libcxx/include/forward_list b/libcxx/include/forward_list
index 5a7521e..80dd49f 100644
--- a/libcxx/include/forward_list
+++ b/libcxx/include/forward_list
@@ -554,7 +554,6 @@ protected:
return __guard.__release_ptr();
}
- template <class... _Args>
_LIBCPP_HIDE_FROM_ABI void __delete_node(__node_pointer __node) {
// For the same reason as above, we use the allocator's destroy() method for the value_type,
// but not for the node itself.
diff --git a/libcxx/include/libcxx.imp b/libcxx/include/libcxx.imp
new file mode 100644
index 0000000..f6aa1ea
--- /dev/null
+++ b/libcxx/include/libcxx.imp
@@ -0,0 +1,869 @@
+[
+ { include: [ "<__algorithm/adjacent_find.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/all_of.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/any_of.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/binary_search.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/clamp.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/comp.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/comp_ref_type.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/copy_backward.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/copy_if.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/copy_move_common.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/copy_n.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/count.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/count_if.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/equal.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/equal_range.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/fill.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/fill_n.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/find.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/find_end.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/find_first_of.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/find_if.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/find_if_not.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/find_segment_if.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/fold.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/for_each.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/for_each_n.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/for_each_segment.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/generate.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/generate_n.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/half_positive.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/in_found_result.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/in_fun_result.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/in_in_out_result.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/in_in_result.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/in_out_out_result.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/in_out_result.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/includes.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/inplace_merge.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/is_heap.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/is_heap_until.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/is_partitioned.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/is_permutation.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/is_sorted.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/is_sorted_until.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/iter_swap.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/iterator_operations.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/lexicographical_compare.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/lexicographical_compare_three_way.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/lower_bound.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/make_heap.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/make_projected.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/max.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/max_element.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/merge.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/min.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/min_element.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/min_max_result.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/minmax.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/minmax_element.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/mismatch.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/move.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/move_backward.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/next_permutation.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/none_of.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/nth_element.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/partial_sort.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/partial_sort_copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/partition.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/partition_copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/partition_point.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pop_heap.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/prev_permutation.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_any_all_none_of.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_backend.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_backends/cpu_backend.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_backends/cpu_backends/any_of.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_backends/cpu_backends/backend.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_backends/cpu_backends/fill.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_backends/cpu_backends/find_if.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_backends/cpu_backends/for_each.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_backends/cpu_backends/libdispatch.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_backends/cpu_backends/merge.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_backends/cpu_backends/serial.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_backends/cpu_backends/stable_sort.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_backends/cpu_backends/thread.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_backends/cpu_backends/transform.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_backends/cpu_backends/transform_reduce.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_count.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_equal.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_fill.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_find.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_for_each.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_frontend_dispatch.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_generate.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_is_partitioned.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_merge.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_move.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_replace.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_rotate_copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_sort.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_stable_sort.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_transform.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/push_heap.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_adjacent_find.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_all_of.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_any_of.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_binary_search.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_clamp.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_contains.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_contains_subrange.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_copy_backward.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_copy_if.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_copy_n.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_count.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_count_if.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_ends_with.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_equal.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_equal_range.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_fill.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_fill_n.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_find.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_find_end.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_find_first_of.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_find_if.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_find_if_not.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_for_each.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_for_each_n.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_generate.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_generate_n.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_includes.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_inplace_merge.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_is_heap.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_is_heap_until.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_is_partitioned.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_is_permutation.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_is_sorted.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_is_sorted_until.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_iterator_concept.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_lexicographical_compare.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_lower_bound.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_make_heap.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_max.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_max_element.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_merge.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_min.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_min_element.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_minmax.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_minmax_element.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_mismatch.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_move.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_move_backward.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_next_permutation.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_none_of.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_nth_element.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_partial_sort.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_partial_sort_copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_partition.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_partition_copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_partition_point.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_pop_heap.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_prev_permutation.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_push_heap.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_remove.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_remove_copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_remove_copy_if.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_remove_if.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_replace.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_replace_copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_replace_copy_if.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_replace_if.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_reverse.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_reverse_copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_rotate.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_rotate_copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_sample.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_search.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_search_n.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_set_difference.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_set_intersection.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_set_symmetric_difference.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_set_union.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_shuffle.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_sort.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_sort_heap.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_stable_partition.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_stable_sort.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_starts_with.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_swap_ranges.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_transform.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_unique.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_unique_copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_upper_bound.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/remove.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/remove_copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/remove_copy_if.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/remove_if.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/replace.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/replace_copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/replace_copy_if.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/replace_if.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/reverse.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/reverse_copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/rotate.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/rotate_copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/sample.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/search.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/search_n.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/set_difference.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/set_intersection.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/set_symmetric_difference.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/set_union.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/shift_left.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/shift_right.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/shuffle.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/sift_down.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/simd_utils.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/sort.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/sort_heap.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/stable_partition.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/stable_sort.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/swap_ranges.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/three_way_comp_ref_type.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/transform.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/uniform_random_bit_generator_adaptor.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/unique.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/unique_copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/unwrap_iter.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/unwrap_range.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/upper_bound.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__atomic/aliases.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__atomic/atomic.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__atomic/atomic_base.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__atomic/atomic_flag.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__atomic/atomic_init.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__atomic/atomic_lock_free.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__atomic/atomic_ref.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__atomic/atomic_sync.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__atomic/check_memory_order.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__atomic/contention_t.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__atomic/cxx_atomic_impl.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__atomic/fence.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__atomic/is_always_lock_free.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__atomic/kill_dependency.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__atomic/memory_order.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__atomic/to_gcc_order.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__bit/bit_cast.h>", "private", "<bit>", "public" ] },
+ { include: [ "<__bit/bit_ceil.h>", "private", "<bit>", "public" ] },
+ { include: [ "<__bit/bit_floor.h>", "private", "<bit>", "public" ] },
+ { include: [ "<__bit/bit_log2.h>", "private", "<bit>", "public" ] },
+ { include: [ "<__bit/bit_width.h>", "private", "<bit>", "public" ] },
+ { include: [ "<__bit/blsr.h>", "private", "<bit>", "public" ] },
+ { include: [ "<__bit/byteswap.h>", "private", "<bit>", "public" ] },
+ { include: [ "<__bit/countl.h>", "private", "<bit>", "public" ] },
+ { include: [ "<__bit/countr.h>", "private", "<bit>", "public" ] },
+ { include: [ "<__bit/endian.h>", "private", "<bit>", "public" ] },
+ { include: [ "<__bit/has_single_bit.h>", "private", "<bit>", "public" ] },
+ { include: [ "<__bit/invert_if.h>", "private", "<bit>", "public" ] },
+ { include: [ "<__bit/popcount.h>", "private", "<bit>", "public" ] },
+ { include: [ "<__bit/rotate.h>", "private", "<bit>", "public" ] },
+ { include: [ "<__charconv/chars_format.h>", "private", "<charconv>", "public" ] },
+ { include: [ "<__charconv/from_chars_integral.h>", "private", "<charconv>", "public" ] },
+ { include: [ "<__charconv/from_chars_result.h>", "private", "<charconv>", "public" ] },
+ { include: [ "<__charconv/tables.h>", "private", "<charconv>", "public" ] },
+ { include: [ "<__charconv/to_chars.h>", "private", "<charconv>", "public" ] },
+ { include: [ "<__charconv/to_chars_base_10.h>", "private", "<charconv>", "public" ] },
+ { include: [ "<__charconv/to_chars_floating_point.h>", "private", "<charconv>", "public" ] },
+ { include: [ "<__charconv/to_chars_integral.h>", "private", "<charconv>", "public" ] },
+ { include: [ "<__charconv/to_chars_result.h>", "private", "<charconv>", "public" ] },
+ { include: [ "<__charconv/traits.h>", "private", "<charconv>", "public" ] },
+ { include: [ "<__chrono/calendar.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/concepts.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/convert_to_timespec.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/convert_to_tm.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/day.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/duration.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/file_clock.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/formatter.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/hh_mm_ss.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/high_resolution_clock.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/leap_second.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/literals.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/month.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/month_weekday.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/monthday.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/ostream.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/parser_std_format_spec.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/statically_widen.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/steady_clock.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/sys_info.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/system_clock.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/time_point.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/time_zone.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/time_zone_link.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/tzdb.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/tzdb_list.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/weekday.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/year.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/year_month.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/year_month_day.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/year_month_weekday.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__compare/common_comparison_category.h>", "private", "<compare>", "public" ] },
+ { include: [ "<__compare/compare_partial_order_fallback.h>", "private", "<compare>", "public" ] },
+ { include: [ "<__compare/compare_strong_order_fallback.h>", "private", "<compare>", "public" ] },
+ { include: [ "<__compare/compare_three_way.h>", "private", "<compare>", "public" ] },
+ { include: [ "<__compare/compare_three_way_result.h>", "private", "<compare>", "public" ] },
+ { include: [ "<__compare/compare_weak_order_fallback.h>", "private", "<compare>", "public" ] },
+ { include: [ "<__compare/is_eq.h>", "private", "<compare>", "public" ] },
+ { include: [ "<__compare/ordering.h>", "private", "<compare>", "public" ] },
+ { include: [ "<__compare/partial_order.h>", "private", "<compare>", "public" ] },
+ { include: [ "<__compare/strong_order.h>", "private", "<compare>", "public" ] },
+ { include: [ "<__compare/synth_three_way.h>", "private", "<compare>", "public" ] },
+ { include: [ "<__compare/three_way_comparable.h>", "private", "<compare>", "public" ] },
+ { include: [ "<__compare/weak_order.h>", "private", "<compare>", "public" ] },
+ { include: [ "<__concepts/arithmetic.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/assignable.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/boolean_testable.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/class_or_enum.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/common_reference_with.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/common_with.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/constructible.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/convertible_to.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/copyable.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/derived_from.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/destructible.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/different_from.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/equality_comparable.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/invocable.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/movable.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/predicate.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/regular.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/relation.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/same_as.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/semiregular.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/swappable.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/totally_ordered.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__condition_variable/condition_variable.h>", "private", "<condition_variable>", "public" ] },
+ { include: [ "<__coroutine/coroutine_handle.h>", "private", "<coroutine>", "public" ] },
+ { include: [ "<__coroutine/coroutine_traits.h>", "private", "<coroutine>", "public" ] },
+ { include: [ "<__coroutine/noop_coroutine_handle.h>", "private", "<coroutine>", "public" ] },
+ { include: [ "<__coroutine/trivial_awaitables.h>", "private", "<coroutine>", "public" ] },
+ { include: [ "<__exception/exception.h>", "private", "<exception>", "public" ] },
+ { include: [ "<__exception/exception_ptr.h>", "private", "<exception>", "public" ] },
+ { include: [ "<__exception/nested_exception.h>", "private", "<exception>", "public" ] },
+ { include: [ "<__exception/operations.h>", "private", "<exception>", "public" ] },
+ { include: [ "<__exception/terminate.h>", "private", "<exception>", "public" ] },
+ { include: [ "<__expected/bad_expected_access.h>", "private", "<expected>", "public" ] },
+ { include: [ "<__expected/expected.h>", "private", "<expected>", "public" ] },
+ { include: [ "<__expected/unexpect.h>", "private", "<expected>", "public" ] },
+ { include: [ "<__expected/unexpected.h>", "private", "<expected>", "public" ] },
+ { include: [ "<__filesystem/copy_options.h>", "private", "<filesystem>", "public" ] },
+ { include: [ "<__filesystem/directory_entry.h>", "private", "<filesystem>", "public" ] },
+ { include: [ "<__filesystem/directory_iterator.h>", "private", "<filesystem>", "public" ] },
+ { include: [ "<__filesystem/directory_options.h>", "private", "<filesystem>", "public" ] },
+ { include: [ "<__filesystem/file_status.h>", "private", "<filesystem>", "public" ] },
+ { include: [ "<__filesystem/file_time_type.h>", "private", "<filesystem>", "public" ] },
+ { include: [ "<__filesystem/file_type.h>", "private", "<filesystem>", "public" ] },
+ { include: [ "<__filesystem/filesystem_error.h>", "private", "<filesystem>", "public" ] },
+ { include: [ "<__filesystem/operations.h>", "private", "<filesystem>", "public" ] },
+ { include: [ "<__filesystem/path.h>", "private", "<filesystem>", "public" ] },
+ { include: [ "<__filesystem/path_iterator.h>", "private", "<filesystem>", "public" ] },
+ { include: [ "<__filesystem/perm_options.h>", "private", "<filesystem>", "public" ] },
+ { include: [ "<__filesystem/perms.h>", "private", "<filesystem>", "public" ] },
+ { include: [ "<__filesystem/recursive_directory_iterator.h>", "private", "<filesystem>", "public" ] },
+ { include: [ "<__filesystem/space_info.h>", "private", "<filesystem>", "public" ] },
+ { include: [ "<__filesystem/u8path.h>", "private", "<filesystem>", "public" ] },
+ { include: [ "<__format/buffer.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/concepts.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/container_adaptor.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/enable_insertable.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/escaped_output_table.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/extended_grapheme_cluster_table.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/format_arg.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/format_arg_store.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/format_args.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/format_context.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/format_error.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/format_functions.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/format_parse_context.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/format_string.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/format_to_n_result.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/formatter.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/formatter_bool.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/formatter_char.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/formatter_floating_point.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/formatter_integer.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/formatter_integral.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/formatter_output.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/formatter_pointer.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/formatter_string.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/formatter_tuple.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/indic_conjunct_break_table.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/parser_std_format_spec.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/range_default_formatter.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/range_formatter.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/unicode.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/width_estimation_table.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/write_escaped.h>", "private", "<format>", "public" ] },
+ { include: [ "<__functional/binary_function.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/binary_negate.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/bind.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/bind_back.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/bind_front.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/binder1st.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/binder2nd.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/boyer_moore_searcher.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/compose.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/default_searcher.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/function.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/hash.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/identity.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/invoke.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/is_transparent.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/mem_fn.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/mem_fun_ref.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/not_fn.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/operations.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/perfect_forward.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/pointer_to_binary_function.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/pointer_to_unary_function.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/ranges_operations.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/reference_wrapper.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/unary_function.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/unary_negate.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/weak_result_type.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__fwd/array.h>", "private", "<array>", "public" ] },
+ { include: [ "<__fwd/bit_reference.h>", "private", "<bitset>", "public" ] },
+ { include: [ "<__fwd/bit_reference.h>", "private", "<vector>", "public" ] },
+ { include: [ "<__fwd/complex.h>", "private", "<complex>", "public" ] },
+ { include: [ "<__fwd/deque.h>", "private", "<deque>", "public" ] },
+ { include: [ "<__fwd/format.h>", "private", "<format>", "public" ] },
+ { include: [ "<__fwd/fstream.h>", "private", "<iosfwd>", "public" ] },
+ { include: [ "<__fwd/functional.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__fwd/ios.h>", "private", "<iosfwd>", "public" ] },
+ { include: [ "<__fwd/istream.h>", "private", "<iosfwd>", "public" ] },
+ { include: [ "<__fwd/mdspan.h>", "private", "<mdspan>", "public" ] },
+ { include: [ "<__fwd/memory.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__fwd/memory_resource.h>", "private", "<memory_resource>", "public" ] },
+ { include: [ "<__fwd/ostream.h>", "private", "<iosfwd>", "public" ] },
+ { include: [ "<__fwd/pair.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__fwd/queue.h>", "private", "<queue>", "public" ] },
+ { include: [ "<__fwd/span.h>", "private", "<span>", "public" ] },
+ { include: [ "<__fwd/sstream.h>", "private", "<iosfwd>", "public" ] },
+ { include: [ "<__fwd/stack.h>", "private", "<stack>", "public" ] },
+ { include: [ "<__fwd/streambuf.h>", "private", "<iosfwd>", "public" ] },
+ { include: [ "<__fwd/string.h>", "private", "<string>", "public" ] },
+ { include: [ "<__fwd/string_view.h>", "private", "<string_view>", "public" ] },
+ { include: [ "<__fwd/subrange.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__fwd/tuple.h>", "private", "<tuple>", "public" ] },
+ { include: [ "<__fwd/vector.h>", "private", "<vector>", "public" ] },
+ { include: [ "<__ios/fpos.h>", "private", "<ios>", "public" ] },
+ { include: [ "<__iterator/access.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/advance.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/back_insert_iterator.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/bounded_iter.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/common_iterator.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/concepts.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/counted_iterator.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/cpp17_iterator_concepts.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/data.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/default_sentinel.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/distance.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/empty.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/erase_if_container.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/front_insert_iterator.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/incrementable_traits.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/indirectly_comparable.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/insert_iterator.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/istream_iterator.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/istreambuf_iterator.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/iter_move.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/iter_swap.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/iterator.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/iterator_traits.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/iterator_with_data.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/mergeable.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/move_iterator.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/move_sentinel.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/next.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/ostream_iterator.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/ostreambuf_iterator.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/permutable.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/prev.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/projected.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/ranges_iterator_traits.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/readable_traits.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/reverse_access.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/reverse_iterator.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/segmented_iterator.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/size.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/sortable.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/unreachable_sentinel.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/wrap_iter.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__locale_dir/locale_base_api.h>", "private", "<locale>", "public" ] },
+ { include: [ "<__locale_dir/locale_base_api/android.h>", "private", "<locale>", "public" ] },
+ { include: [ "<__locale_dir/locale_base_api/bsd_locale_defaults.h>", "private", "<locale>", "public" ] },
+ { include: [ "<__locale_dir/locale_base_api/bsd_locale_fallbacks.h>", "private", "<locale>", "public" ] },
+ { include: [ "<__locale_dir/locale_base_api/fuchsia.h>", "private", "<locale>", "public" ] },
+ { include: [ "<__locale_dir/locale_base_api/ibm.h>", "private", "<locale>", "public" ] },
+ { include: [ "<__locale_dir/locale_base_api/locale_guard.h>", "private", "<locale>", "public" ] },
+ { include: [ "<__locale_dir/locale_base_api/musl.h>", "private", "<locale>", "public" ] },
+ { include: [ "<__locale_dir/locale_base_api/newlib.h>", "private", "<locale>", "public" ] },
+ { include: [ "<__locale_dir/locale_base_api/openbsd.h>", "private", "<locale>", "public" ] },
+ { include: [ "<__locale_dir/locale_base_api/win32.h>", "private", "<locale>", "public" ] },
+ { include: [ "<__math/abs.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/copysign.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/error_functions.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/exponential_functions.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/fdim.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/fma.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/gamma.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/hyperbolic_functions.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/hypot.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/inverse_hyperbolic_functions.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/inverse_trigonometric_functions.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/logarithms.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/min_max.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/modulo.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/remainder.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/roots.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/rounding_functions.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/traits.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/trigonometric_functions.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__mdspan/default_accessor.h>", "private", "<mdspan>", "public" ] },
+ { include: [ "<__mdspan/extents.h>", "private", "<mdspan>", "public" ] },
+ { include: [ "<__mdspan/layout_left.h>", "private", "<mdspan>", "public" ] },
+ { include: [ "<__mdspan/layout_right.h>", "private", "<mdspan>", "public" ] },
+ { include: [ "<__mdspan/layout_stride.h>", "private", "<mdspan>", "public" ] },
+ { include: [ "<__mdspan/mdspan.h>", "private", "<mdspan>", "public" ] },
+ { include: [ "<__memory/addressof.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/align.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/aligned_alloc.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/allocate_at_least.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/allocation_guard.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/allocator.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/allocator_arg_t.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/allocator_destructor.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/allocator_traits.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/assume_aligned.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/auto_ptr.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/builtin_new_allocator.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/compressed_pair.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/concepts.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/construct_at.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/destruct_n.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/pointer_traits.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/ranges_construct_at.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/ranges_uninitialized_algorithms.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/raw_storage_iterator.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/shared_ptr.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/swap_allocator.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/temp_value.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/temporary_buffer.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/uninitialized_algorithms.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/unique_ptr.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/uses_allocator.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/uses_allocator_construction.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/voidify.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory_resource/memory_resource.h>", "private", "<memory_resource>", "public" ] },
+ { include: [ "<__memory_resource/monotonic_buffer_resource.h>", "private", "<memory_resource>", "public" ] },
+ { include: [ "<__memory_resource/polymorphic_allocator.h>", "private", "<memory_resource>", "public" ] },
+ { include: [ "<__memory_resource/pool_options.h>", "private", "<memory_resource>", "public" ] },
+ { include: [ "<__memory_resource/synchronized_pool_resource.h>", "private", "<memory_resource>", "public" ] },
+ { include: [ "<__memory_resource/unsynchronized_pool_resource.h>", "private", "<memory_resource>", "public" ] },
+ { include: [ "<__mutex/lock_guard.h>", "private", "<mutex>", "public" ] },
+ { include: [ "<__mutex/mutex.h>", "private", "<mutex>", "public" ] },
+ { include: [ "<__mutex/once_flag.h>", "private", "<mutex>", "public" ] },
+ { include: [ "<__mutex/tag_types.h>", "private", "<mutex>", "public" ] },
+ { include: [ "<__mutex/unique_lock.h>", "private", "<mutex>", "public" ] },
+ { include: [ "<__numeric/accumulate.h>", "private", "<numeric>", "public" ] },
+ { include: [ "<__numeric/adjacent_difference.h>", "private", "<numeric>", "public" ] },
+ { include: [ "<__numeric/exclusive_scan.h>", "private", "<numeric>", "public" ] },
+ { include: [ "<__numeric/gcd_lcm.h>", "private", "<numeric>", "public" ] },
+ { include: [ "<__numeric/inclusive_scan.h>", "private", "<numeric>", "public" ] },
+ { include: [ "<__numeric/inner_product.h>", "private", "<numeric>", "public" ] },
+ { include: [ "<__numeric/iota.h>", "private", "<numeric>", "public" ] },
+ { include: [ "<__numeric/midpoint.h>", "private", "<numeric>", "public" ] },
+ { include: [ "<__numeric/partial_sum.h>", "private", "<numeric>", "public" ] },
+ { include: [ "<__numeric/pstl_reduce.h>", "private", "<numeric>", "public" ] },
+ { include: [ "<__numeric/pstl_transform_reduce.h>", "private", "<numeric>", "public" ] },
+ { include: [ "<__numeric/reduce.h>", "private", "<numeric>", "public" ] },
+ { include: [ "<__numeric/saturation_arithmetic.h>", "private", "<numeric>", "public" ] },
+ { include: [ "<__numeric/transform_exclusive_scan.h>", "private", "<numeric>", "public" ] },
+ { include: [ "<__numeric/transform_inclusive_scan.h>", "private", "<numeric>", "public" ] },
+ { include: [ "<__numeric/transform_reduce.h>", "private", "<numeric>", "public" ] },
+ { include: [ "<__random/bernoulli_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/binomial_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/cauchy_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/chi_squared_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/clamp_to_integral.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/default_random_engine.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/discard_block_engine.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/discrete_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/exponential_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/extreme_value_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/fisher_f_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/gamma_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/generate_canonical.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/geometric_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/independent_bits_engine.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/is_seed_sequence.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/is_valid.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/knuth_b.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/linear_congruential_engine.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/log2.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/lognormal_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/mersenne_twister_engine.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/negative_binomial_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/normal_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/piecewise_constant_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/piecewise_linear_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/poisson_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/random_device.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/ranlux.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/seed_seq.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/shuffle_order_engine.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/student_t_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/subtract_with_carry_engine.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/uniform_int_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/uniform_random_bit_generator.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/uniform_real_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/weibull_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__ranges/access.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/all.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/as_rvalue_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/chunk_by_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/common_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/concepts.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/container_compatible_range.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/counted.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/dangling.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/data.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/drop_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/drop_while_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/elements_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/empty.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/empty_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/enable_borrowed_range.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/enable_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/filter_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/from_range.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/iota_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/istream_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/join_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/lazy_split_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/movable_box.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/non_propagating_cache.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/owning_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/range_adaptor.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/rbegin.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/ref_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/rend.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/repeat_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/reverse_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/single_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/size.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/split_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/subrange.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/take_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/take_while_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/to.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/transform_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/view_interface.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/views.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/zip_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__stop_token/atomic_unique_lock.h>", "private", "<stop_token>", "public" ] },
+ { include: [ "<__stop_token/intrusive_list_view.h>", "private", "<stop_token>", "public" ] },
+ { include: [ "<__stop_token/intrusive_shared_ptr.h>", "private", "<stop_token>", "public" ] },
+ { include: [ "<__stop_token/stop_callback.h>", "private", "<stop_token>", "public" ] },
+ { include: [ "<__stop_token/stop_source.h>", "private", "<stop_token>", "public" ] },
+ { include: [ "<__stop_token/stop_state.h>", "private", "<stop_token>", "public" ] },
+ { include: [ "<__stop_token/stop_token.h>", "private", "<stop_token>", "public" ] },
+ { include: [ "<__string/char_traits.h>", "private", "<string>", "public" ] },
+ { include: [ "<__string/constexpr_c_functions.h>", "private", "<string>", "public" ] },
+ { include: [ "<__string/extern_template_lists.h>", "private", "<string>", "public" ] },
+ { include: [ "<__system_error/errc.h>", "private", "<system_error>", "public" ] },
+ { include: [ "<__system_error/error_category.h>", "private", "<system_error>", "public" ] },
+ { include: [ "<__system_error/error_code.h>", "private", "<system_error>", "public" ] },
+ { include: [ "<__system_error/error_condition.h>", "private", "<system_error>", "public" ] },
+ { include: [ "<__system_error/system_error.h>", "private", "<system_error>", "public" ] },
+ { include: [ "<__thread/formatter.h>", "private", "<thread>", "public" ] },
+ { include: [ "<__thread/id.h>", "private", "<thread>", "public" ] },
+ { include: [ "<__thread/jthread.h>", "private", "<thread>", "public" ] },
+ { include: [ "<__thread/poll_with_backoff.h>", "private", "<thread>", "public" ] },
+ { include: [ "<__thread/support.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__thread/support.h>", "private", "<mutex>", "public" ] },
+ { include: [ "<__thread/support.h>", "private", "<semaphore>", "public" ] },
+ { include: [ "<__thread/support.h>", "private", "<thread>", "public" ] },
+ { include: [ "<__thread/support/c11.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__thread/support/c11.h>", "private", "<mutex>", "public" ] },
+ { include: [ "<__thread/support/c11.h>", "private", "<semaphore>", "public" ] },
+ { include: [ "<__thread/support/c11.h>", "private", "<thread>", "public" ] },
+ { include: [ "<__thread/support/external.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__thread/support/external.h>", "private", "<mutex>", "public" ] },
+ { include: [ "<__thread/support/external.h>", "private", "<semaphore>", "public" ] },
+ { include: [ "<__thread/support/external.h>", "private", "<thread>", "public" ] },
+ { include: [ "<__thread/support/pthread.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__thread/support/pthread.h>", "private", "<mutex>", "public" ] },
+ { include: [ "<__thread/support/pthread.h>", "private", "<semaphore>", "public" ] },
+ { include: [ "<__thread/support/pthread.h>", "private", "<thread>", "public" ] },
+ { include: [ "<__thread/support/windows.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__thread/support/windows.h>", "private", "<mutex>", "public" ] },
+ { include: [ "<__thread/support/windows.h>", "private", "<semaphore>", "public" ] },
+ { include: [ "<__thread/support/windows.h>", "private", "<thread>", "public" ] },
+ { include: [ "<__thread/this_thread.h>", "private", "<thread>", "public" ] },
+ { include: [ "<__thread/thread.h>", "private", "<thread>", "public" ] },
+ { include: [ "<__thread/timed_backoff_policy.h>", "private", "<thread>", "public" ] },
+ { include: [ "<__tuple/find_index.h>", "private", "<tuple>", "public" ] },
+ { include: [ "<__tuple/make_tuple_types.h>", "private", "<tuple>", "public" ] },
+ { include: [ "<__tuple/sfinae_helpers.h>", "private", "<tuple>", "public" ] },
+ { include: [ "<__tuple/tuple_element.h>", "private", "<tuple>", "public" ] },
+ { include: [ "<__tuple/tuple_indices.h>", "private", "<tuple>", "public" ] },
+ { include: [ "<__tuple/tuple_like.h>", "private", "<tuple>", "public" ] },
+ { include: [ "<__tuple/tuple_like_ext.h>", "private", "<tuple>", "public" ] },
+ { include: [ "<__tuple/tuple_like_no_subrange.h>", "private", "<tuple>", "public" ] },
+ { include: [ "<__tuple/tuple_size.h>", "private", "<tuple>", "public" ] },
+ { include: [ "<__tuple/tuple_types.h>", "private", "<tuple>", "public" ] },
+ { include: [ "<__type_traits/add_const.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/add_cv.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/add_lvalue_reference.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/add_pointer.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/add_rvalue_reference.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/add_volatile.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/aligned_storage.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/aligned_union.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/alignment_of.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/apply_cv.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/can_extract_key.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/common_reference.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/common_type.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/conditional.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/conjunction.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/copy_cv.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/copy_cvref.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/datasizeof.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/decay.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/dependent_type.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/desugars_to.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/disjunction.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/enable_if.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/extent.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/has_unique_object_representation.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/has_virtual_destructor.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/integral_constant.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/invoke.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_abstract.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_aggregate.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_allocator.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_always_bitcastable.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_arithmetic.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_array.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_assignable.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_base_of.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_bounded_array.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_callable.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_char_like_type.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_class.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_compound.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_const.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_constant_evaluated.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_constructible.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_convertible.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_core_convertible.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_destructible.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_empty.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_enum.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_equality_comparable.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_execution_policy.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_final.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_floating_point.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_function.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_fundamental.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_implicitly_default_constructible.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_integral.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_literal_type.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_member_function_pointer.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_member_object_pointer.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_member_pointer.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_nothrow_assignable.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_nothrow_constructible.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_nothrow_convertible.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_nothrow_destructible.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_null_pointer.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_object.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_pod.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_pointer.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_polymorphic.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_primary_template.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_reference.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_reference_wrapper.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_referenceable.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_same.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_scalar.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_scoped_enum.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_signed.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_signed_integer.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_specialization.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_standard_layout.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_swappable.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_trivial.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_trivially_assignable.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_trivially_constructible.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_trivially_copyable.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_trivially_destructible.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_trivially_lexicographically_comparable.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_trivially_relocatable.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_unbounded_array.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_union.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_unsigned.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_unsigned_integer.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_valid_expansion.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_void.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_volatile.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/lazy.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/make_32_64_or_128_bit.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/make_const_lvalue_ref.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/make_signed.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/make_unsigned.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/maybe_const.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/nat.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/negation.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/noexcept_move_assign_container.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/promote.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/rank.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/remove_all_extents.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/remove_const.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/remove_const_ref.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/remove_cv.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/remove_cvref.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/remove_extent.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/remove_pointer.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/remove_reference.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/remove_volatile.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/result_of.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/strip_signature.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/type_identity.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/type_list.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/underlying_type.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/unwrap_ref.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/void_t.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__utility/as_const.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/as_lvalue.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/auto_cast.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/cmp.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/convert_to_integral.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/declval.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/empty.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/exception_guard.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/exchange.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/forward.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/forward_like.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/in_place.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/integer_sequence.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/is_pointer_in_range.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/move.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/no_destroy.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/pair.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/piecewise_construct.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/priority_tag.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/rel_ops.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/small_buffer.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/swap.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/to_underlying.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/unreachable.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__variant/monostate.h>", "private", "<variant>", "public" ] },
+]
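These entries use the include-what-you-use mapping format: each row pairs a private libc++ detail header with the public header that user code should spell instead. A minimal sketch of what the mapping buys from the user's side (illustration only, nothing below is part of the patch):

#include <algorithm>
#include <vector>

int main() {
  std::vector<int> v{1, 2, 3, 4};
  // std::upper_bound lives in <__algorithm/upper_bound.h>, but the mapping
  // tells an IWYU-style tool to report the public spelling, <algorithm>.
  return *std::upper_bound(v.begin(), v.end(), 2) == 3 ? 0 : 1;
}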
diff --git a/libcxx/include/list b/libcxx/include/list
index 90bddcc..610a24e 100644
--- a/libcxx/include/list
+++ b/libcxx/include/list
@@ -567,7 +567,6 @@ protected:
return __guard.__release_ptr();
}
- template <class... _Args>
_LIBCPP_HIDE_FROM_ABI void __delete_node(__node_pointer __node) {
// For the same reason as above, we use the allocator's destroy() method for the value_type,
// but not for the node itself.
diff --git a/libcxx/include/locale b/libcxx/include/locale
index 748b276..041d7bc 100644
--- a/libcxx/include/locale
+++ b/libcxx/include/locale
@@ -368,7 +368,11 @@ struct _LIBCPP_EXPORTED_FROM_ABI __num_get_base {
static const int __num_get_buf_sz = 40;
static int __get_base(ios_base&);
- static const char __src[33];
+ static const char __src[33]; // "0123456789abcdefABCDEFxX+-pPiInN"
+ // count of leading characters in __src used for parsing integers ("012..X+-")
+ static const size_t __int_chr_cnt = 26;
+ // count of leading characters in __src used for parsing floating-point values ("012..-pP")
+ static const size_t __fp_chr_cnt = 28;
};
_LIBCPP_EXPORTED_FROM_ABI void
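The two new constants name prefixes of the __src literal quoted in the comment above. A quick compile-time check of that arithmetic (illustration only; the literal is copied from the comment, not from the patched source):

#include <cstddef>

constexpr std::size_t kIntChars = sizeof("0123456789abcdefABCDEFxX+-") - 1;    // 26, matches __int_chr_cnt
constexpr std::size_t kFpChars  = sizeof("0123456789abcdefABCDEFxX+-pP") - 1;  // 28, matches __fp_chr_cnt
static_assert(kIntChars == 26 && kFpChars == 28, "counts used by the hunks below");
static_assert(sizeof("0123456789abcdefABCDEFxX+-pPiInN") == 33, "full table plus the null terminator");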
@@ -431,7 +435,7 @@ private:
template <typename _Tp>
const _Tp* __do_widen_p(ios_base& __iob, _Tp* __atoms) const {
locale __loc = __iob.getloc();
- use_facet<ctype<_Tp> >(__loc).widen(__src, __src + 26, __atoms);
+ use_facet<ctype<_Tp> >(__loc).widen(__src, __src + __int_chr_cnt, __atoms);
return __atoms;
}
@@ -447,7 +451,7 @@ private:
template <class _CharT>
string __num_get<_CharT>::__stage2_int_prep(ios_base& __iob, _CharT* __atoms, _CharT& __thousands_sep) {
locale __loc = __iob.getloc();
- std::use_facet<ctype<_CharT> >(__loc).widen(__src, __src + 26, __atoms);
+ std::use_facet<ctype<_CharT> >(__loc).widen(__src, __src + __int_chr_cnt, __atoms);
const numpunct<_CharT>& __np = std::use_facet<numpunct<_CharT> >(__loc);
__thousands_sep = __np.thousands_sep();
return __np.grouping();
@@ -458,7 +462,7 @@ template <class _CharT>
string __num_get<_CharT>::__stage2_float_prep(
ios_base& __iob, _CharT* __atoms, _CharT& __decimal_point, _CharT& __thousands_sep) {
locale __loc = __iob.getloc();
- std::use_facet<ctype<_CharT> >(__loc).widen(__src, __src + 32, __atoms);
+ std::use_facet<ctype<_CharT> >(__loc).widen(__src, __src + __fp_chr_cnt, __atoms);
const numpunct<_CharT>& __np = std::use_facet<numpunct<_CharT> >(__loc);
__decimal_point = __np.decimal_point();
__thousands_sep = __np.thousands_sep();
@@ -490,7 +494,7 @@ __num_get<_CharT>::__stage2_int_loop(_CharT __ct, int __base, char* __a, char*&
}
return 0;
}
- ptrdiff_t __f = std::find(__atoms, __atoms + 26, __ct) - __atoms;
+ ptrdiff_t __f = std::find(__atoms, __atoms + __int_chr_cnt, __ct) - __atoms;
if (__f >= 24)
return -1;
switch (__base) {
@@ -546,8 +550,8 @@ int __num_get<_CharT>::__stage2_float_loop(
}
return 0;
}
- ptrdiff_t __f = std::find(__atoms, __atoms + 32, __ct) - __atoms;
- if (__f >= 32)
+ ptrdiff_t __f = std::find(__atoms, __atoms + __num_get_base::__fp_chr_cnt, __ct) - __atoms;
+ if (__f >= static_cast<ptrdiff_t>(__num_get_base::__fp_chr_cnt))
return -1;
char __x = __src[__f];
if (__x == '-' || __x == '+') {
@@ -846,7 +850,7 @@ _InputIterator num_get<_CharT, _InputIterator>::__do_get_signed(
int __base = this->__get_base(__iob);
// Stage 2
char_type __thousands_sep;
- const int __atoms_size = 26;
+ const int __atoms_size = __num_get_base::__int_chr_cnt;
#ifdef _LIBCPP_ABI_OPTIMIZED_LOCALE_NUM_GET
char_type __atoms1[__atoms_size];
const char_type* __atoms = this->__do_widen(__iob, __atoms1);
@@ -895,7 +899,7 @@ _InputIterator num_get<_CharT, _InputIterator>::__do_get_unsigned(
int __base = this->__get_base(__iob);
// Stage 2
char_type __thousands_sep;
- const int __atoms_size = 26;
+ const int __atoms_size = __num_get_base::__int_chr_cnt;
#ifdef _LIBCPP_ABI_OPTIMIZED_LOCALE_NUM_GET
char_type __atoms1[__atoms_size];
const char_type* __atoms = this->__do_widen(__iob, __atoms1);
@@ -942,7 +946,7 @@ _InputIterator num_get<_CharT, _InputIterator>::__do_get_floating_point(
iter_type __b, iter_type __e, ios_base& __iob, ios_base::iostate& __err, _Fp& __v) const {
// Stage 1, nothing to do
// Stage 2
- char_type __atoms[32];
+ char_type __atoms[__num_get_base::__fp_chr_cnt];
char_type __decimal_point;
char_type __thousands_sep;
string __grouping = this->__stage2_float_prep(__iob, __atoms, __decimal_point, __thousands_sep);
@@ -951,10 +955,11 @@ _InputIterator num_get<_CharT, _InputIterator>::__do_get_floating_point(
char* __a = &__buf[0];
char* __a_end = __a;
unsigned __g[__num_get_base::__num_get_buf_sz];
- unsigned* __g_end = __g;
- unsigned __dc = 0;
- bool __in_units = true;
- char __exp = 'E';
+ unsigned* __g_end = __g;
+ unsigned __dc = 0;
+ bool __in_units = true;
+ char __exp = 'E';
+ bool __is_leading_parsed = false;
for (; __b != __e; ++__b) {
if (__a_end == __a + __buf.size()) {
size_t __tmp = __buf.size();
@@ -977,6 +982,21 @@ _InputIterator num_get<_CharT, _InputIterator>::__do_get_floating_point(
__dc,
__atoms))
break;
+
+ // the leading character excluding the sign must be a decimal digit
+ if (!__is_leading_parsed) {
+ if (__a_end - __a >= 1 && __a[0] != '-' && __a[0] != '+') {
+ if ('0' <= __a[0] && __a[0] <= '9')
+ __is_leading_parsed = true;
+ else
+ break;
+ } else if (__a_end - __a >= 2 && (__a[0] == '-' || __a[0] == '+')) {
+ if ('0' <= __a[1] && __a[1] <= '9')
+ __is_leading_parsed = true;
+ else
+ break;
+ }
+ }
}
if (__grouping.size() != 0 && __in_units && __g_end - __g < __num_get_base::__num_get_buf_sz)
*__g_end++ = __dc;
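The new block above stops accumulating characters as soon as the first character after an optional sign turns out not to be a decimal digit. A standalone sketch of that predicate (hypothetical helper, not part of libc++; the real loop additionally keeps waiting while only a sign has been read so far):

#include <cstddef>

bool leading_char_is_digit(const char* a, std::size_t n) {
  // skip one leading sign, if present
  std::size_t i = (n != 0 && (a[0] == '-' || a[0] == '+')) ? 1 : 0;
  return i < n && '0' <= a[i] && a[i] <= '9';
}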
@@ -996,10 +1016,11 @@ _InputIterator num_get<_CharT, _InputIterator>::do_get(
// Stage 1
int __base = 16;
// Stage 2
- char_type __atoms[26];
+ char_type __atoms[__num_get_base::__int_chr_cnt];
char_type __thousands_sep = char_type();
string __grouping;
- std::use_facet<ctype<_CharT> >(__iob.getloc()).widen(__num_get_base::__src, __num_get_base::__src + 26, __atoms);
+ std::use_facet<ctype<_CharT> >(__iob.getloc())
+ .widen(__num_get_base::__src, __num_get_base::__src + __num_get_base::__int_chr_cnt, __atoms);
string __buf;
__buf.resize(__buf.capacity());
char* __a = &__buf[0];
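
The num_get hunks above replace the hard-coded atom counts (26 integer characters, 32 floating-point characters) with the named __num_get_base::__int_chr_cnt and __fp_chr_cnt constants, and add a rule that the first accumulated character after an optional sign must be a decimal digit. A standalone restatement of that rule, as an illustration rather than libc++'s exact code:

#include <cstddef>
#include <string_view>

// Illustration of the leading-character rule added to the stage-2 float loop:
// once something other than a bare sign has been accumulated, the character
// after the optional sign must be a decimal digit for parsing to continue.
bool leading_is_decimal_digit(std::string_view accumulated) {
  if (accumulated.empty())
    return true; // nothing accumulated yet, keep scanning
  std::size_t i = (accumulated[0] == '+' || accumulated[0] == '-') ? 1 : 0;
  if (accumulated.size() <= i)
    return true; // only a sign so far, keep scanning
  return accumulated[i] >= '0' && accumulated[i] <= '9';
}
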
diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap
index 70dac2f..8bc94d7 100644
--- a/libcxx/include/module.modulemap
+++ b/libcxx/include/module.modulemap
@@ -1066,7 +1066,11 @@ module std_private_atomic_atomic_flag [system] {
}
module std_private_atomic_atomic_init [system] { header "__atomic/atomic_init.h" }
module std_private_atomic_atomic_lock_free [system] { header "__atomic/atomic_lock_free.h" }
-module std_private_atomic_atomic_sync [system] { header "__atomic/atomic_sync.h" }
+module std_private_atomic_atomic_ref [system] { header "__atomic/atomic_ref.h" }
+module std_private_atomic_atomic_sync [system] {
+ header "__atomic/atomic_sync.h"
+ export std_private_atomic_to_gcc_order
+}
module std_private_atomic_check_memory_order [system] { header "__atomic/check_memory_order.h" }
module std_private_atomic_contention_t [system] { header "__atomic/contention_t.h" }
module std_private_atomic_cxx_atomic_impl [system] { header "__atomic/cxx_atomic_impl.h" }
@@ -1074,6 +1078,10 @@ module std_private_atomic_fence [system] { header "__atomic/fence.
module std_private_atomic_is_always_lock_free [system] { header "__atomic/is_always_lock_free.h" }
module std_private_atomic_kill_dependency [system] { header "__atomic/kill_dependency.h" }
module std_private_atomic_memory_order [system] { header "__atomic/memory_order.h" }
+module std_private_atomic_to_gcc_order [system] {
+ header "__atomic/to_gcc_order.h"
+ export std_private_atomic_memory_order
+}
module std_private_bit_bit_cast [system] { header "__bit/bit_cast.h" }
module std_private_bit_bit_ceil [system] { header "__bit/bit_ceil.h" }
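
The modulemap hunks register the new __atomic/atomic_ref.h and __atomic/to_gcc_order.h headers and chain the exports so that importing std_private_atomic_atomic_sync re-exports the order-mapping module, which in turn re-exports std_private_atomic_memory_order. As a rough sketch only, an assumption about the helper's shape rather than libc++'s actual code, a "to GCC order" mapping translates std::memory_order into the __ATOMIC_* constants understood by the GCC/Clang atomic builtins:

#include <atomic>

constexpr int to_gcc_order(std::memory_order order) {
  // Map each std::memory_order enumerator to the corresponding builtin constant.
  return order == std::memory_order_relaxed ? __ATOMIC_RELAXED
       : order == std::memory_order_consume ? __ATOMIC_CONSUME
       : order == std::memory_order_acquire ? __ATOMIC_ACQUIRE
       : order == std::memory_order_release ? __ATOMIC_RELEASE
       : order == std::memory_order_acq_rel ? __ATOMIC_ACQ_REL
       : __ATOMIC_SEQ_CST;
}
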
diff --git a/libcxx/include/vector b/libcxx/include/vector
index 976bde9..b190557 100644
--- a/libcxx/include/vector
+++ b/libcxx/include/vector
@@ -424,11 +424,36 @@ public:
#endif
: __end_cap_(nullptr, __a) {
}
- _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI explicit vector(size_type __n);
+
+ _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI explicit vector(size_type __n) {
+ auto __guard = std::__make_exception_guard(__destroy_vector(*this));
+ if (__n > 0) {
+ __vallocate(__n);
+ __construct_at_end(__n);
+ }
+ __guard.__complete();
+ }
+
#if _LIBCPP_STD_VER >= 14
- _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI explicit vector(size_type __n, const allocator_type& __a);
+ _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI explicit vector(size_type __n, const allocator_type& __a)
+ : __end_cap_(nullptr, __a) {
+ auto __guard = std::__make_exception_guard(__destroy_vector(*this));
+ if (__n > 0) {
+ __vallocate(__n);
+ __construct_at_end(__n);
+ }
+ __guard.__complete();
+ }
#endif
- _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI vector(size_type __n, const value_type& __x);
+
+ _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI vector(size_type __n, const value_type& __x) {
+ auto __guard = std::__make_exception_guard(__destroy_vector(*this));
+ if (__n > 0) {
+ __vallocate(__n);
+ __construct_at_end(__n, __x);
+ }
+ __guard.__complete();
+ }
template <__enable_if_t<__is_allocator<_Allocator>::value, int> = 0>
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI
@@ -1126,39 +1151,6 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::__append(size_type _
}
template <class _Tp, class _Allocator>
-_LIBCPP_CONSTEXPR_SINCE_CXX20 vector<_Tp, _Allocator>::vector(size_type __n) {
- auto __guard = std::__make_exception_guard(__destroy_vector(*this));
- if (__n > 0) {
- __vallocate(__n);
- __construct_at_end(__n);
- }
- __guard.__complete();
-}
-
-#if _LIBCPP_STD_VER >= 14
-template <class _Tp, class _Allocator>
-_LIBCPP_CONSTEXPR_SINCE_CXX20 vector<_Tp, _Allocator>::vector(size_type __n, const allocator_type& __a)
- : __end_cap_(nullptr, __a) {
- auto __guard = std::__make_exception_guard(__destroy_vector(*this));
- if (__n > 0) {
- __vallocate(__n);
- __construct_at_end(__n);
- }
- __guard.__complete();
-}
-#endif
-
-template <class _Tp, class _Allocator>
-_LIBCPP_CONSTEXPR_SINCE_CXX20 vector<_Tp, _Allocator>::vector(size_type __n, const value_type& __x) {
- auto __guard = std::__make_exception_guard(__destroy_vector(*this));
- if (__n > 0) {
- __vallocate(__n);
- __construct_at_end(__n, __x);
- }
- __guard.__complete();
-}
-
-template <class _Tp, class _Allocator>
template <class _InputIterator,
__enable_if_t<__has_exactly_input_iterator_category<_InputIterator>::value &&
is_constructible<_Tp, typename iterator_traits<_InputIterator>::reference>::value,
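
The vector hunks move the vector(size_type) constructor family from out-of-line definitions into the class body; the bodies are otherwise unchanged and still use the exception-guard idiom, which destroys the partially constructed container if __construct_at_end throws and is dismissed on success. A minimal sketch of that idiom with hypothetical names, not libc++'s __make_exception_guard machinery:

#include <utility>

// Runs the stored rollback in the destructor unless complete() was called,
// mirroring how the constructors above undo a partial construction on throw.
template <class Rollback>
class scoped_rollback {
  Rollback rollback_;
  bool completed_ = false;

public:
  explicit scoped_rollback(Rollback r) : rollback_(std::move(r)) {}
  void complete() noexcept { completed_ = true; }
  ~scoped_rollback() {
    if (!completed_)
      rollback_();
  }
};
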
diff --git a/libcxx/modules/std/atomic.inc b/libcxx/modules/std/atomic.inc
index d77d7a5..e8cf90d 100644
--- a/libcxx/modules/std/atomic.inc
+++ b/libcxx/modules/std/atomic.inc
@@ -22,7 +22,7 @@ export namespace std {
// [atomics.ref.generic], class template atomic_ref
// [atomics.ref.pointer], partial specialization for pointers
- // using std::atomic_ref _LIBCPP_USING_IF_EXISTS;
+ using std::atomic_ref _LIBCPP_USING_IF_EXISTS;
// [atomics.types.generic], class template atomic
using std::atomic _LIBCPP_USING_IF_EXISTS;
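
With atomic_ref now available in libc++, the std module exports std::atomic_ref instead of leaving the using-declaration commented out. A small usage sketch (written with #include <atomic> rather than import std; to stay self-contained):

#include <atomic>

int main() {
  int value = 0;
  std::atomic_ref<int> ref(value);             // non-owning atomic view of a plain int
  ref.fetch_add(1, std::memory_order_relaxed); // atomic increment through the reference
  return ref.load() == 1 ? 0 : 1;
}
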
diff --git a/libcxx/test/libcxx/atomics/atomics.ref/assert.compare_exchange_strong.pass.cpp b/libcxx/test/libcxx/atomics/atomics.ref/assert.compare_exchange_strong.pass.cpp
new file mode 100644
index 0000000..066ed11
--- /dev/null
+++ b/libcxx/test/libcxx/atomics/atomics.ref/assert.compare_exchange_strong.pass.cpp
@@ -0,0 +1,58 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: libcpp-hardening-mode=none || libcpp-hardening-mode=fast
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+// ADDITIONAL_COMPILE_FLAGS: -Wno-user-defined-warnings
+
+// <atomic>
+
+// bool compare_exchange_strong(T& expected, T desired, memory_order success, memory_order failure) const noexcept;
+//
+// Preconditions: failure is memory_order::relaxed, memory_order::consume, memory_order::acquire, or memory_order::seq_cst.
+
+#include <atomic>
+
+#include "atomic_helpers.h"
+#include "check_assertion.h"
+
+template <typename T>
+struct TestCompareExchangeStrongInvalidMemoryOrder {
+ void operator()() const {
+ { // no assertion should trigger here
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+ T t(T(2));
+ a.compare_exchange_strong(t, T(3), std::memory_order_relaxed, std::memory_order_relaxed);
+ }
+
+ TEST_LIBCPP_ASSERT_FAILURE(
+ ([] {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+ T t(T(2));
+ a.compare_exchange_strong(t, T(3), std::memory_order_relaxed, std::memory_order_release);
+ }()),
+ "atomic_ref: failure memory order argument to strong atomic compare-and-exchange operation is invalid");
+
+ TEST_LIBCPP_ASSERT_FAILURE(
+ ([] {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+ T t(T(2));
+ a.compare_exchange_strong(t, T(3), std::memory_order_relaxed, std::memory_order_acq_rel);
+ }()),
+ "atomic_ref: failure memory order argument to strong atomic compare-and-exchange operation is invalid");
+ }
+};
+
+int main(int, char**) {
+ TestEachAtomicType<TestCompareExchangeStrongInvalidMemoryOrder>()();
+ return 0;
+}
diff --git a/libcxx/test/libcxx/atomics/atomics.ref/assert.compare_exchange_weak.pass.cpp b/libcxx/test/libcxx/atomics/atomics.ref/assert.compare_exchange_weak.pass.cpp
new file mode 100644
index 0000000..e83a143
--- /dev/null
+++ b/libcxx/test/libcxx/atomics/atomics.ref/assert.compare_exchange_weak.pass.cpp
@@ -0,0 +1,58 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: libcpp-hardening-mode=none || libcpp-hardening-mode=fast
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+// ADDITIONAL_COMPILE_FLAGS: -Wno-user-defined-warnings
+
+// <atomic>
+
+// bool compare_exchange_weak(T& expected, T desired, memory_order success, memory_order failure) const noexcept;
+//
+// Preconditions: failure is memory_order::relaxed, memory_order::consume, memory_order::acquire, or memory_order::seq_cst.
+
+#include <atomic>
+
+#include "atomic_helpers.h"
+#include "check_assertion.h"
+
+template <typename T>
+struct TestCompareExchangeWeakInvalidMemoryOrder {
+ void operator()() const {
+ { // no assertion should trigger here
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+ T t(T(2));
+ a.compare_exchange_weak(t, T(3), std::memory_order_relaxed, std::memory_order_relaxed);
+ }
+
+ TEST_LIBCPP_ASSERT_FAILURE(
+ ([] {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+ T t(T(2));
+ a.compare_exchange_weak(t, T(3), std::memory_order_relaxed, std::memory_order_release);
+ }()),
+ "atomic_ref: failure memory order argument to weak atomic compare-and-exchange operation is invalid");
+
+ TEST_LIBCPP_ASSERT_FAILURE(
+ ([] {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+ T t(T(2));
+ a.compare_exchange_weak(t, T(3), std::memory_order_relaxed, std::memory_order_acq_rel);
+ }()),
+ "atomic_ref: failure memory order argument to weak atomic compare-and-exchange operation is invalid");
+ }
+};
+
+int main(int, char**) {
+ TestEachAtomicType<TestCompareExchangeWeakInvalidMemoryOrder>()();
+ return 0;
+}
diff --git a/libcxx/test/libcxx/atomics/atomics.ref/assert.ctor.pass.cpp b/libcxx/test/libcxx/atomics/atomics.ref/assert.ctor.pass.cpp
new file mode 100644
index 0000000..ef3705d
--- /dev/null
+++ b/libcxx/test/libcxx/atomics/atomics.ref/assert.ctor.pass.cpp
@@ -0,0 +1,40 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: libcpp-hardening-mode=none || libcpp-hardening-mode=fast
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+// <atomic>
+
+// atomic_ref(T& obj);
+//
+// Preconditions: The referenced object is aligned to required_alignment.
+
+#include <atomic>
+#include <cstddef>
+
+#include "check_assertion.h"
+
+int main(int, char**) {
+ { // no assertion should trigger here
+ alignas(float) std::byte c[sizeof(float)];
+ float* f = new (c) float(3.14f);
+ [[maybe_unused]] std::atomic_ref<float> r(*f);
+ }
+
+ TEST_LIBCPP_ASSERT_FAILURE(
+ ([] {
+ alignas(float) std::byte c[2 * sizeof(float)]; // intentionally larger
+ float* f = new (c + 1) float(3.14f); // intentionally misaligned
+ [[maybe_unused]] std::atomic_ref<float> r(*f);
+ }()),
+ "atomic_ref ctor: referenced object must be aligned to required_alignment");
+
+ return 0;
+}
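
The assert.ctor test above exercises the constructor precondition that the referenced object be aligned to std::atomic_ref<T>::required_alignment, which may be stricter than alignof(T). A short sketch of how user code can satisfy that precondition when declaring the shared object:

#include <atomic>
#include <cstdint>

// Give the field atomic_ref's required alignment so any atomic_ref built over
// it meets the precondition the hardened assertion checks.
struct Counters {
  alignas(std::atomic_ref<std::int64_t>::required_alignment) std::int64_t hits = 0;
};

int main() {
  Counters c;
  std::atomic_ref<std::int64_t> hits(c.hits);
  hits.fetch_add(1, std::memory_order_relaxed);
  return hits.load() == 1 ? 0 : 1;
}
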
diff --git a/libcxx/test/libcxx/atomics/atomics.ref/assert.load.pass.cpp b/libcxx/test/libcxx/atomics/atomics.ref/assert.load.pass.cpp
new file mode 100644
index 0000000..bc92b3d
--- /dev/null
+++ b/libcxx/test/libcxx/atomics/atomics.ref/assert.load.pass.cpp
@@ -0,0 +1,55 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: libcpp-hardening-mode=none || libcpp-hardening-mode=fast
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+// ADDITIONAL_COMPILE_FLAGS: -Wno-user-defined-warnings
+
+// <atomic>
+
+// T load(memory_order order = memory_order::seq_cst) const noexcept;
+//
+// Preconditions: order is memory_order::relaxed, memory_order::consume, memory_order::acquire, or memory_order::seq_cst.
+
+#include <atomic>
+
+#include "atomic_helpers.h"
+#include "check_assertion.h"
+
+template <typename T>
+struct TestLoadInvalidMemoryOrder {
+ void operator()() const {
+ { // no assertion should trigger here
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+ (void)a.load(std::memory_order_relaxed);
+ }
+
+ TEST_LIBCPP_ASSERT_FAILURE(
+ ([] {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+ (void)a.load(std::memory_order_release);
+ }()),
+ "atomic_ref: memory order argument to atomic load operation is invalid");
+
+ TEST_LIBCPP_ASSERT_FAILURE(
+ ([] {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+ (void)a.load(std::memory_order_acq_rel);
+ }()),
+ "atomic_ref: memory order argument to atomic load operation is invalid");
+ }
+};
+
+int main(int, char**) {
+ TestEachAtomicType<TestLoadInvalidMemoryOrder>()();
+ return 0;
+}
diff --git a/libcxx/test/libcxx/atomics/atomics.ref/assert.store.pass.cpp b/libcxx/test/libcxx/atomics/atomics.ref/assert.store.pass.cpp
new file mode 100644
index 0000000..ab0d4a2
--- /dev/null
+++ b/libcxx/test/libcxx/atomics/atomics.ref/assert.store.pass.cpp
@@ -0,0 +1,63 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: libcpp-hardening-mode=none || libcpp-hardening-mode=fast
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+// ADDITIONAL_COMPILE_FLAGS: -Wno-user-defined-warnings
+
+// <atomic>
+
+// void store(T desired, memory_order order = memory_order::seq_cst) const noexcept;
+//
+// Preconditions: order is memory_order::relaxed, memory_order::release, or memory_order::seq_cst.
+
+#include <atomic>
+
+#include "atomic_helpers.h"
+#include "check_assertion.h"
+
+template <typename T>
+struct TestStoreInvalidMemoryOrder {
+ void operator()() const {
+ { // no assertion should trigger here
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+ a.store(T(2), std::memory_order_relaxed);
+ }
+
+ TEST_LIBCPP_ASSERT_FAILURE(
+ ([] {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+ a.store(T(2), std::memory_order_consume);
+ }()),
+ "atomic_ref: memory order argument to atomic store operation is invalid");
+
+ TEST_LIBCPP_ASSERT_FAILURE(
+ ([] {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+ a.store(T(2), std::memory_order_acquire);
+ }()),
+ "atomic_ref: memory order argument to atomic store operation is invalid");
+
+ TEST_LIBCPP_ASSERT_FAILURE(
+ ([] {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+ a.store(T(2), std::memory_order_acq_rel);
+ }()),
+ "atomic_ref: memory order argument to atomic store operation is invalid");
+ }
+};
+
+int main(int, char**) {
+ TestEachAtomicType<TestStoreInvalidMemoryOrder>()();
+ return 0;
+}
diff --git a/libcxx/test/libcxx/atomics/atomics.ref/assert.wait.pass.cpp b/libcxx/test/libcxx/atomics/atomics.ref/assert.wait.pass.cpp
new file mode 100644
index 0000000..dcec2fb
--- /dev/null
+++ b/libcxx/test/libcxx/atomics/atomics.ref/assert.wait.pass.cpp
@@ -0,0 +1,55 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: libcpp-hardening-mode=none || libcpp-hardening-mode=fast
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+// ADDITIONAL_COMPILE_FLAGS: -Wno-user-defined-warnings
+
+// <atomic>
+
+// void wait(T old, memory_order order = memory_order::seq_cst) const noexcept;
+//
+// Preconditions: order is memory_order::relaxed, memory_order::consume, memory_order::acquire, or memory_order::seq_cst.
+
+#include <atomic>
+
+#include "atomic_helpers.h"
+#include "check_assertion.h"
+
+template <typename T>
+struct TestWaitInvalidMemoryOrder {
+ void operator()() const {
+ { // no assertion should trigger here
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+ a.wait(T(2), std::memory_order_relaxed);
+ }
+
+ TEST_LIBCPP_ASSERT_FAILURE(
+ ([] {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+ a.wait(T(2), std::memory_order_release);
+ }()),
+ "atomic_ref: memory order argument to atomic wait operation is invalid");
+
+ TEST_LIBCPP_ASSERT_FAILURE(
+ ([] {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+ a.wait(T(2), std::memory_order_acq_rel);
+ }()),
+ "atomic_ref: memory order argument to atomic wait operation is invalid");
+ }
+};
+
+int main(int, char**) {
+ TestEachAtomicType<TestWaitInvalidMemoryOrder>()();
+ return 0;
+}
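
Taken together, the assertion tests above pin down which memory orders the hardened build accepts for each atomic_ref operation: load and wait reject release and acq_rel, store rejects consume, acquire, and acq_rel, and the failure order of a compare-exchange rejects release and acq_rel. A compact sketch of calls that stay within those preconditions:

#include <atomic>

int main() {
  int v = 0;
  std::atomic_ref<int> r(v);

  r.store(1, std::memory_order_release);    // store: relaxed, release, seq_cst
  (void)r.load(std::memory_order_acquire);  // load: relaxed, consume, acquire, seq_cst
  r.wait(0, std::memory_order_acquire);     // wait: same valid set as load; returns since v != 0

  int expected = 1;
  // success order may be anything; failure order must not be release or acq_rel
  r.compare_exchange_strong(expected, 2, std::memory_order_acq_rel, std::memory_order_acquire);
  return 0;
}
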
diff --git a/libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp b/libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp
index f94ceaf..aa3ce21 100644
--- a/libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp
+++ b/libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp
@@ -21,6 +21,9 @@
// GCC doesn't support the aligned-allocation flags.
// XFAIL: gcc
+// TODO(mordante) fix this test after updating clang in Docker
+// UNSUPPORTED: clang-15, clang-16, clang-17, clang-18, clang-19
+
// RUN: %{build} -faligned-allocation -fsized-deallocation
// RUN: %{run}
// RUN: %{build} -faligned-allocation -fno-sized-deallocation -DNO_SIZE
diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/pstl.exception_handling.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/pstl.exception_handling.pass.cpp
deleted file mode 100644
index dda642b..0000000
--- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/pstl.exception_handling.pass.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03, c++11, c++14
-// UNSUPPORTED: no-exceptions
-// `check_assertion.h` requires Unix headers and regex support.
-// UNSUPPORTED: !has-unix-headers, no-localization
-
-// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-
-// check that std::fill(ExecutionPolicy) and std::fill_n(ExecutionPolicy) terminate on user-thrown exceptions
-
-#include <algorithm>
-
-#include "check_assertion.h"
-#include "test_execution_policies.h"
-#include "test_iterators.h"
-
-#ifndef TEST_HAS_NO_EXCEPTIONS
-struct ThrowOnCopy {
- ThrowOnCopy& operator=(const ThrowOnCopy&) { throw int{}; }
-};
-#endif
-
-int main(int, char**) {
- ThrowOnCopy a[2]{};
- int b[2]{};
-
- test_execution_policies([&](auto&& policy) {
- // std::fill
- EXPECT_STD_TERMINATE([&] { (void)std::fill(policy, std::begin(a), std::end(a), ThrowOnCopy{}); });
- EXPECT_STD_TERMINATE([&] {
- try {
- (void)std::fill(
- policy, util::throw_on_move_iterator(std::begin(b), 1), util::throw_on_move_iterator(std::end(b), 1), 0);
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
-
- // std::fill_n
- EXPECT_STD_TERMINATE([&] { (void)std::fill_n(policy, std::begin(a), std::size(a), ThrowOnCopy{}); });
- EXPECT_STD_TERMINATE([&] {
- try {
- (void)std::fill_n(policy, util::throw_on_move_iterator(std::begin(b), 1), std::size(b), 0);
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
- });
-}
diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/pstl.exception_handling.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/pstl.exception_handling.pass.cpp
deleted file mode 100644
index bb8ab42..0000000
--- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/pstl.exception_handling.pass.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03, c++11, c++14
-// UNSUPPORTED: no-exceptions
-// `check_assertion.h` requires Unix headers and regex support.
-// UNSUPPORTED: !has-unix-headers, no-localization
-
-// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-
-// check that std::move(ExecutionPolicy) terminates on user-thrown exceptions
-
-#include <algorithm>
-
-#include "check_assertion.h"
-#include "test_execution_policies.h"
-#include "test_iterators.h"
-
-int main(int, char**) {
- test_execution_policies([](auto&& policy) {
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- int b[] = {1, 2};
- (void)std::move(policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- util::throw_on_move_iterator(std::begin(b), 1));
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
- });
-}
diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.replace/pstl.exception_handling.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.replace/pstl.exception_handling.pass.cpp
deleted file mode 100644
index c02496b..0000000
--- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.replace/pstl.exception_handling.pass.cpp
+++ /dev/null
@@ -1,118 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03, c++11, c++14
-// UNSUPPORTED: no-exceptions
-// `check_assertion.h` requires Unix headers and regex support.
-// UNSUPPORTED: !has-unix-headers, no-localization
-
-// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-
-// check that std::replace(ExecutionPolicy), std::replace_if(ExecutionPolicy), std::replace_copy(ExecutionPolicy)
-// and std::replace_copy_if(ExecutionPolicy) terminate on user-thrown exceptions
-
-#include <algorithm>
-
-#include "check_assertion.h"
-#include "test_execution_policies.h"
-#include "test_iterators.h"
-
-struct ThrowOnCompare {};
-
-#ifndef TEST_HAS_NO_EXCEPTIONS
-bool operator==(ThrowOnCompare, ThrowOnCompare) { throw int{}; }
-#endif
-
-int main(int, char**) {
- test_execution_policies([&](auto&& policy) {
- // std::replace
- EXPECT_STD_TERMINATE([&] {
- ThrowOnCompare a[2]{};
- (void)std::replace(policy, std::begin(a), std::end(a), ThrowOnCompare{}, ThrowOnCompare{});
- });
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::replace(
- policy, util::throw_on_move_iterator(std::begin(a), 1), util::throw_on_move_iterator(std::end(a), 1), 1, 2);
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
-
- // std::replace_if
- EXPECT_STD_TERMINATE([&] {
- ThrowOnCompare a[2]{};
- (void)std::replace_if(
- policy, std::begin(a), std::end(a), [](ThrowOnCompare&) -> bool { throw int{}; }, ThrowOnCompare{});
- });
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::replace_if(
- policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- [](int) { return true; },
- 2);
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
-
- // std::replace_copy
- EXPECT_STD_TERMINATE([&] {
- ThrowOnCompare a[2]{};
- (void)std::replace_copy(policy, std::begin(a), std::end(a), std::begin(a), ThrowOnCompare{}, ThrowOnCompare{});
- });
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::replace_copy(
- policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- util::throw_on_move_iterator(std::begin(a), 1),
- 1,
- 2);
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
-
- // std::replace_copy_if
- EXPECT_STD_TERMINATE([&] {
- ThrowOnCompare a[2]{};
- (void)std::replace_copy_if(
- policy,
- std::begin(a),
- std::end(a),
- std::begin(a),
- [](ThrowOnCompare& i) { return i == i; },
- ThrowOnCompare{});
- });
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::replace_copy_if(
- policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- util::throw_on_move_iterator(std::begin(a), 1),
- [](int) { return true; },
- 2);
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
- });
-}
diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.rotate/pstl.exception_handling.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.rotate/pstl.exception_handling.pass.cpp
deleted file mode 100644
index 88d177a..0000000
--- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.rotate/pstl.exception_handling.pass.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03, c++11, c++14
-// UNSUPPORTED: no-exceptions
-// `check_assertion.h` requires Unix headers and regex support.
-// UNSUPPORTED: !has-unix-headers, no-localization
-
-// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-
-// check that std::find(ExecutionPolicy), std::find_if(ExecutionPolicy) and std::find_if_not(ExecutionPolicy) terminate
-// on user-thrown exceptions
-
-#include <algorithm>
-
-#include "check_assertion.h"
-#include "test_execution_policies.h"
-#include "test_iterators.h"
-
-int main(int, char**) {
- test_execution_policies([](auto&& policy) {
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- int b[] = {1, 2};
- (void)std::rotate_copy(
- policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- util::throw_on_move_iterator(std::begin(b), 1));
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
- });
-}
diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.transform/pstl.exception_handling.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.transform/pstl.exception_handling.pass.cpp
deleted file mode 100644
index 4392040..0000000
--- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.transform/pstl.exception_handling.pass.cpp
+++ /dev/null
@@ -1,73 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03, c++11, c++14
-// UNSUPPORTED: no-exceptions
-// `check_assertion.h` requires Unix headers and regex support.
-// UNSUPPORTED: !has-unix-headers, no-localization
-
-// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-
-// check that std::transform(ExecutionPolicy) terminates on user-thrown exceptions
-
-#include <algorithm>
-
-#include "check_assertion.h"
-#include "test_execution_policies.h"
-#include "test_iterators.h"
-
-int main(int, char**) {
- test_execution_policies([&](auto&& policy) {
- EXPECT_STD_TERMINATE([&] {
- int a[2]{};
- int b[2]{};
- int c[2]{};
- (void)std::transform(
- policy, std::begin(a), std::end(a), std::begin(b), std::begin(c), [](auto v, auto) -> decltype(v) {
- throw int{};
- });
- });
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::transform(
- policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- util::throw_on_move_iterator(std::begin(a), 1),
- [](int i) { return i; });
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
-
- EXPECT_STD_TERMINATE([&] {
- int a[2]{};
- int b[2]{};
- (void)std::transform(policy, std::begin(a), std::end(a), std::begin(b), [](auto v) -> decltype(v) {
- throw int{};
- });
- });
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::transform(
- policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::begin(a), 1),
- std::plus{});
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
- });
-}
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.all_of/pstl.exception_handling.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.all_of/pstl.exception_handling.pass.cpp
deleted file mode 100644
index d1c031b..0000000
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.all_of/pstl.exception_handling.pass.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03, c++11, c++14
-// UNSUPPORTED: no-exceptions
-// `check_assertion.h` requires Unix headers and regex support.
-// UNSUPPORTED: !has-unix-headers, no-localization
-
-// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-
-// check that std::all_of(ExecutionPolicy) terminates on user-thrown exceptions
-
-#include <algorithm>
-
-#include "check_assertion.h"
-#include "test_execution_policies.h"
-#include "test_iterators.h"
-
-int main(int, char**) {
- test_execution_policies([](auto&& policy) {
- EXPECT_STD_TERMINATE([&] {
- int a[] = {1, 2};
- (void)std::all_of(policy, std::begin(a), std::end(a), [](int i) -> bool { throw i; });
- });
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::all_of(
- policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- [](int) { return true; });
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
- });
-}
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.any_of/pstl.exception_handling.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.any_of/pstl.exception_handling.pass.cpp
deleted file mode 100644
index 58fe79b..0000000
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.any_of/pstl.exception_handling.pass.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03, c++11, c++14
-// UNSUPPORTED: no-exceptions
-// `check_assertion.h` requires Unix headers and regex support.
-// UNSUPPORTED: !has-unix-headers, no-localization
-
-// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-
-// check that std::any_of(ExecutionPolicy) terminates on user-thrown exceptions
-
-#include <algorithm>
-
-#include "check_assertion.h"
-#include "test_execution_policies.h"
-#include "test_iterators.h"
-
-int main(int, char**) {
- test_execution_policies([](auto&& policy) {
- EXPECT_STD_TERMINATE([&] {
- int a[] = {1, 2};
- (void)std::any_of(policy, std::begin(a), std::end(a), [](int i) -> bool { throw i; });
- });
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::any_of(
- policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- [](int) { return true; });
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
- });
-}
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/pstl.exception_handling.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/pstl.exception_handling.pass.cpp
deleted file mode 100644
index 1bcd858..0000000
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/pstl.exception_handling.pass.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03, c++11, c++14
-// UNSUPPORTED: no-exceptions
-// `check_assertion.h` requires Unix headers and regex support.
-// UNSUPPORTED: !has-unix-headers, no-localization
-
-// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-
-// check that std::equal(ExecutionPolicy) terminates on user-thrown exceptions
-
-#include <algorithm>
-
-#include "check_assertion.h"
-#include "test_execution_policies.h"
-#include "test_iterators.h"
-
-int main(int, char**) {
- test_execution_policies([](auto&& policy) {
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- int b[] = {1, 2};
- (void)std::equal(policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- util::throw_on_move_iterator(std::begin(b), 1));
- } catch (const util::iterator_error&) {
- assert(false);
- }
- });
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- int b[] = {1, 2};
- (void)std::equal(
- policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- util::throw_on_move_iterator(std::begin(b), 1),
- util::throw_on_move_iterator(std::end(b), 1));
- } catch (const util::iterator_error&) {
- assert(false);
- }
- });
- });
-}
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/pstl.exception_handling.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/pstl.exception_handling.pass.cpp
deleted file mode 100644
index b0ee4f8..0000000
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/pstl.exception_handling.pass.cpp
+++ /dev/null
@@ -1,87 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03, c++11, c++14
-// UNSUPPORTED: no-exceptions
-// `check_assertion.h` requires Unix headers and regex support.
-// UNSUPPORTED: !has-unix-headers, no-localization
-
-// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-
-// check that std::find(ExecutionPolicy), std::find_if(ExecutionPolicy) and std::find_if_not(ExecutionPolicy) terminate
-// on user-thrown exceptions
-
-#include <algorithm>
-
-#include "check_assertion.h"
-#include "test_execution_policies.h"
-#include "test_iterators.h"
-
-struct ThrowOnCompare {};
-
-#ifndef TEST_HAS_NO_EXCEPTIONS
-bool operator==(ThrowOnCompare, ThrowOnCompare) { throw int{}; }
-#endif
-
-int main(int, char**) {
- test_execution_policies([](auto&& policy) {
- // std::find
- EXPECT_STD_TERMINATE([&] {
- ThrowOnCompare a[2] = {};
- (void)std::find(policy, std::begin(a), std::end(a), ThrowOnCompare{});
- });
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::find(
- policy, util::throw_on_move_iterator(std::begin(a), 1), util::throw_on_move_iterator(std::end(a), 1), 0);
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
-
- // std::find_if
- EXPECT_STD_TERMINATE([&] {
- int a[] = {1, 2};
- (void)std::find_if(policy, std::begin(a), std::end(a), [](int) -> bool { throw int{}; });
- });
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::find_if(
- policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- [](int) { return true; });
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
-
- // std::find_if_not
- EXPECT_STD_TERMINATE([&] {
- int a[] = {1, 2};
- (void)std::find_if_not(policy, std::begin(a), std::end(a), [](int) -> bool { throw int{}; });
- });
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::find_if_not(
- policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- [](int) { return true; });
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
- });
-}
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/pstl.exception_handling.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/pstl.exception_handling.pass.cpp
deleted file mode 100644
index a63276f..0000000
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/pstl.exception_handling.pass.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03, c++11, c++14
-// UNSUPPORTED: no-exceptions
-// `check_assertion.h` requires Unix headers and regex support.
-// UNSUPPORTED: !has-unix-headers, no-localization
-
-// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-
-// check that std::for_each(ExecutionPolicy) and std::for_each_n(ExecutionPolicy) terminate on user-thrown exceptions
-
-#include <algorithm>
-
-#include "check_assertion.h"
-#include "test_execution_policies.h"
-#include "test_iterators.h"
-
-int main(int, char**) {
- test_execution_policies([](auto&& policy) {
- int a[] = {1, 2};
- // std::for_each
- EXPECT_STD_TERMINATE([&] { std::for_each(policy, std::begin(a), std::end(a), [](int) { throw int{}; }); });
- EXPECT_STD_TERMINATE([&] {
- try {
- (void)std::for_each(
- policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- [](int) {});
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
-
- // std::for_each_n
- EXPECT_STD_TERMINATE([&] { std::for_each_n(policy, std::data(a), std::size(a), [](int) { throw int{}; }); });
- EXPECT_STD_TERMINATE([&] {
- try {
- (void)std::for_each_n(policy, util::throw_on_move_iterator(std::begin(a), 1), std::size(a), [](int) {});
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
- });
-}
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.none_of/pstl.exception_handling.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.none_of/pstl.exception_handling.pass.cpp
deleted file mode 100644
index 26e6fea..0000000
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.none_of/pstl.exception_handling.pass.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03, c++11, c++14
-// UNSUPPORTED: no-exceptions
-// `check_assertion.h` requires Unix headers and regex support.
-// UNSUPPORTED: !has-unix-headers, no-localization
-
-// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-
-// check that std::none_of(ExecutionPolicy) terminates on user-thrown exceptions
-
-#include <algorithm>
-
-#include "check_assertion.h"
-#include "test_execution_policies.h"
-#include "test_iterators.h"
-
-int main(int, char**) {
- test_execution_policies([](auto&& policy) {
- EXPECT_STD_TERMINATE([&] {
- int a[] = {1, 2};
- (void)std::none_of(policy, std::begin(a), std::end(a), [](int i) -> bool { throw i; });
- });
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::none_of(
- policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- [](int) { return true; });
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
- });
-}
diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.merge/pstl.exception_handling.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.merge/pstl.exception_handling.pass.cpp
deleted file mode 100644
index b48a5a9..0000000
--- a/libcxx/test/std/algorithms/alg.sorting/alg.merge/pstl.exception_handling.pass.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03, c++11, c++14
-// UNSUPPORTED: no-exceptions
-// `check_assertion.h` requires Unix headers and regex support.
-// UNSUPPORTED: !has-unix-headers, no-localization
-
-// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-
-// check that std::merge(ExecutionPolicy) terminates on user-thrown exceptions
-
-#include <algorithm>
-
-#include "check_assertion.h"
-#include "test_execution_policies.h"
-#include "test_iterators.h"
-
-int main(int, char**) {
- test_execution_policies([](auto&& policy) {
- EXPECT_STD_TERMINATE([&] {
- int a[] = {1, 2};
- std::merge(policy, std::begin(a), std::end(a), std::begin(a), std::end(a), std::begin(a), [](int, int) -> bool {
- throw int{};
- });
- });
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::merge(
- policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- util::throw_on_move_iterator(std::begin(a), 1),
- std::less{});
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
- });
-
- return 0;
-}
diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.sort/stable.sort/pstl.exception_handling.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.sort/stable.sort/pstl.exception_handling.pass.cpp
deleted file mode 100644
index 1dc603c..0000000
--- a/libcxx/test/std/algorithms/alg.sorting/alg.sort/stable.sort/pstl.exception_handling.pass.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03, c++11, c++14
-// UNSUPPORTED: no-exceptions
-// `check_assertion.h` requires Unix headers and regex support.
-// UNSUPPORTED: !has-unix-headers, no-localization
-
-// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-
-// check that std::stable_sort(ExecutionPolicy) terminates on user-thrown exceptions
-
-#include <algorithm>
-
-#include "check_assertion.h"
-#include "test_execution_policies.h"
-#include "test_iterators.h"
-
-int main(int, char**) {
- test_execution_policies([](auto&& policy) {
- EXPECT_STD_TERMINATE([&] {
- int a[] = {1, 2};
- std::stable_sort(policy, std::begin(a), std::end(a), [](int, int) -> bool { throw int{}; });
- });
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::stable_sort(
- policy, util::throw_on_move_iterator(std::begin(a), 1), util::throw_on_move_iterator(std::end(a), 1));
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
- });
-}
diff --git a/libcxx/test/std/algorithms/numeric.ops/reduce/pstl.exception_handling.pass.cpp b/libcxx/test/std/algorithms/numeric.ops/reduce/pstl.exception_handling.pass.cpp
deleted file mode 100644
index d52889b..0000000
--- a/libcxx/test/std/algorithms/numeric.ops/reduce/pstl.exception_handling.pass.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03, c++11, c++14
-// UNSUPPORTED: no-exceptions
-// `check_assertion.h` requires Unix headers and regex support.
-// UNSUPPORTED: !has-unix-headers, no-localization
-
-// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-
-// check that std::reduce(ExecutionPolicy) terminates on user-thrown exceptions
-
-#include <numeric>
-
-#include "check_assertion.h"
-#include "test_execution_policies.h"
-#include "test_iterators.h"
-
-int main(int, char**) {
- test_execution_policies([&](auto&& policy) {
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::reduce(
- policy, util::throw_on_move_iterator(std::begin(a), 1), util::throw_on_move_iterator(std::end(a), 1));
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
-
- EXPECT_STD_TERMINATE([&] {
- int a[2]{};
- (void)std::reduce(policy, std::begin(a), std::end(a), 1, [](int, int) -> int { throw 1; });
- });
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::reduce(
- policy, util::throw_on_move_iterator(std::begin(a), 1), util::throw_on_move_iterator(std::end(a), 1), 1);
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
- });
-}
diff --git a/libcxx/test/std/algorithms/numeric.ops/transform.reduce/pstl.exception_handling.pass.cpp b/libcxx/test/std/algorithms/numeric.ops/transform.reduce/pstl.exception_handling.pass.cpp
deleted file mode 100644
index 5ac0433..0000000
--- a/libcxx/test/std/algorithms/numeric.ops/transform.reduce/pstl.exception_handling.pass.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03, c++11, c++14
-// UNSUPPORTED: no-exceptions
-// `check_assertion.h` requires Unix headers and regex support.
-// UNSUPPORTED: !has-unix-headers, no-localization
-
-// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-
-// check that std::reduce(ExecutionPolicy) terminates on user-thrown exceptions
-
-#include <numeric>
-
-#include "check_assertion.h"
-#include "test_execution_policies.h"
-#include "test_iterators.h"
-
-int main(int, char**) {
- test_execution_policies([&](auto&& policy) {
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::transform_reduce(
- policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- util::throw_on_move_iterator(std::begin(a), 1),
- 1);
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
-
- EXPECT_STD_TERMINATE([&] {
- int a[2]{};
- (void)std::transform_reduce(
- policy, std::begin(a), std::end(a), 1, [](int, int) -> int { throw 1; }, [](int) -> int { return 0; });
- });
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::transform_reduce(
- policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- 1,
- std::plus{},
- [](int) -> int { return 0; });
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
- });
-}
diff --git a/libcxx/test/std/algorithms/pstl.exception_handling.pass.cpp b/libcxx/test/std/algorithms/pstl.exception_handling.pass.cpp
new file mode 100644
index 0000000..bedb225
--- /dev/null
+++ b/libcxx/test/std/algorithms/pstl.exception_handling.pass.cpp
@@ -0,0 +1,339 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14
+// UNSUPPORTED: no-exceptions
+// `check_assertion.h` requires Unix headers and regex support.
+// UNSUPPORTED: !has-unix-headers, no-localization
+
+// UNSUPPORTED: libcpp-has-no-incomplete-pstl
+
+// <algorithm>
+// <numeric>
+//
+// Check that PSTL algorithms terminate on user-thrown exceptions.
+
+#include <algorithm>
+#include <numeric>
+
+#include "check_assertion.h"
+#include "test_execution_policies.h"
+#include "test_iterators.h"
+
+template <class F>
+void assert_non_throwing(F f) {
+ // We wrap this whole test in EXPECT_STD_TERMINATE because if f() terminates, we want the test to pass,
+ // since this signals proper handling of user exceptions in the PSTL.
+ EXPECT_STD_TERMINATE([&] {
+ bool threw = false;
+ try {
+ f();
+ } catch (...) {
+ threw = true;
+ }
+ // If nothing was thrown, call std::terminate() to pass the EXPECT_STD_TERMINATE assertion.
+ // Otherwise, don't call std::terminate() to fail the assertion.
+ if (!threw)
+ std::terminate();
+ });
+}
+
+struct ThrowToken {
+ void activate() { active_ = true; }
+ void deactivate() { active_ = false; }
+ bool active() const { return active_; }
+
+private:
+ bool active_{false};
+};
+
+template <class Func>
+struct on_scope_exit {
+ explicit on_scope_exit(Func func) : func_(func) {}
+ ~on_scope_exit() { func_(); }
+
+private:
+ Func func_;
+};
+template <class Func>
+on_scope_exit(Func) -> on_scope_exit<Func>;
+
+int main(int, char**) {
+ test_execution_policies([&](auto&& policy) {
+ int a[] = {1, 2, 3, 4};
+ int b[] = {1, 2, 3};
+ int n = 2;
+ int storage[999];
+ int val = 99;
+ int init = 1;
+
+ // We generate a certain number of "tokens" and we activate exactly one on each iteration. We then
+ // throw in a given operation only when that token is active. That way we check that each argument
+ // of the algorithm is handled properly.
+ ThrowToken tokens[7];
+ for (ThrowToken& t : tokens) {
+ t.activate();
+ on_scope_exit _([&] { t.deactivate(); });
+
+ auto first1 = util::throw_on_move_iterator(std::begin(a), tokens[0].active() ? 1 : -1);
+ auto last1 = util::throw_on_move_iterator(std::end(a), tokens[1].active() ? 1 : -1);
+ auto first2 = util::throw_on_move_iterator(std::begin(b), tokens[2].active() ? 1 : -1);
+ auto last2 = util::throw_on_move_iterator(std::end(b), tokens[3].active() ? 1 : -1);
+ auto dest = util::throw_on_move_iterator(std::end(storage), tokens[4].active() ? 1 : -1);
+ auto maybe_throw = [](ThrowToken const& token, auto f) {
+ return [&token, f](auto... args) {
+ if (token.active())
+ throw 1;
+ return f(args...);
+ };
+ };
+
+ {
+ auto pred = maybe_throw(tokens[5], [](int x) -> bool { return x % 2 == 0; });
+
+ // all_of(first, last, pred)
+ assert_non_throwing([=, &policy] { (void)std::all_of(policy, std::move(first1), std::move(last1), pred); });
+
+ // any_of(first, last, pred)
+ assert_non_throwing([=, &policy] { (void)std::any_of(policy, std::move(first1), std::move(last1), pred); });
+
+ // none_of(first, last, pred)
+ assert_non_throwing([=, &policy] { (void)std::none_of(policy, std::move(first1), std::move(last1), pred); });
+ }
+
+ {
+ // copy(first, last, dest)
+ assert_non_throwing([=, &policy] {
+ (void)std::copy(policy, std::move(first1), std::move(last1), std::move(dest));
+ });
+
+ // copy_n(first, n, dest)
+ assert_non_throwing([=, &policy] { (void)std::copy_n(policy, std::move(first1), n, std::move(dest)); });
+ }
+
+ {
+ auto pred = maybe_throw(tokens[5], [](int x) -> bool { return x % 2 == 0; });
+
+ // count(first, last, val)
+ assert_non_throwing([=, &policy] { (void)std::count(policy, std::move(first1), std::move(last1), val); });
+
+ // count_if(first, last, pred)
+ assert_non_throwing([=, &policy] { (void)std::count_if(policy, std::move(first1), std::move(last1), pred); });
+ }
+
+ {
+ auto binary_pred = maybe_throw(tokens[5], [](int x, int y) -> bool { return x == y; });
+
+ // equal(first1, last1, first2)
+ assert_non_throwing([=, &policy] {
+ (void)std::equal(policy, std::move(first1), std::move(last1), std::move(first2));
+ });
+
+ // equal(first1, last1, first2, binary_pred)
+ assert_non_throwing([=, &policy] {
+ (void)std::equal(policy, std::move(first1), std::move(last1), std::move(first2), binary_pred);
+ });
+
+ // equal(first1, last1, first2, last2)
+ assert_non_throwing([=, &policy] {
+ (void)std::equal(policy, std::move(first1), std::move(last1), std::move(first2), std::move(last2));
+ });
+
+ // equal(first1, last1, first2, last2, binary_pred)
+ assert_non_throwing([=, &policy] {
+ (void)std::equal(
+ policy, std::move(first1), std::move(last1), std::move(first2), std::move(last2), binary_pred);
+ });
+ }
+
+ {
+ // fill(first, last, val)
+ assert_non_throwing([=, &policy] { (void)std::fill(policy, std::move(first1), std::move(last1), val); });
+
+ // fill_n(first, n, val)
+ assert_non_throwing([=, &policy] { (void)std::fill_n(policy, std::move(first1), n, val); });
+ }
+
+ {
+ auto pred = maybe_throw(tokens[5], [](int x) -> bool { return x % 2 == 0; });
+
+ // find(first, last, val)
+ assert_non_throwing([=, &policy] { (void)std::find(policy, std::move(first1), std::move(last1), val); });
+
+ // find_if(first, last, pred)
+ assert_non_throwing([=, &policy] { (void)std::find_if(policy, std::move(first1), std::move(last1), pred); });
+
+ // find_if_not(first, last, pred)
+ assert_non_throwing([=, &policy] {
+ (void)std::find_if_not(policy, std::move(first1), std::move(last1), pred);
+ });
+ }
+
+ {
+ auto func = maybe_throw(tokens[5], [](int) {});
+
+ // for_each(first, last, func)
+ assert_non_throwing([=, &policy] { (void)std::for_each(policy, std::move(first1), std::move(last1), func); });
+
+ // for_each_n(first, n, func)
+ assert_non_throwing([=, &policy] { (void)std::for_each_n(policy, std::move(first1), n, func); });
+ }
+
+ {
+ auto gen = maybe_throw(tokens[5], []() -> int { return 42; });
+
+ // generate(first, last, func)
+ assert_non_throwing([=, &policy] { (void)std::generate(policy, std::move(first1), std::move(last1), gen); });
+
+ // generate_n(first, n, func)
+ assert_non_throwing([=, &policy] { (void)std::generate_n(policy, std::move(first1), n, gen); });
+ }
+
+ {
+ auto pred = maybe_throw(tokens[5], [](int x) -> bool { return x % 2 == 0; });
+
+ // is_partitioned(first, last, pred)
+ assert_non_throwing([=, &policy] {
+ (void)std::is_partitioned(policy, std::move(first1), std::move(last1), pred);
+ });
+ }
+
+ {
+ auto compare = maybe_throw(tokens[5], [](int x, int y) -> bool { return x < y; });
+
+ // merge(first1, last1, first2, last2, dest)
+ assert_non_throwing([=, &policy] {
+ (void)std::merge(
+ policy, std::move(first1), std::move(last1), std::move(first2), std::move(last2), std::move(dest));
+ });
+
+ // merge(first1, last1, first2, last2, dest, comp)
+ assert_non_throwing([=, &policy] {
+ (void)std::merge(
+ policy,
+ std::move(first1),
+ std::move(last1),
+ std::move(first2),
+ std::move(last2),
+ std::move(dest),
+ compare);
+ });
+ }
+
+ {
+ // move(first, last, dest)
+ assert_non_throwing([=, &policy] {
+ (void)std::move(policy, std::move(first1), std::move(last1), std::move(dest));
+ });
+ }
+
+ {
+ auto pred = maybe_throw(tokens[5], [](int x) -> bool { return x % 2 == 0; });
+
+ // replace_if(first, last, pred, val)
+ assert_non_throwing([=, &policy] {
+ (void)std::replace_if(policy, std::move(first1), std::move(last1), pred, val);
+ });
+
+ // replace(first, last, val1, val2)
+ assert_non_throwing([=, &policy] {
+ (void)std::replace(policy, std::move(first1), std::move(last1), val, val);
+ });
+
+ // replace_copy_if(first, last, dest, pred, val)
+ assert_non_throwing([=, &policy] {
+ (void)std::replace_copy_if(policy, std::move(first1), std::move(last1), std::move(dest), pred, val);
+ });
+
+ // replace_copy(first, last, dest, val1, val2)
+ assert_non_throwing([=, &policy] {
+ (void)std::replace_copy(policy, std::move(first1), std::move(last1), std::move(dest), val, val);
+ });
+ }
+
+ {
+ auto mid1 = util::throw_on_move_iterator(std::begin(a) + 2, tokens[5].active() ? 1 : -1);
+
+ // rotate_copy(first, mid, last, dest)
+ assert_non_throwing([=, &policy] {
+ (void)std::rotate_copy(policy, std::move(first1), std::move(mid1), std::move(last1), std::move(dest));
+ });
+ }
+
+ {
+ auto compare = maybe_throw(tokens[5], [](int x, int y) -> bool { return x < y; });
+
+ // sort(first, last)
+ assert_non_throwing([=, &policy] { (void)std::sort(policy, std::move(first1), std::move(last1)); });
+
+ // sort(first, last, comp)
+ assert_non_throwing([=, &policy] { (void)std::sort(policy, std::move(first1), std::move(last1), compare); });
+
+ // stable_sort(first, last)
+ assert_non_throwing([=, &policy] { (void)std::stable_sort(policy, std::move(first1), std::move(last1)); });
+
+ // stable_sort(first, last, comp)
+ assert_non_throwing([=, &policy] {
+ (void)std::stable_sort(policy, std::move(first1), std::move(last1), compare);
+ });
+ }
+
+ {
+ auto unary = maybe_throw(tokens[5], [](int x) -> int { return x * 2; });
+ auto binary = maybe_throw(tokens[5], [](int x, int y) -> int { return x * y; });
+
+ // transform(first, last, dest, func)
+ assert_non_throwing([=, &policy] {
+ (void)std::transform(policy, std::move(first1), std::move(last1), std::move(dest), unary);
+ });
+
+ // transform(first1, last1, first2, dest, func)
+ assert_non_throwing([=, &policy] {
+ (void)std::transform(policy, std::move(first1), std::move(last1), std::move(first2), std::move(dest), binary);
+ });
+ }
+
+ {
+ auto reduction = maybe_throw(tokens[5], [](int x, int y) -> int { return x + y; });
+ auto transform_unary = maybe_throw(tokens[6], [](int x) -> int { return x * 2; });
+ auto transform_binary = maybe_throw(tokens[6], [](int x, int y) -> int { return x * y; });
+
+ // transform_reduce(first1, last1, first2, init)
+ assert_non_throwing([=, &policy] {
+ (void)std::transform_reduce(policy, std::move(first1), std::move(last1), std::move(first2), init);
+ });
+
+ // transform_reduce(first1, last1, init, reduce, transform)
+ assert_non_throwing([=, &policy] {
+ (void)std::transform_reduce(policy, std::move(first1), std::move(last1), init, reduction, transform_unary);
+ });
+
+ // transform_reduce(first1, last1, first2, init, reduce, transform)
+ assert_non_throwing([=, &policy] {
+ (void)std::transform_reduce(
+ policy, std::move(first1), std::move(last1), std::move(first2), init, reduction, transform_binary);
+ });
+ }
+
+ {
+ auto reduction = maybe_throw(tokens[5], [](int x, int y) -> int { return x + y; });
+
+ // reduce(first, last)
+ assert_non_throwing([=, &policy] { (void)std::reduce(policy, std::move(first1), std::move(last1)); });
+
+ // reduce(first, last, init)
+ assert_non_throwing([=, &policy] { (void)std::reduce(policy, std::move(first1), std::move(last1), init); });
+
+ // reduce(first, last, init, binop)
+ assert_non_throwing([=, &policy] {
+ (void)std::reduce(policy, std::move(first1), std::move(last1), init, reduction);
+ });
+ }
+ }
+ });
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/assign.pass.cpp b/libcxx/test/std/atomics/atomics.ref/assign.pass.cpp
new file mode 100644
index 0000000..3887211
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/assign.pass.cpp
@@ -0,0 +1,50 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+// XFAIL: !has-1024-bit-atomics
+
+// T operator=(T) const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_helper.h"
+#include "test_macros.h"
+
+template <typename T>
+struct TestAssign {
+ void operator()() const {
+ {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ std::same_as<T> decltype(auto) y = (a = T(2));
+ assert(y == T(2));
+ assert(x == T(2));
+
+ ASSERT_NOEXCEPT(a = T(0));
+ static_assert(std::is_nothrow_assignable_v<std::atomic_ref<T>, T>);
+
+ static_assert(!std::is_copy_assignable_v<std::atomic_ref<T>>);
+ }
+
+ {
+ auto assign = [](std::atomic_ref<T> const& y, T, T new_val) { y = new_val; };
+ auto load = [](std::atomic_ref<T> const& y) { return y.load(); };
+ test_seq_cst<T>(assign, load);
+ }
+ }
+};
+
+int main(int, char**) {
+ TestEachAtomicType<TestAssign>()();
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/bitwise_and_assign.pass.cpp b/libcxx/test/std/atomics/atomics.ref/bitwise_and_assign.pass.cpp
new file mode 100644
index 0000000..2be1e99
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/bitwise_and_assign.pass.cpp
@@ -0,0 +1,60 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+
+// integral-type operator&=(integral-type) const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <concepts>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_macros.h"
+
+template <typename T>
+concept has_bitwise_and_assign = requires { std::declval<T const>() &= std::declval<T>(); };
+
+template <typename T>
+struct TestDoesNotHaveBitwiseAndAssign {
+ void operator()() const { static_assert(!has_bitwise_and_assign<std::atomic_ref<T>>); }
+};
+
+template <typename T>
+struct TestBitwiseAndAssign {
+ void operator()() const {
+ static_assert(std::is_integral_v<T>);
+
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ std::same_as<T> decltype(auto) y = (a &= T(1));
+ assert(y == T(1));
+ assert(x == T(1));
+ ASSERT_NOEXCEPT(a &= T(0));
+
+ y = (a &= T(2));
+ assert(y == T(0));
+ assert(x == T(0));
+ }
+};
+
+int main(int, char**) {
+ TestEachIntegralType<TestBitwiseAndAssign>()();
+
+ TestEachFloatingPointType<TestDoesNotHaveBitwiseAndAssign>()();
+
+ TestEachPointerType<TestDoesNotHaveBitwiseAndAssign>()();
+
+ TestDoesNotHaveBitwiseAndAssign<bool>()();
+ TestDoesNotHaveBitwiseAndAssign<UserAtomicType>()();
+ TestDoesNotHaveBitwiseAndAssign<LargeUserAtomicType>()();
+
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/bitwise_or_assign.pass.cpp b/libcxx/test/std/atomics/atomics.ref/bitwise_or_assign.pass.cpp
new file mode 100644
index 0000000..5c22c8a2
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/bitwise_or_assign.pass.cpp
@@ -0,0 +1,56 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+
+// integral-type operator|=(integral-type) const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <concepts>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_macros.h"
+
+template <typename T>
+concept has_bitwise_or_assign = requires { std::declval<T const>() |= std::declval<T>(); };
+
+template <typename T>
+struct TestDoesNotHaveBitwiseOrAssign {
+ void operator()() const { static_assert(!has_bitwise_or_assign<std::atomic_ref<T>>); }
+};
+
+template <typename T>
+struct TestBitwiseOrAssign {
+ void operator()() const {
+ static_assert(std::is_integral_v<T>);
+
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ std::same_as<T> decltype(auto) y = (a |= T(2));
+ assert(y == T(3));
+ assert(x == T(3));
+ ASSERT_NOEXCEPT(a |= T(0));
+ }
+};
+
+int main(int, char**) {
+ TestEachIntegralType<TestBitwiseOrAssign>()();
+
+ TestEachFloatingPointType<TestDoesNotHaveBitwiseOrAssign>()();
+
+ TestEachPointerType<TestDoesNotHaveBitwiseOrAssign>()();
+
+ TestDoesNotHaveBitwiseOrAssign<bool>()();
+ TestDoesNotHaveBitwiseOrAssign<UserAtomicType>()();
+ TestDoesNotHaveBitwiseOrAssign<LargeUserAtomicType>()();
+
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/bitwise_xor_assign.pass.cpp b/libcxx/test/std/atomics/atomics.ref/bitwise_xor_assign.pass.cpp
new file mode 100644
index 0000000..4dc4fd3
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/bitwise_xor_assign.pass.cpp
@@ -0,0 +1,56 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+
+// integral-type operator^=(integral-type) const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <concepts>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_macros.h"
+
+template <typename T>
+concept has_bitwise_xor_assign = requires { std::declval<T const>() ^= std::declval<T>(); };
+
+template <typename T>
+struct TestDoesNotHaveBitwiseXorAssign {
+ void operator()() const { static_assert(!has_bitwise_xor_assign<std::atomic_ref<T>>); }
+};
+
+template <typename T>
+struct TestBitwiseXorAssign {
+ void operator()() const {
+ static_assert(std::is_integral_v<T>);
+
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ std::same_as<T> decltype(auto) y = (a ^= T(2));
+ assert(y == T(3));
+ assert(x == T(3));
+ ASSERT_NOEXCEPT(a ^= T(0));
+ }
+};
+
+int main(int, char**) {
+ TestEachIntegralType<TestBitwiseXorAssign>()();
+
+ TestEachFloatingPointType<TestDoesNotHaveBitwiseXorAssign>()();
+
+ TestEachPointerType<TestDoesNotHaveBitwiseXorAssign>()();
+
+ TestDoesNotHaveBitwiseXorAssign<bool>()();
+ TestDoesNotHaveBitwiseXorAssign<UserAtomicType>()();
+ TestDoesNotHaveBitwiseXorAssign<LargeUserAtomicType>()();
+
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/compare_exchange_strong.pass.cpp b/libcxx/test/std/atomics/atomics.ref/compare_exchange_strong.pass.cpp
new file mode 100644
index 0000000..72b2f44
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/compare_exchange_strong.pass.cpp
@@ -0,0 +1,221 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+// XFAIL: !has-1024-bit-atomics
+
+// bool compare_exchange_strong(T&, T, memory_order, memory_order) const noexcept;
+// bool compare_exchange_strong(T&, T, memory_order = memory_order::seq_cst) const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <concepts>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_helper.h"
+#include "test_macros.h"
+
+template <typename T>
+struct TestCompareExchangeStrong {
+ void operator()() const {
+ {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ T t(T(1));
+ std::same_as<bool> decltype(auto) y = a.compare_exchange_strong(t, T(2));
+ assert(y == true);
+ assert(a == T(2));
+ assert(t == T(1));
+ y = a.compare_exchange_strong(t, T(3));
+ assert(y == false);
+ assert(a == T(2));
+ assert(t == T(2));
+
+ ASSERT_NOEXCEPT(a.compare_exchange_strong(t, T(2)));
+ }
+ {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ T t(T(1));
+ std::same_as<bool> decltype(auto) y = a.compare_exchange_strong(t, T(2), std::memory_order_seq_cst);
+ assert(y == true);
+ assert(a == T(2));
+ assert(t == T(1));
+ y = a.compare_exchange_strong(t, T(3), std::memory_order_seq_cst);
+ assert(y == false);
+ assert(a == T(2));
+ assert(t == T(2));
+
+ ASSERT_NOEXCEPT(a.compare_exchange_strong(t, T(2), std::memory_order_seq_cst));
+ }
+ {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ T t(T(1));
+ std::same_as<bool> decltype(auto) y =
+ a.compare_exchange_strong(t, T(2), std::memory_order_release, std::memory_order_relaxed);
+ assert(y == true);
+ assert(a == T(2));
+ assert(t == T(1));
+ y = a.compare_exchange_strong(t, T(3), std::memory_order_release, std::memory_order_relaxed);
+ assert(y == false);
+ assert(a == T(2));
+ assert(t == T(2));
+
+ ASSERT_NOEXCEPT(a.compare_exchange_strong(t, T(2), std::memory_order_release, std::memory_order_relaxed));
+ }
+
+ // success memory_order::release
+ {
+ auto store = [](std::atomic_ref<T> const& x, T old_val, T new_val) {
+ auto r = x.compare_exchange_strong(old_val, new_val, std::memory_order::release, std::memory_order::relaxed);
+ assert(r);
+ };
+
+ auto load = [](std::atomic_ref<T> const& x) { return x.load(std::memory_order::acquire); };
+ test_acquire_release<T>(store, load);
+
+ auto store_one_arg = [](std::atomic_ref<T> const& x, T old_val, T new_val) {
+ auto r = x.compare_exchange_strong(old_val, new_val, std::memory_order::release);
+ assert(r);
+ };
+ test_acquire_release<T>(store_one_arg, load);
+ }
+
+ // success memory_order::acquire
+ {
+ auto store = [](std::atomic_ref<T> const& x, T, T new_val) { x.store(new_val, std::memory_order::release); };
+
+ auto load = [](std::atomic_ref<T> const& x) {
+ auto val = x.load(std::memory_order::relaxed);
+ while (!x.compare_exchange_strong(val, val, std::memory_order::acquire, std::memory_order::relaxed)) {
+ }
+ return val;
+ };
+ test_acquire_release<T>(store, load);
+
+ auto load_one_arg = [](std::atomic_ref<T> const& x) {
+ auto val = x.load(std::memory_order::relaxed);
+ while (!x.compare_exchange_strong(val, val, std::memory_order::acquire)) {
+ }
+ return val;
+ };
+ test_acquire_release<T>(store, load_one_arg);
+ }
+
+ // success memory_order::acq_rel
+ {
+ auto store = [](std::atomic_ref<T> const& x, T old_val, T new_val) {
+ auto r = x.compare_exchange_strong(old_val, new_val, std::memory_order::acq_rel, std::memory_order::relaxed);
+ assert(r);
+ };
+ auto load = [](std::atomic_ref<T> const& x) {
+ auto val = x.load(std::memory_order::relaxed);
+ while (!x.compare_exchange_strong(val, val, std::memory_order::acq_rel, std::memory_order::relaxed)) {
+ }
+ return val;
+ };
+ test_acquire_release<T>(store, load);
+
+ auto store_one_arg = [](std::atomic_ref<T> const& x, T old_val, T new_val) {
+ auto r = x.compare_exchange_strong(old_val, new_val, std::memory_order::acq_rel);
+ assert(r);
+ };
+ auto load_one_arg = [](std::atomic_ref<T> const& x) {
+ auto val = x.load(std::memory_order::relaxed);
+ while (!x.compare_exchange_strong(val, val, std::memory_order::acq_rel)) {
+ }
+ return val;
+ };
+ test_acquire_release<T>(store_one_arg, load_one_arg);
+ }
+
+ // success memory_order::seq_cst
+ {
+ auto store = [](std::atomic_ref<T> const& x, T old_val, T new_val) {
+ auto r = x.compare_exchange_strong(old_val, new_val, std::memory_order::seq_cst, std::memory_order::relaxed);
+ assert(r);
+ };
+ auto load = [](std::atomic_ref<T> const& x) {
+ auto val = x.load(std::memory_order::relaxed);
+ while (!x.compare_exchange_strong(val, val, std::memory_order::seq_cst, std::memory_order::relaxed)) {
+ }
+ return val;
+ };
+ test_seq_cst<T>(store, load);
+
+ auto store_one_arg = [](std::atomic_ref<T> const& x, T old_val, T new_val) {
+ auto r = x.compare_exchange_strong(old_val, new_val, std::memory_order::seq_cst);
+ assert(r);
+ };
+ auto load_one_arg = [](std::atomic_ref<T> const& x) {
+ auto val = x.load(std::memory_order::relaxed);
+ while (!x.compare_exchange_strong(val, val, std::memory_order::seq_cst)) {
+ }
+ return val;
+ };
+ test_seq_cst<T>(store_one_arg, load_one_arg);
+ }
+
+ // failure memory_order::acquire
+ {
+ auto store = [](std::atomic_ref<T> const& x, T, T new_val) { x.store(new_val, std::memory_order::release); };
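+ // T(255) is a value the object is not expected to hold here, so the exchange always
+ // fails and only the failure memory ordering is exercised.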
+ auto load = [](std::atomic_ref<T> const& x) {
+ auto result = x.load(std::memory_order::relaxed);
+ T unexpected(T(255));
+ bool r =
+ x.compare_exchange_strong(unexpected, unexpected, std::memory_order::relaxed, std::memory_order::acquire);
+ assert(!r);
+ return result;
+ };
+ test_acquire_release<T>(store, load);
+
+ auto load_one_arg = [](std::atomic_ref<T> const& x) {
+ auto result = x.load(std::memory_order::relaxed);
+ T unexpected(T(255));
+ bool r = x.compare_exchange_strong(unexpected, unexpected, std::memory_order::acquire);
+ assert(!r);
+ return result;
+ };
+ test_acquire_release<T>(store, load_one_arg);
+
+ // the single-order overload maps a success order of acq_rel to a failure order of acquire
+ auto load_one_arg_acq_rel = [](std::atomic_ref<T> const& x) {
+ auto result = x.load(std::memory_order::relaxed);
+ T unexpected(T(255));
+ bool r = x.compare_exchange_strong(unexpected, unexpected, std::memory_order::acq_rel);
+ assert(!r);
+ return result;
+ };
+ test_acquire_release<T>(store, load_one_arg_acq_rel);
+ }
+
+ // failure memory_order::seq_cst
+ {
+ auto store = [](std::atomic_ref<T> const& x, T, T new_val) { x.store(new_val, std::memory_order::seq_cst); };
+ auto load = [](std::atomic_ref<T> const& x) {
+ auto result = x.load(std::memory_order::relaxed);
+ T unexpected(T(255));
+ bool r =
+ x.compare_exchange_strong(unexpected, unexpected, std::memory_order::relaxed, std::memory_order::seq_cst);
+ assert(!r);
+ return result;
+ };
+ test_seq_cst<T>(store, load);
+ }
+ }
+};
+
+int main(int, char**) {
+ TestEachAtomicType<TestCompareExchangeStrong>()();
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/compare_exchange_weak.pass.cpp b/libcxx/test/std/atomics/atomics.ref/compare_exchange_weak.pass.cpp
new file mode 100644
index 0000000..5219a8e
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/compare_exchange_weak.pass.cpp
@@ -0,0 +1,226 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+// XFAIL: !has-1024-bit-atomics
+
+// bool compare_exchange_weak(T&, T, memory_order, memory_order) const noexcept;
+// bool compare_exchange_weak(T&, T, memory_order = memory_order::seq_cst) const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <concepts>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_helper.h"
+#include "test_macros.h"
+
+template <typename T>
+struct TestCompareExchangeWeak {
+ void operator()() const {
+ {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ T t(T(1));
+ std::same_as<bool> decltype(auto) y = a.compare_exchange_weak(t, T(2));
+ assert(y == true);
+ assert(a == T(2));
+ assert(t == T(1));
+ y = a.compare_exchange_weak(t, T(3));
+ assert(y == false);
+ assert(a == T(2));
+ assert(t == T(2));
+
+ ASSERT_NOEXCEPT(a.compare_exchange_weak(t, T(2)));
+ }
+ {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ T t(T(1));
+ std::same_as<bool> decltype(auto) y = a.compare_exchange_weak(t, T(2), std::memory_order_seq_cst);
+ assert(y == true);
+ assert(a == T(2));
+ assert(t == T(1));
+ y = a.compare_exchange_weak(t, T(3), std::memory_order_seq_cst);
+ assert(y == false);
+ assert(a == T(2));
+ assert(t == T(2));
+
+ ASSERT_NOEXCEPT(a.compare_exchange_weak(t, T(2), std::memory_order_seq_cst));
+ }
+ {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ T t(T(1));
+ std::same_as<bool> decltype(auto) y =
+ a.compare_exchange_weak(t, T(2), std::memory_order_release, std::memory_order_relaxed);
+ assert(y == true);
+ assert(a == T(2));
+ assert(t == T(1));
+ y = a.compare_exchange_weak(t, T(3), std::memory_order_release, std::memory_order_relaxed);
+ assert(y == false);
+ assert(a == T(2));
+ assert(t == T(2));
+
+ ASSERT_NOEXCEPT(a.compare_exchange_weak(t, T(2), std::memory_order_release, std::memory_order_relaxed));
+ }
+
+ // success memory_order::release
+ {
+ auto store = [](std::atomic_ref<T> const& x, T old_val, T new_val) {
+ // could fail spuriously, so put it in a loop
+ while (!x.compare_exchange_weak(old_val, new_val, std::memory_order::release, std::memory_order::relaxed)) {
+ }
+ };
+
+ auto load = [](std::atomic_ref<T> const& x) { return x.load(std::memory_order::acquire); };
+ test_acquire_release<T>(store, load);
+
+ auto store_one_arg = [](std::atomic_ref<T> const& x, T old_val, T new_val) {
+ // could fail spuriously, so put it in a loop
+ while (!x.compare_exchange_weak(old_val, new_val, std::memory_order::release)) {
+ }
+ };
+ test_acquire_release<T>(store_one_arg, load);
+ }
+
+ // success memory_order::acquire
+ {
+ auto store = [](std::atomic_ref<T> const& x, T, T new_val) { x.store(new_val, std::memory_order::release); };
+ auto load = [](std::atomic_ref<T> const& x) {
+ auto val = x.load(std::memory_order::relaxed);
+ while (!x.compare_exchange_weak(val, val, std::memory_order::acquire, std::memory_order::relaxed)) {
+ }
+ return val;
+ };
+ test_acquire_release<T>(store, load);
+
+ auto load_one_arg = [](std::atomic_ref<T> const& x) {
+ auto val = x.load(std::memory_order::relaxed);
+ while (!x.compare_exchange_weak(val, val, std::memory_order::acquire)) {
+ }
+ return val;
+ };
+ test_acquire_release<T>(store, load_one_arg);
+ }
+
+ // success memory_order::acq_rel
+ {
+ auto store = [](std::atomic_ref<T> const& x, T old_val, T new_val) {
+ // could fail spuriously, so put it in a loop
+ while (!x.compare_exchange_weak(old_val, new_val, std::memory_order::acq_rel, std::memory_order::relaxed)) {
+ }
+ };
+ auto load = [](std::atomic_ref<T> const& x) {
+ auto val = x.load(std::memory_order::relaxed);
+ while (!x.compare_exchange_weak(val, val, std::memory_order::acq_rel, std::memory_order::relaxed)) {
+ }
+ return val;
+ };
+ test_acquire_release<T>(store, load);
+
+ auto store_one_arg = [](std::atomic_ref<T> const& x, T old_val, T new_val) {
+ // could fail spuriously, so put it in a loop
+ while (!x.compare_exchange_weak(old_val, new_val, std::memory_order::acq_rel)) {
+ }
+ };
+ auto load_one_arg = [](std::atomic_ref<T> const& x) {
+ auto val = x.load(std::memory_order::relaxed);
+ while (!x.compare_exchange_weak(val, val, std::memory_order::acq_rel)) {
+ }
+ return val;
+ };
+ test_acquire_release<T>(store_one_arg, load_one_arg);
+ }
+
+ // success memory_order::seq_cst
+ {
+ auto store = [](std::atomic_ref<T> const& x, T old_val, T new_val) {
+ // could fail spuriously, so put it in a loop
+ while (!x.compare_exchange_weak(old_val, new_val, std::memory_order::seq_cst, std::memory_order::relaxed)) {
+ }
+ };
+ auto load = [](std::atomic_ref<T> const& x) {
+ auto val = x.load(std::memory_order::relaxed);
+ while (!x.compare_exchange_weak(val, val, std::memory_order::seq_cst, std::memory_order::relaxed)) {
+ }
+ return val;
+ };
+ test_seq_cst<T>(store, load);
+
+ auto store_one_arg = [](std::atomic_ref<T> const& x, T old_val, T new_val) {
+ // could fail spuriously, so put it in a loop
+ while (!x.compare_exchange_weak(old_val, new_val, std::memory_order::seq_cst)) {
+ }
+ };
+ auto load_one_arg = [](std::atomic_ref<T> const& x) {
+ auto val = x.load(std::memory_order::relaxed);
+ while (!x.compare_exchange_weak(val, val, std::memory_order::seq_cst)) {
+ }
+ return val;
+ };
+ test_seq_cst<T>(store_one_arg, load_one_arg);
+ }
+
+ // failure memory_order::acquire
+ {
+ auto store = [](std::atomic_ref<T> const& x, T, T new_val) { x.store(new_val, std::memory_order::release); };
+ auto load = [](std::atomic_ref<T> const& x) {
+ auto result = x.load(std::memory_order::relaxed);
+ T unexpected(T(255));
+ bool r =
+ x.compare_exchange_weak(unexpected, unexpected, std::memory_order::relaxed, std::memory_order::acquire);
+ assert(!r);
+ return result;
+ };
+ test_acquire_release<T>(store, load);
+
+ auto load_one_arg = [](std::atomic_ref<T> const& x) {
+ auto result = x.load(std::memory_order::relaxed);
+ T unexpected(T(255));
+ bool r = x.compare_exchange_weak(unexpected, unexpected, std::memory_order::acquire);
+ assert(!r);
+ return result;
+ };
+ test_acquire_release<T>(store, load_one_arg);
+
+ // the single-order overload maps a success order of acq_rel to a failure order of acquire
+ auto load_one_arg_acq_rel = [](std::atomic_ref<T> const& x) {
+ auto result = x.load(std::memory_order::relaxed);
+ T unexpected(T(255));
+ bool r = x.compare_exchange_weak(unexpected, unexpected, std::memory_order::acq_rel);
+ assert(!r);
+ return result;
+ };
+ test_acquire_release<T>(store, load_one_arg_acq_rel);
+ }
+
+ // failure memory_order::seq_cst
+ {
+ auto store = [](std::atomic_ref<T> const& x, T, T new_val) { x.store(new_val, std::memory_order::seq_cst); };
+ auto load = [](std::atomic_ref<T> const& x) {
+ auto result = x.load(std::memory_order::relaxed);
+ T unexpected(T(255));
+ bool r =
+ x.compare_exchange_weak(unexpected, unexpected, std::memory_order::relaxed, std::memory_order::seq_cst);
+ assert(!r);
+ return result;
+ };
+ test_seq_cst<T>(store, load);
+ }
+ }
+};
+
+int main(int, char**) {
+ TestEachAtomicType<TestCompareExchangeWeak>()();
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/convert.pass.cpp b/libcxx/test/std/atomics/atomics.ref/convert.pass.cpp
new file mode 100644
index 0000000..2a58a5e
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/convert.pass.cpp
@@ -0,0 +1,45 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+// XFAIL: !has-1024-bit-atomics
+
+// operator T() const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_helper.h"
+#include "test_macros.h"
+
+template <typename T>
+struct TestConvert {
+ void operator()() const {
+ T x(T(1));
+
+ T copy = x;
+ std::atomic_ref<T> const a(copy);
+
+ T converted = a;
+ assert(converted == x);
+
+ ASSERT_NOEXCEPT(T(a));
+ static_assert(std::is_nothrow_convertible_v<std::atomic_ref<T>, T>);
+
+ auto store = [](std::atomic_ref<T> const& y, T, T new_val) { y.store(new_val); };
+ auto load = [](std::atomic_ref<T> const& y) { return static_cast<T>(y); };
+ test_seq_cst<T>(store, load);
+ }
+};
+
+int main(int, char**) {
+ TestEachAtomicType<TestConvert>()();
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/ctor.pass.cpp b/libcxx/test/std/atomics/atomics.ref/ctor.pass.cpp
new file mode 100644
index 0000000..d6c6474
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/ctor.pass.cpp
@@ -0,0 +1,37 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// <atomic>
+
+// explicit atomic_ref(T&);
+
+#include <atomic>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_macros.h"
+
+template <typename T>
+struct TestCtor {
+ void operator()() const {
+ // check that the constructor is explicit
+ static_assert(!std::is_convertible_v<T, std::atomic_ref<T>>);
+ static_assert(std::is_constructible_v<std::atomic_ref<T>, T&>);
+
+ T x(T(0));
+ std::atomic_ref<T> a(x);
+ (void)a;
+ }
+};
+
+int main(int, char**) {
+ TestEachAtomicType<TestCtor>()();
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/deduction.pass.cpp b/libcxx/test/std/atomics/atomics.ref/deduction.pass.cpp
new file mode 100644
index 0000000..24a399a
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/deduction.pass.cpp
@@ -0,0 +1,33 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// <atomic>
+
+// explicit atomic_ref(T&);
+
+#include <atomic>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_macros.h"
+
+template <typename T>
+struct TestDeduction {
+ void operator()() const {
+ T x(T(0));
+ std::atomic_ref a(x);
+ ASSERT_SAME_TYPE(decltype(a), std::atomic_ref<T>);
+ }
+};
+
+int main(int, char**) {
+ TestEachAtomicType<TestDeduction>()();
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/exchange.pass.cpp b/libcxx/test/std/atomics/atomics.ref/exchange.pass.cpp
new file mode 100644
index 0000000..cd998d4
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/exchange.pass.cpp
@@ -0,0 +1,45 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+// XFAIL: !has-1024-bit-atomics
+
+// T exchange(T, memory_order = memory_order::seq_cst) const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <concepts>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_macros.h"
+
+template <typename T>
+struct TestExchange {
+ void operator()() const {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ {
+ std::same_as<T> decltype(auto) y = a.exchange(T(2));
+ assert(y == T(1));
+ ASSERT_NOEXCEPT(a.exchange(T(2)));
+ }
+
+ {
+ std::same_as<T> decltype(auto) y = a.exchange(T(3), std::memory_order_seq_cst);
+ assert(y == T(2));
+ ASSERT_NOEXCEPT(a.exchange(T(3), std::memory_order_seq_cst));
+ }
+ }
+};
+
+int main(int, char**) {
+ TestEachAtomicType<TestExchange>()();
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/fetch_add.pass.cpp b/libcxx/test/std/atomics/atomics.ref/fetch_add.pass.cpp
new file mode 100644
index 0000000..908a687
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/fetch_add.pass.cpp
@@ -0,0 +1,113 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+
+// integral-type fetch_add(integral-type, memory_order = memory_order::seq_cst) const noexcept;
+// floating-point-type fetch_add(floating-point-type, memory_order = memory_order::seq_cst) const noexcept;
+// T* fetch_add(difference_type, memory_order = memory_order::seq_cst) const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <concepts>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_helper.h"
+#include "test_macros.h"
+
+template <typename T>
+concept has_fetch_add = requires {
+ std::declval<T const>().fetch_add(std::declval<T>());
+ std::declval<T const>().fetch_add(std::declval<T>(), std::declval<std::memory_order>());
+};
+
+template <typename T>
+struct TestDoesNotHaveFetchAdd {
+ void operator()() const { static_assert(!has_fetch_add<std::atomic_ref<T>>); }
+};
+
+template <typename T>
+struct TestFetchAdd {
+ void operator()() const {
+ if constexpr (std::is_arithmetic_v<T>) {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ {
+ std::same_as<T> decltype(auto) y = a.fetch_add(T(2));
+ assert(y == T(1));
+ assert(x == T(3));
+ ASSERT_NOEXCEPT(a.fetch_add(T(0)));
+ }
+
+ {
+ std::same_as<T> decltype(auto) y = a.fetch_add(T(4), std::memory_order_relaxed);
+ assert(y == T(3));
+ assert(x == T(7));
+ ASSERT_NOEXCEPT(a.fetch_add(T(0), std::memory_order_relaxed));
+ }
+ } else if constexpr (std::is_pointer_v<T>) {
+ using U = std::remove_pointer_t<T>;
+ U t[9] = {};
+ T p{&t[1]};
+ std::atomic_ref<T> const a(p);
+
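+ // For pointer types, fetch_add advances by whole elements (difference_type is std::ptrdiff_t),
+ // so adding 2 moves the referenced pointer from &t[1] to &t[3].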
+ {
+ std::same_as<T> decltype(auto) y = a.fetch_add(2);
+ assert(y == &t[1]);
+ assert(a == &t[3]);
+ ASSERT_NOEXCEPT(a.fetch_add(0));
+ }
+
+ {
+ std::same_as<T> decltype(auto) y = a.fetch_add(4, std::memory_order_relaxed);
+ assert(y == &t[3]);
+ assert(a == &t[7]);
+ ASSERT_NOEXCEPT(a.fetch_add(0, std::memory_order_relaxed));
+ }
+ } else {
+ static_assert(std::is_void_v<T>);
+ }
+
+ // memory_order::release
+ {
+ auto fetch_add = [](std::atomic_ref<T> const& x, T old_val, T new_val) {
+ x.fetch_add(new_val - old_val, std::memory_order::release);
+ };
+ auto load = [](std::atomic_ref<T> const& x) { return x.load(std::memory_order::acquire); };
+ test_acquire_release<T>(fetch_add, load);
+ }
+
+ // memory_order::seq_cst
+ {
+ auto fetch_add_no_arg = [](std::atomic_ref<T> const& x, T old_val, T new_val) { x.fetch_add(new_val - old_val); };
+ auto fetch_add_with_order = [](std::atomic_ref<T> const& x, T old_val, T new_val) {
+ x.fetch_add(new_val - old_val, std::memory_order::seq_cst);
+ };
+ auto load = [](std::atomic_ref<T> const& x) { return x.load(); };
+ test_seq_cst<T>(fetch_add_no_arg, load);
+ test_seq_cst<T>(fetch_add_with_order, load);
+ }
+ }
+};
+
+int main(int, char**) {
+ TestEachIntegralType<TestFetchAdd>()();
+
+ TestFetchAdd<float>()();
+ TestFetchAdd<double>()();
+
+ TestEachPointerType<TestFetchAdd>()();
+
+ TestDoesNotHaveFetchAdd<bool>()();
+ TestDoesNotHaveFetchAdd<UserAtomicType>()();
+ TestDoesNotHaveFetchAdd<LargeUserAtomicType>()();
+
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/fetch_and.pass.cpp b/libcxx/test/std/atomics/atomics.ref/fetch_and.pass.cpp
new file mode 100644
index 0000000..8f0bec2
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/fetch_and.pass.cpp
@@ -0,0 +1,69 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+
+// integral-type fetch_and(integral-type, memory_order = memory_order::seq_cst) const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <concepts>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_macros.h"
+
+template <typename T>
+concept has_fetch_and = requires {
+ std::declval<T const>().fetch_and(std::declval<T>());
+ std::declval<T const>().fetch_and(std::declval<T>(), std::declval<std::memory_order>());
+};
+
+template <typename T>
+struct TestDoesNotHaveFetchAnd {
+ void operator()() const { static_assert(!has_fetch_and<std::atomic_ref<T>>); }
+};
+
+template <typename T>
+struct TestFetchAnd {
+ void operator()() const {
+ static_assert(std::is_integral_v<T>);
+
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ {
+ std::same_as<T> decltype(auto) y = a.fetch_and(T(2));
+ assert(y == T(1));
+ assert(x == T(0));
+ ASSERT_NOEXCEPT(a.fetch_and(T(0)));
+ }
+
+ x = T(1);
+
+ {
+ std::same_as<T> decltype(auto) y = a.fetch_and(T(2), std::memory_order_relaxed);
+ assert(y == T(1));
+ assert(x == T(0));
+ ASSERT_NOEXCEPT(a.fetch_and(T(0), std::memory_order_relaxed));
+ }
+ }
+};
+
+int main(int, char**) {
+ TestEachIntegralType<TestFetchAnd>()();
+
+ TestEachFloatingPointType<TestDoesNotHaveFetchAnd>()();
+
+ TestEachPointerType<TestDoesNotHaveFetchAnd>()();
+
+ TestDoesNotHaveFetchAnd<bool>()();
+ TestDoesNotHaveFetchAnd<UserAtomicType>()();
+ TestDoesNotHaveFetchAnd<LargeUserAtomicType>()();
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/fetch_or.pass.cpp b/libcxx/test/std/atomics/atomics.ref/fetch_or.pass.cpp
new file mode 100644
index 0000000..2045868
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/fetch_or.pass.cpp
@@ -0,0 +1,68 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+
+// integral-type fetch_or(integral-type, memory_order = memory_order::seq_cst) const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <concepts>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_macros.h"
+
+template <typename T>
+concept has_fetch_or = requires {
+ std::declval<T const>().fetch_or(std::declval<T>());
+ std::declval<T const>().fetch_or(std::declval<T>(), std::declval<std::memory_order>());
+};
+
+template <typename T>
+struct TestDoesNotHaveFetchOr {
+ void operator()() const { static_assert(!has_fetch_or<std::atomic_ref<T>>); }
+};
+
+template <typename T>
+struct TestFetchOr {
+ void operator()() const {
+ static_assert(std::is_integral_v<T>);
+
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ {
+ std::same_as<T> decltype(auto) y = a.fetch_or(T(2));
+ assert(y == T(1));
+ assert(x == T(3));
+ ASSERT_NOEXCEPT(a.fetch_or(T(0)));
+ }
+
+ {
+ std::same_as<T> decltype(auto) y = a.fetch_or(T(2), std::memory_order_relaxed);
+ assert(y == T(3));
+ assert(x == T(3));
+ ASSERT_NOEXCEPT(a.fetch_or(T(0), std::memory_order_relaxed));
+ }
+ }
+};
+
+int main(int, char**) {
+ TestEachIntegralType<TestFetchOr>()();
+
+ TestEachFloatingPointType<TestDoesNotHaveFetchOr>()();
+
+ TestEachPointerType<TestDoesNotHaveFetchOr>()();
+
+ TestDoesNotHaveFetchOr<bool>()();
+ TestDoesNotHaveFetchOr<UserAtomicType>()();
+ TestDoesNotHaveFetchOr<LargeUserAtomicType>()();
+
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/fetch_sub.pass.cpp b/libcxx/test/std/atomics/atomics.ref/fetch_sub.pass.cpp
new file mode 100644
index 0000000..5456045
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/fetch_sub.pass.cpp
@@ -0,0 +1,113 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+
+// integral-type fetch_sub(integral-type, memory_order = memory_order::seq_cst) const noexcept;
+// floating-point-type fetch_sub(floating-point-type, memory_order = memory_order::seq_cst) const noexcept;
+// T* fetch_sub(difference_type, memory_order = memory_order::seq_cst) const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <concepts>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_helper.h"
+#include "test_macros.h"
+
+template <typename T>
+concept has_fetch_sub = requires {
+ std::declval<T const>().fetch_sub(std::declval<T>());
+ std::declval<T const>().fetch_sub(std::declval<T>(), std::declval<std::memory_order>());
+};
+
+template <typename T>
+struct TestDoesNotHaveFetchSub {
+ void operator()() const { static_assert(!has_fetch_sub<std::atomic_ref<T>>); }
+};
+
+template <typename T>
+struct TestFetchSub {
+ void operator()() const {
+ if constexpr (std::is_arithmetic_v<T>) {
+ T x(T(7));
+ std::atomic_ref<T> const a(x);
+
+ {
+ std::same_as<T> decltype(auto) y = a.fetch_sub(T(4));
+ assert(y == T(7));
+ assert(x == T(3));
+ ASSERT_NOEXCEPT(a.fetch_sub(T(0)));
+ }
+
+ {
+ std::same_as<T> decltype(auto) y = a.fetch_sub(T(2), std::memory_order_relaxed);
+ assert(y == T(3));
+ assert(x == T(1));
+ ASSERT_NOEXCEPT(a.fetch_sub(T(0), std::memory_order_relaxed));
+ }
+ } else if constexpr (std::is_pointer_v<T>) {
+ using U = std::remove_pointer_t<T>;
+ U t[9] = {};
+ T p{&t[7]};
+ std::atomic_ref<T> const a(p);
+
+ {
+ std::same_as<T> decltype(auto) y = a.fetch_sub(4);
+ assert(y == &t[7]);
+ assert(a == &t[3]);
+ ASSERT_NOEXCEPT(a.fetch_sub(0));
+ }
+
+ {
+ std::same_as<T> decltype(auto) y = a.fetch_sub(2, std::memory_order_relaxed);
+ assert(y == &t[3]);
+ assert(a == &t[1]);
+ ASSERT_NOEXCEPT(a.fetch_sub(0, std::memory_order_relaxed));
+ }
+ } else {
+ static_assert(std::is_void_v<T>);
+ }
+
+ // memory_order::release
+ {
+ auto fetch_sub = [](std::atomic_ref<T> const& x, T old_val, T new_val) {
+ x.fetch_sub(old_val - new_val, std::memory_order::release);
+ };
+ auto load = [](std::atomic_ref<T> const& x) { return x.load(std::memory_order::acquire); };
+ test_acquire_release<T>(fetch_sub, load);
+ }
+
+ // memory_order::seq_cst
+ {
+ auto fetch_sub_no_arg = [](std::atomic_ref<T> const& x, T old_val, T new_val) { x.fetch_sub(old_val - new_val); };
+ auto fetch_sub_with_order = [](std::atomic_ref<T> const& x, T old_val, T new_val) {
+ x.fetch_sub(old_val - new_val, std::memory_order::seq_cst);
+ };
+ auto load = [](std::atomic_ref<T> const& x) { return x.load(); };
+ test_seq_cst<T>(fetch_sub_no_arg, load);
+ test_seq_cst<T>(fetch_sub_with_order, load);
+ }
+ }
+};
+
+int main(int, char**) {
+ TestEachIntegralType<TestFetchSub>()();
+
+ TestFetchSub<float>()();
+ TestFetchSub<double>()();
+
+ TestEachPointerType<TestFetchSub>()();
+
+ TestDoesNotHaveFetchSub<bool>()();
+ TestDoesNotHaveFetchSub<UserAtomicType>()();
+ TestDoesNotHaveFetchSub<LargeUserAtomicType>()();
+
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/fetch_xor.pass.cpp b/libcxx/test/std/atomics/atomics.ref/fetch_xor.pass.cpp
new file mode 100644
index 0000000..aade87f
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/fetch_xor.pass.cpp
@@ -0,0 +1,68 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+
+// integral-type fetch_xor(integral-type, memory_order = memory_order::seq_cst) const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <concepts>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_macros.h"
+
+template <typename T>
+concept has_fetch_xor = requires {
+ std::declval<T const>().fetch_xor(std::declval<T>());
+ std::declval<T const>().fetch_xor(std::declval<T>(), std::declval<std::memory_order>());
+};
+
+template <typename T>
+struct TestDoesNotHaveFetchXor {
+ void operator()() const { static_assert(!has_fetch_xor<std::atomic_ref<T>>); }
+};
+
+template <typename T>
+struct TestFetchXor {
+ void operator()() const {
+ static_assert(std::is_integral_v<T>);
+
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ {
+ std::same_as<T> decltype(auto) y = a.fetch_xor(T(2));
+ assert(y == T(1));
+ assert(x == T(3));
+ ASSERT_NOEXCEPT(a.fetch_xor(T(0)));
+ }
+
+ {
+ std::same_as<T> decltype(auto) y = a.fetch_xor(T(2), std::memory_order_relaxed);
+ assert(y == T(3));
+ assert(x == T(1));
+ ASSERT_NOEXCEPT(a.fetch_xor(T(0), std::memory_order_relaxed));
+ }
+ }
+};
+
+int main(int, char**) {
+ TestEachIntegralType<TestFetchXor>()();
+
+ TestEachFloatingPointType<TestDoesNotHaveFetchXor>()();
+
+ TestEachPointerType<TestDoesNotHaveFetchXor>()();
+
+ TestDoesNotHaveFetchXor<bool>()();
+ TestDoesNotHaveFetchXor<UserAtomicType>()();
+ TestDoesNotHaveFetchXor<LargeUserAtomicType>()();
+
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/increment_decrement.pass.cpp b/libcxx/test/std/atomics/atomics.ref/increment_decrement.pass.cpp
new file mode 100644
index 0000000..c84c89b
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/increment_decrement.pass.cpp
@@ -0,0 +1,97 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+
+// integral-type operator++(int) const noexcept;
+// integral-type operator--(int) const noexcept;
+// integral-type operator++() const noexcept;
+// integral-type operator--() const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <concepts>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_macros.h"
+
+template <typename T>
+concept has_pre_increment_operator = requires { ++std::declval<T const>(); };
+
+template <typename T>
+concept has_post_increment_operator = requires { std::declval<T const>()++; };
+
+template <typename T>
+concept has_pre_decrement_operator = requires { --std::declval<T const>(); };
+
+template <typename T>
+concept has_post_decrement_operator = requires { std::declval<T const>()--; };
+
+template <typename T>
+constexpr bool does_not_have_increment_nor_decrement_operators() {
+ return !has_pre_increment_operator<T> && !has_pre_decrement_operator<T> && !has_post_increment_operator<T> &&
+ !has_post_decrement_operator<T>;
+}
+
+template <typename T>
+struct TestDoesNotHaveIncrementDecrement {
+ void operator()() const { static_assert(does_not_have_increment_nor_decrement_operators<T>()); }
+};
+
+template <typename T>
+struct TestIncrementDecrement {
+ void operator()() const {
+ static_assert(std::is_integral_v<T>);
+
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ {
+ std::same_as<T> decltype(auto) y = ++a;
+ assert(y == T(2));
+ assert(x == T(2));
+ ASSERT_NOEXCEPT(++a);
+ }
+
+ {
+ std::same_as<T> decltype(auto) y = --a;
+ assert(y == T(1));
+ assert(x == T(1));
+ ASSERT_NOEXCEPT(--a);
+ }
+
+ {
+ std::same_as<T> decltype(auto) y = a++;
+ assert(y == T(1));
+ assert(x == T(2));
+ ASSERT_NOEXCEPT(a++);
+ }
+
+ {
+ std::same_as<T> decltype(auto) y = a--;
+ assert(y == T(2));
+ assert(x == T(1));
+ ASSERT_NOEXCEPT(a--);
+ }
+ }
+};
+
+int main(int, char**) {
+ TestEachIntegralType<TestIncrementDecrement>()();
+
+ TestEachFloatingPointType<TestDoesNotHaveIncrementDecrement>()();
+
+ TestEachPointerType<TestDoesNotHaveIncrementDecrement>()();
+
+ TestDoesNotHaveIncrementDecrement<bool>()();
+ TestDoesNotHaveIncrementDecrement<UserAtomicType>()();
+ TestDoesNotHaveIncrementDecrement<LargeUserAtomicType>()();
+
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/is_always_lock_free.pass.cpp b/libcxx/test/std/atomics/atomics.ref/is_always_lock_free.pass.cpp
new file mode 100644
index 0000000..94f65e3b4
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/is_always_lock_free.pass.cpp
@@ -0,0 +1,71 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// <atomic>
+
+// static constexpr bool is_always_lock_free;
+// bool is_lock_free() const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <concepts>
+
+#include "test_macros.h"
+
+template <typename T>
+void check_always_lock_free(std::atomic_ref<T> const a) {
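+ // If the type is statically guaranteed lock-free, every individual object must also
+ // report is_lock_free(); the converse is not required.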
+ std::same_as<const bool> decltype(auto) is_always_lock_free = std::atomic_ref<T>::is_always_lock_free;
+ if (is_always_lock_free) {
+ std::same_as<bool> decltype(auto) is_lock_free = a.is_lock_free();
+ assert(is_lock_free);
+ }
+ ASSERT_NOEXCEPT(a.is_lock_free());
+}
+
+#define CHECK_ALWAYS_LOCK_FREE(T) \
+ do { \
+ typedef T type; \
+ type obj{}; \
+ check_always_lock_free(std::atomic_ref<type>(obj)); \
+ } while (0)
+
+void test() {
+ int i = 0;
+ check_always_lock_free(std::atomic_ref<int>(i));
+
+ float f = 0.f;
+ check_always_lock_free(std::atomic_ref<float>(f));
+
+ int* p = &i;
+ check_always_lock_free(std::atomic_ref<int*>(p));
+
+ CHECK_ALWAYS_LOCK_FREE(struct Empty{});
+ CHECK_ALWAYS_LOCK_FREE(struct OneInt { int i; });
+ CHECK_ALWAYS_LOCK_FREE(struct IntArr2 { int i[2]; });
+ CHECK_ALWAYS_LOCK_FREE(struct FloatArr3 { float i[3]; });
+ CHECK_ALWAYS_LOCK_FREE(struct LLIArr2 { long long int i[2]; });
+ CHECK_ALWAYS_LOCK_FREE(struct LLIArr4 { long long int i[4]; });
+ CHECK_ALWAYS_LOCK_FREE(struct LLIArr8 { long long int i[8]; });
+ CHECK_ALWAYS_LOCK_FREE(struct LLIArr16 { long long int i[16]; });
+ CHECK_ALWAYS_LOCK_FREE(struct Padding {
+ char c; /* padding */
+ long long int i;
+ });
+ CHECK_ALWAYS_LOCK_FREE(union IntFloat {
+ int i;
+ float f;
+ });
+ CHECK_ALWAYS_LOCK_FREE(enum class CharEnumClass : char{foo});
+}
+
+int main(int, char**) {
+ test();
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/load.pass.cpp b/libcxx/test/std/atomics/atomics.ref/load.pass.cpp
new file mode 100644
index 0000000..feed0fb
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/load.pass.cpp
@@ -0,0 +1,62 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+// XFAIL: !has-1024-bit-atomics
+
+// T load(memory_order = memory_order::seq_cst) const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <concepts>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_helper.h"
+#include "test_macros.h"
+
+template <typename T>
+struct TestLoad {
+ void operator()() const {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ {
+ std::same_as<T> decltype(auto) y = a.load();
+ assert(y == T(1));
+ ASSERT_NOEXCEPT(a.load());
+ }
+
+ {
+ std::same_as<T> decltype(auto) y = a.load(std::memory_order_seq_cst);
+ assert(y == T(1));
+ ASSERT_NOEXCEPT(a.load(std::memory_order_seq_cst));
+ }
+
+ // memory_order::seq_cst
+ {
+ auto store = [](std::atomic_ref<T> const& y, T, T new_val) { y.store(new_val); };
+ auto load_no_arg = [](std::atomic_ref<T> const& y) { return y.load(); };
+ auto load_with_order = [](std::atomic_ref<T> const& y) { return y.load(std::memory_order::seq_cst); };
+ test_seq_cst<T>(store, load_no_arg);
+ test_seq_cst<T>(store, load_with_order);
+ }
+
+ // memory_order::release
+ {
+ auto store = [](std::atomic_ref<T> const& y, T, T new_val) { y.store(new_val, std::memory_order::release); };
+ auto load = [](std::atomic_ref<T> const& y) { return y.load(std::memory_order::acquire); };
+ test_acquire_release<T>(store, load);
+ }
+ }
+};
+
+int main(int, char**) {
+ TestEachAtomicType<TestLoad>()();
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/member_types.pass.cpp b/libcxx/test/std/atomics/atomics.ref/member_types.pass.cpp
new file mode 100644
index 0000000..d4e2f01
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/member_types.pass.cpp
@@ -0,0 +1,132 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <atomic>
+
+// template <class T>
+// struct atomic_ref
+// {
+// using value_type = T;
+// using difference_type = value_type; // only for atomic_ref<Integral> and
+// // atomic_ref<Floating> specializations
+// using difference_type = std::ptrdiff_t; // only for atomic_ref<T*> specializations
+//
+// explicit atomic_ref(T&);
+// atomic_ref(const atomic_ref&) noexcept;
+// atomic_ref& operator=(const atomic_ref&) = delete;
+// };
+
+#include <atomic>
+#include <type_traits>
+
+#include "test_macros.h"
+
+template <class T>
+concept has_difference_type = requires { typename T::difference_type; };
+
+template <class T>
+void check_member_types() {
+ if constexpr ((std::is_integral_v<T> && !std::is_same_v<T, bool>) || std::is_floating_point_v<T>) {
+ ASSERT_SAME_TYPE(typename std::atomic_ref<T>::value_type, T);
+ ASSERT_SAME_TYPE(typename std::atomic_ref<T>::difference_type, T);
+ } else if constexpr (std::is_pointer_v<T>) {
+ ASSERT_SAME_TYPE(typename std::atomic_ref<T>::value_type, T);
+ ASSERT_SAME_TYPE(typename std::atomic_ref<T>::difference_type, std::ptrdiff_t);
+ } else {
+ ASSERT_SAME_TYPE(typename std::atomic_ref<T>::value_type, T);
+ static_assert(!has_difference_type<std::atomic_ref<T>>);
+ }
+}
+
+template <class T>
+void test() {
+ // value_type and difference_type (except for primary template)
+ check_member_types<T>();
+
+ static_assert(std::is_nothrow_copy_constructible_v<std::atomic_ref<T>>);
+
+ static_assert(!std::is_copy_assignable_v<std::atomic_ref<T>>);
+
+ // explicit constructor
+ static_assert(!std::is_convertible_v<T, std::atomic_ref<T>>);
+ static_assert(std::is_constructible_v<std::atomic_ref<T>, T&>);
+}
+
+void testall() {
+ // Primary template
+ struct Empty {};
+ test<Empty>();
+ struct Trivial {
+ int a;
+ float b;
+ };
+ test<Trivial>();
+ test<bool>();
+
+ // Partial specialization for pointer types
+ test<void*>();
+
+ // Specialization for integral types
+ // + character types
+ test<char>();
+ test<char8_t>();
+ test<char16_t>();
+ test<char32_t>();
+ test<wchar_t>();
+ // + standard signed integer types
+ test<signed char>();
+ test<short>();
+ test<int>();
+ test<long>();
+ test<long long>();
+ // + standard unsigned integer types
+ test<unsigned char>();
+ test<unsigned short>();
+ test<unsigned int>();
+ test<unsigned long>();
+ test<unsigned long long>();
+ // + any other types needed by the typedefs in the header <cstdint>
+ test<int8_t>();
+ test<int16_t>();
+ test<int32_t>();
+ test<int64_t>();
+ test<int_fast8_t>();
+ test<int_fast16_t>();
+ test<int_fast32_t>();
+ test<int_fast64_t>();
+ test<int_least8_t>();
+ test<int_least16_t>();
+ test<int_least32_t>();
+ test<int_least64_t>();
+ test<intmax_t>();
+ test<intptr_t>();
+ test<uint8_t>();
+ test<uint16_t>();
+ test<uint32_t>();
+ test<uint64_t>();
+ test<uint_fast8_t>();
+ test<uint_fast16_t>();
+ test<uint_fast32_t>();
+ test<uint_fast64_t>();
+ test<uint_least8_t>();
+ test<uint_least16_t>();
+ test<uint_least32_t>();
+ test<uint_least64_t>();
+ test<uintmax_t>();
+ test<uintptr_t>();
+
+ // Specialization for floating-point types
+ // + floating-point types
+ test<float>();
+ test<double>();
+ test<long double>();
+ // + TODO extended floating-point types
+}
+
+int main(int, char**) { return 0; }
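For context on the difference_type distinction exercised above: arithmetic on atomic_ref<T*> is in units of whole elements and takes std::ptrdiff_t, while the integral and floating-point specializations use the value type itself. The short sketch below is purely illustrative (the names buf, p, n, pref, nref are made up here) and assumes a C++20 compiler:

#include <atomic>
#include <cassert>
#include <cstddef>

int main() {
  // Pointer specialization: difference_type is std::ptrdiff_t.
  int buf[4] = {0, 1, 2, 3};
  int* p = buf;
  std::atomic_ref<int*> pref(p);
  std::ptrdiff_t step = 2;
  pref.fetch_add(step);      // advances the pointer by two elements
  assert(p == buf + 2);

  // Integral specialization: difference_type is the value type (int).
  int n = 1;
  std::atomic_ref<int> nref(n);
  nref.fetch_add(5);
  assert(n == 6);
  return 0;
}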
diff --git a/libcxx/test/std/atomics/atomics.ref/notify_all.pass.cpp b/libcxx/test/std/atomics/atomics.ref/notify_all.pass.cpp
new file mode 100644
index 0000000..382b19f
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/notify_all.pass.cpp
@@ -0,0 +1,78 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: no-threads
+// XFAIL: availability-synchronization_library-missing
+// XFAIL: !has-64-bit-atomics
+// XFAIL: !has-1024-bit-atomics
+
+// void notify_all() const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <thread>
+#include <type_traits>
+#include <vector>
+
+#include "atomic_helpers.h"
+#include "make_test_thread.h"
+#include "test_macros.h"
+
+template <typename T>
+struct TestNotifyAll {
+ void operator()() const {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ bool done = false;
+ std::atomic<int> started_num = 0;
+ std::atomic<int> wait_done_num = 0;
+
+ constexpr auto number_of_threads = 8;
+ std::vector<std::thread> threads;
+ threads.reserve(number_of_threads);
+
+ for (auto j = 0; j < number_of_threads; ++j) {
+ threads.push_back(support::make_test_thread([&a, &started_num, &done, &wait_done_num] {
+ started_num.fetch_add(1, std::memory_order::relaxed);
+
+ a.wait(T(1));
+ wait_done_num.fetch_add(1, std::memory_order::relaxed);
+
+ // likely to fail if wait did not block
+ assert(done);
+ }));
+ }
+
+ while (started_num.load(std::memory_order::relaxed) != number_of_threads) {
+ std::this_thread::yield();
+ }
+
+ std::this_thread::sleep_for(std::chrono::milliseconds(1));
+
+ done = true;
+ a.store(T(3));
+ a.notify_all();
+
+    // notify_all should unblock all the threads so that the loop below won't get stuck
+ while (wait_done_num.load(std::memory_order::relaxed) != number_of_threads) {
+ std::this_thread::yield();
+ }
+
+ for (auto& thread : threads) {
+ thread.join();
+ }
+
+ ASSERT_NOEXCEPT(a.notify_all());
+ }
+};
+
+int main(int, char**) {
+ TestEachAtomicType<TestNotifyAll>()();
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/notify_one.pass.cpp b/libcxx/test/std/atomics/atomics.ref/notify_one.pass.cpp
new file mode 100644
index 0000000..611e674
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/notify_one.pass.cpp
@@ -0,0 +1,46 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: no-threads
+// XFAIL: availability-synchronization_library-missing
+// XFAIL: !has-64-bit-atomics
+// XFAIL: !has-1024-bit-atomics
+
+// void notify_one() const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <thread>
+#include <type_traits>
+#include <vector>
+
+#include "atomic_helpers.h"
+#include "make_test_thread.h"
+#include "test_macros.h"
+
+template <typename T>
+struct TestNotifyOne {
+ void operator()() const {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ std::thread t = support::make_test_thread([&]() {
+ a.store(T(3));
+ a.notify_one();
+ });
+ a.wait(T(1));
+ assert(a.load() == T(3));
+ t.join();
+ ASSERT_NOEXCEPT(a.notify_one());
+ }
+};
+
+int main(int, char**) {
+ TestEachAtomicType<TestNotifyOne>()();
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/operator_minus_equals.pass.cpp b/libcxx/test/std/atomics/atomics.ref/operator_minus_equals.pass.cpp
new file mode 100644
index 0000000..571d626
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/operator_minus_equals.pass.cpp
@@ -0,0 +1,79 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+
+// integral-type operator-=(integral-type) const noexcept;
+// floating-point-type operator-=(floating-point-type) const noexcept;
+// T* operator-=(difference_type) const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <concepts>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_helper.h"
+#include "test_macros.h"
+
+template <typename T>
+concept has_operator_minus_equals = requires { std::declval<T const>() -= std::declval<T>(); };
+
+template <typename T>
+struct TestDoesNotHaveOperatorMinusEquals {
+ void operator()() const { static_assert(!has_operator_minus_equals<std::atomic_ref<T>>); }
+};
+
+template <typename T>
+struct TestOperatorMinusEquals {
+ void operator()() const {
+ if constexpr (std::is_arithmetic_v<T>) {
+ T x(T(3));
+ std::atomic_ref<T> const a(x);
+
+ std::same_as<T> decltype(auto) y = (a -= T(2));
+ assert(y == T(1));
+ assert(x == T(1));
+ ASSERT_NOEXCEPT(a -= T(0));
+ } else if constexpr (std::is_pointer_v<T>) {
+ using U = std::remove_pointer_t<T>;
+ U t[9] = {};
+ T p{&t[3]};
+ std::atomic_ref<T> const a(p);
+
+ std::same_as<T> decltype(auto) y = (a -= 2);
+ assert(y == &t[1]);
+ assert(a == &t[1]);
+ ASSERT_NOEXCEPT(a -= 0);
+ } else {
+ static_assert(std::is_void_v<T>);
+ }
+
+ // memory_order::seq_cst
+ {
+ auto minus_equals = [](std::atomic_ref<T> const& x, T old_val, T new_val) { x -= (old_val - new_val); };
+ auto load = [](std::atomic_ref<T> const& x) { return x.load(); };
+ test_seq_cst<T>(minus_equals, load);
+ }
+ }
+};
+
+int main(int, char**) {
+ TestEachIntegralType<TestOperatorMinusEquals>()();
+
+ TestOperatorMinusEquals<float>()();
+ TestOperatorMinusEquals<double>()();
+
+ TestEachPointerType<TestOperatorMinusEquals>()();
+
+ TestDoesNotHaveOperatorMinusEquals<bool>()();
+ TestDoesNotHaveOperatorMinusEquals<UserAtomicType>()();
+ TestDoesNotHaveOperatorMinusEquals<LargeUserAtomicType>()();
+
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/operator_plus_equals.pass.cpp b/libcxx/test/std/atomics/atomics.ref/operator_plus_equals.pass.cpp
new file mode 100644
index 0000000..de48ea5
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/operator_plus_equals.pass.cpp
@@ -0,0 +1,79 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+
+// integral-type operator+=(integral-type) const noexcept;
+// floating-point-type operator+=(floating-point-type) const noexcept;
+// T* operator+=(difference_type) const noexcept;
+
+#include <atomic>
+#include <concepts>
+#include <cassert>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_helper.h"
+#include "test_macros.h"
+
+template <typename T>
+concept has_operator_plus_equals = requires { std::declval<T const>() += std::declval<T>(); };
+
+template <typename T>
+struct TestDoesNotHaveOperatorPlusEquals {
+ void operator()() const { static_assert(!has_operator_plus_equals<std::atomic_ref<T>>); }
+};
+
+template <typename T>
+struct TestOperatorPlusEquals {
+ void operator()() const {
+ if constexpr (std::is_arithmetic_v<T>) {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ std::same_as<T> decltype(auto) y = (a += T(2));
+ assert(y == T(3));
+ assert(x == T(3));
+ ASSERT_NOEXCEPT(a += T(0));
+ } else if constexpr (std::is_pointer_v<T>) {
+ using U = std::remove_pointer_t<T>;
+ U t[9] = {};
+ T p{&t[1]};
+ std::atomic_ref<T> const a(p);
+
+ std::same_as<T> decltype(auto) y = (a += 2);
+ assert(y == &t[3]);
+ assert(a == &t[3]);
+ ASSERT_NOEXCEPT(a += 0);
+ } else {
+ static_assert(std::is_void_v<T>);
+ }
+
+ // memory_order::seq_cst
+ {
+ auto plus_equals = [](std::atomic_ref<T> const& x, T old_val, T new_val) { x += (new_val - old_val); };
+ auto load = [](std::atomic_ref<T> const& x) { return x.load(); };
+ test_seq_cst<T>(plus_equals, load);
+ }
+ }
+};
+
+int main(int, char**) {
+ TestEachIntegralType<TestOperatorPlusEquals>()();
+
+ TestOperatorPlusEquals<float>()();
+ TestOperatorPlusEquals<double>()();
+
+ TestEachPointerType<TestOperatorPlusEquals>()();
+
+ TestDoesNotHaveOperatorPlusEquals<bool>()();
+ TestDoesNotHaveOperatorPlusEquals<UserAtomicType>()();
+ TestDoesNotHaveOperatorPlusEquals<LargeUserAtomicType>()();
+
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/required_alignment.pass.cpp b/libcxx/test/std/atomics/atomics.ref/required_alignment.pass.cpp
new file mode 100644
index 0000000..86e0cba
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/required_alignment.pass.cpp
@@ -0,0 +1,39 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// static constexpr size_t required_alignment;
+
+#include <atomic>
+#include <cassert>
+#include <concepts>
+
+template <typename T>
+constexpr void check_required_alignment() {
+ std::same_as<const std::size_t> decltype(auto) required_alignment = std::atomic_ref<T>::required_alignment;
+ assert(required_alignment >= alignof(T));
+}
+
+constexpr bool test() {
+ check_required_alignment<int>();
+ check_required_alignment<float>();
+ check_required_alignment<int*>();
+ struct Empty {};
+ check_required_alignment<Empty>();
+ struct Trivial {
+ int a;
+ };
+ check_required_alignment<Trivial>();
+ return true;
+}
+
+int main(int, char**) {
+ test();
+ static_assert(test());
+ return 0;
+}
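A usage note on what the assertion above guarantees: required_alignment is the alignment the referenced object itself must have, so callers typically impose it with alignas on the object they intend to wrap. A minimal illustrative sketch (Counter and bump are hypothetical names), assuming C++20:

#include <atomic>

struct Counter {
  int value;
};

// The referenced object must be at least as aligned as required_alignment;
// spelling it out with alignas documents and enforces that precondition.
alignas(std::atomic_ref<Counter>::required_alignment) Counter counter{0};

void bump() {
  std::atomic_ref<Counter> ref(counter);
  Counter c = ref.load();  // atomic read of the whole (trivially copyable) struct
  c.value += 1;
  ref.store(c);            // atomic write of the updated value
}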
diff --git a/libcxx/test/std/atomics/atomics.ref/requires-trivially-copyable.verify.cpp b/libcxx/test/std/atomics/atomics.ref/requires-trivially-copyable.verify.cpp
new file mode 100644
index 0000000..9a8b036
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/requires-trivially-copyable.verify.cpp
@@ -0,0 +1,26 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// <atomic>
+
+// template<class T>
+// class atomic_ref;
+
+// The program is ill-formed if is_trivially_copyable_v<T> is false.
+
+#include <atomic>
+
+void trivially_copyable() {
+ struct X {
+ X() = default;
+ X(X const&) {} // -> not trivially copyable
+ } x;
+ // expected-error-re@*:* {{static assertion failed {{.*}}atomic_ref<T> requires that 'T' be a trivially copyable type}}
+ std::atomic_ref<X> r(x);
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/store.pass.cpp b/libcxx/test/std/atomics/atomics.ref/store.pass.cpp
new file mode 100644
index 0000000..ea01a3d
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/store.pass.cpp
@@ -0,0 +1,61 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+// XFAIL: !has-1024-bit-atomics
+
+// void store(T, memory_order = memory_order::seq_cst) const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_helper.h"
+#include "test_macros.h"
+
+template <typename T>
+struct TestStore {
+ void operator()() const {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ a.store(T(2));
+ assert(x == T(2));
+ ASSERT_NOEXCEPT(a.store(T(1)));
+
+ a.store(T(3), std::memory_order_seq_cst);
+ assert(x == T(3));
+ ASSERT_NOEXCEPT(a.store(T(0), std::memory_order_seq_cst));
+
+ // TODO memory_order::relaxed
+
+ // memory_order::seq_cst
+ {
+ auto store_no_arg = [](std::atomic_ref<T> const& y, T, T new_val) { y.store(new_val); };
+ auto store_with_order = [](std::atomic_ref<T> const& y, T, T new_val) {
+ y.store(new_val, std::memory_order::seq_cst);
+ };
+ auto load = [](std::atomic_ref<T> const& y) { return y.load(); };
+ test_seq_cst<T>(store_no_arg, load);
+ test_seq_cst<T>(store_with_order, load);
+ }
+
+ // memory_order::release
+ {
+ auto store = [](std::atomic_ref<T> const& y, T, T new_val) { y.store(new_val, std::memory_order::release); };
+ auto load = [](std::atomic_ref<T> const& y) { return y.load(std::memory_order::acquire); };
+ test_acquire_release<T>(store, load);
+ }
+ }
+};
+
+int main(int, char**) {
+ TestEachAtomicType<TestStore>()();
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/test_helper.h b/libcxx/test/std/atomics/atomics.ref/test_helper.h
new file mode 100644
index 0000000..225a70c
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/test_helper.h
@@ -0,0 +1,136 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TEST_STD_ATOMICS_ATOMIC_REF_TEST_HELPER_H
+#define TEST_STD_ATOMICS_ATOMIC_REF_TEST_HELPER_H
+
+#include <atomic>
+#include <cassert>
+#include <cmath>
+#include <vector>
+
+#include "test_macros.h"
+
+#ifndef TEST_HAS_NO_THREADS
+# include "make_test_thread.h"
+# include <thread>
+#endif
+
+template <class T>
+bool equals(T x, T y) {
+ return x == y;
+}
+
+template <class T>
+T make_value(int i) {
+ assert(i == 0 || i == 1);
+ if constexpr (std::is_pointer_v<T>) {
+ // So that pointers returned can be subtracted from one another
+ static std::remove_const_t<std::remove_pointer_t<T>> d[2];
+ return &d[i];
+ } else {
+ return T(i);
+ }
+}
+
+// Test that all threads see the exact same sequence of events.
+// The test is guaranteed to pass if store_op and load_op correctly
+// affect the memory with seq_cst ordering.
+template <class T, class StoreOp, class LoadOp>
+void test_seq_cst(StoreOp store_op, LoadOp load_op) {
+#ifndef TEST_HAS_NO_THREADS
+ for (int i = 0; i < 100; ++i) {
+ T old_value(make_value<T>(0));
+ T new_value(make_value<T>(1));
+
+ T copy_x = old_value;
+ std::atomic_ref<T> const x(copy_x);
+ T copy_y = old_value;
+ std::atomic_ref<T> const y(copy_y);
+
+ std::atomic_bool x_updated_first(false);
+ std::atomic_bool y_updated_first(false);
+
+ auto t1 = support::make_test_thread([&] { store_op(x, old_value, new_value); });
+
+ auto t2 = support::make_test_thread([&] { store_op(y, old_value, new_value); });
+
+ auto t3 = support::make_test_thread([&] {
+ while (!equals(load_op(x), new_value)) {
+ std::this_thread::yield();
+ }
+ if (!equals(load_op(y), new_value)) {
+ x_updated_first.store(true, std::memory_order_relaxed);
+ }
+ });
+
+ auto t4 = support::make_test_thread([&] {
+ while (!equals(load_op(y), new_value)) {
+ std::this_thread::yield();
+ }
+ if (!equals(load_op(x), new_value)) {
+ y_updated_first.store(true, std::memory_order_relaxed);
+ }
+ });
+
+ t1.join();
+ t2.join();
+ t3.join();
+ t4.join();
+ // thread 3 and thread 4 cannot see different orders of storing x and y
+ assert(!(x_updated_first && y_updated_first));
+ }
+#else
+ (void)store_op;
+ (void)load_op;
+#endif
+}
+
+// Test that all writes made before the store are seen by other threads after the load.
+// The test is guaranteed to pass if store_op and load_op correctly
+// affect the memory with acquire-release ordering.
+template <class T, class StoreOp, class LoadOp>
+void test_acquire_release(StoreOp store_op, LoadOp load_op) {
+#ifndef TEST_HAS_NO_THREADS
+ for (auto i = 0; i < 100; ++i) {
+ T old_value(make_value<T>(0));
+ T new_value(make_value<T>(1));
+
+ T copy = old_value;
+ std::atomic_ref<T> const at(copy);
+ int non_atomic = 5;
+
+ constexpr auto number_of_threads = 8;
+ std::vector<std::thread> threads;
+ threads.reserve(number_of_threads);
+
+ for (auto j = 0; j < number_of_threads; ++j) {
+ threads.push_back(support::make_test_thread([&at, &non_atomic, load_op, new_value] {
+ while (!equals(load_op(at), new_value)) {
+ std::this_thread::yield();
+ }
+        // The writes made by the other thread before its release store are
+        // visible to this thread's read after the acquire load.
+ }));
+ }
+
+ non_atomic = 6;
+ store_op(at, old_value, new_value);
+
+ for (auto& thread : threads) {
+ thread.join();
+ }
+ }
+#else
+ (void)store_op;
+ (void)load_op;
+#endif
+}
+
+#endif // TEST_STD_ATOMICS_ATOMIC_REF_TEST_HELPER_H
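To make the idea behind test_seq_cst concrete outside the store_op/load_op abstraction, here is a standalone sketch of the same IRIW-style check written directly against std::atomic<int>; it is illustrative only and mirrors, rather than replaces, the helper above:

#include <atomic>
#include <cassert>
#include <thread>

int main() {
  std::atomic<int> x{0}, y{0};
  std::atomic<bool> x_first{false}, y_first{false};

  std::thread t1([&] { x.store(1); });   // seq_cst by default
  std::thread t2([&] { y.store(1); });
  std::thread t3([&] {
    while (x.load() != 1) {}             // wait until x's store is visible
    if (y.load() != 1)
      x_first.store(true);               // observed x's store before y's
  });
  std::thread t4([&] {
    while (y.load() != 1) {}             // wait until y's store is visible
    if (x.load() != 1)
      y_first.store(true);               // observed y's store before x's
  });

  t1.join(); t2.join(); t3.join(); t4.join();
  // With seq_cst there is a single total order of the two stores, so the
  // two observers cannot disagree about which one happened first.
  assert(!(x_first && y_first));
  return 0;
}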
diff --git a/libcxx/test/std/atomics/atomics.ref/wait.pass.cpp b/libcxx/test/std/atomics/atomics.ref/wait.pass.cpp
new file mode 100644
index 0000000..e5310fe
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/wait.pass.cpp
@@ -0,0 +1,88 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: no-threads
+// XFAIL: availability-synchronization_library-missing
+// XFAIL: !has-64-bit-atomics
+// XFAIL: !has-1024-bit-atomics
+
+// void wait(T, memory_order = memory_order::seq_cst) const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "make_test_thread.h"
+#include "test_helper.h"
+#include "test_macros.h"
+
+template <typename T>
+struct TestWait {
+ void operator()() const {
+ {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ assert(a.load() == T(1));
+ a.wait(T(0));
+ std::thread t1 = support::make_test_thread([&]() {
+ a.store(T(3));
+ a.notify_one();
+ });
+ a.wait(T(1));
+ assert(a.load() == T(3));
+ t1.join();
+ ASSERT_NOEXCEPT(a.wait(T(0)));
+
+ assert(a.load() == T(3));
+ a.wait(T(0), std::memory_order_seq_cst);
+ std::thread t2 = support::make_test_thread([&]() {
+ a.store(T(5));
+ a.notify_one();
+ });
+ a.wait(T(3), std::memory_order_seq_cst);
+ assert(a.load() == T(5));
+ t2.join();
+ ASSERT_NOEXCEPT(a.wait(T(0), std::memory_order_seq_cst));
+ }
+
+ // memory_order::acquire
+ {
+ auto store = [](std::atomic_ref<T> const& x, T, T new_val) { x.store(new_val, std::memory_order::release); };
+ auto load = [](std::atomic_ref<T> const& x) {
+ auto result = x.load(std::memory_order::relaxed);
+ x.wait(T(255), std::memory_order::acquire);
+ return result;
+ };
+ test_acquire_release<T>(store, load);
+ }
+
+ // memory_order::seq_cst
+ {
+ auto store = [](std::atomic_ref<T> const& x, T, T new_val) { x.store(new_val); };
+ auto load_no_arg = [](std::atomic_ref<T> const& x) {
+ auto result = x.load(std::memory_order::relaxed);
+ x.wait(T(255));
+ return result;
+ };
+ auto load_with_order = [](std::atomic_ref<T> const& x) {
+ auto result = x.load(std::memory_order::relaxed);
+ x.wait(T(255), std::memory_order::seq_cst);
+ return result;
+ };
+ test_seq_cst<T>(store, load_no_arg);
+ test_seq_cst<T>(store, load_with_order);
+ }
+ }
+};
+
+int main(int, char**) {
+ TestEachAtomicType<TestWait>()();
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/types.compile.pass.cpp b/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/types.compile.pass.cpp
index 1a4e6df..b381236 100644
--- a/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/types.compile.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/types.compile.pass.cpp
@@ -17,8 +17,11 @@
template <class T>
void test() {
+ // LWG 3045. atomic<floating-point> doesn't have value_type or difference_type
+ // https://cplusplus.github.io/LWG/issue3045
static_assert(std::is_same_v<typename std::atomic<T>::value_type, T>);
static_assert(std::is_same_v<typename std::atomic<T>::difference_type, T>);
+
static_assert(std::is_standard_layout_v<std::atomic<T>>);
static_assert(std::is_trivially_destructible_v<std::atomic<T>>);
}
diff --git a/libcxx/test/std/containers/associative/map/map.value_compare/types.pass.cpp b/libcxx/test/std/containers/associative/map/map.value_compare/types.pass.cpp
index 1d60699..52111dd 100644
--- a/libcxx/test/std/containers/associative/map/map.value_compare/types.pass.cpp
+++ b/libcxx/test/std/containers/associative/map/map.value_compare/types.pass.cpp
@@ -10,7 +10,7 @@
// class value_compare
-// REQUIRES: c++98 || c++03 || c++11 || c++14
+// REQUIRES: c++03 || c++11 || c++14
#include <map>
#include <string>
diff --git a/libcxx/test/std/containers/associative/multimap/multimap.value_compare/types.pass.cpp b/libcxx/test/std/containers/associative/multimap/multimap.value_compare/types.pass.cpp
index 6ecaf92..0d0c74f 100644
--- a/libcxx/test/std/containers/associative/multimap/multimap.value_compare/types.pass.cpp
+++ b/libcxx/test/std/containers/associative/multimap/multimap.value_compare/types.pass.cpp
@@ -10,7 +10,7 @@
// class value_compare
-// REQUIRES: c++98 || c++03 || c++11 || c++14
+// REQUIRES: c++03 || c++11 || c++14
#include <map>
#include <string>
diff --git a/libcxx/test/std/experimental/simd/simd.class/simd_copy.pass.cpp b/libcxx/test/std/experimental/simd/simd.class/simd_copy.pass.cpp
new file mode 100644
index 0000000..8fcc811
--- /dev/null
+++ b/libcxx/test/std/experimental/simd/simd.class/simd_copy.pass.cpp
@@ -0,0 +1,173 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14
+
+// FIXME: Fatal error with the following targets (remove XFAIL when fixed):
+// Pass-by-value arguments with alignment greater than register width are not supported.
+// XFAIL: target=powerpc{{.*}}-ibm-aix7.2.5.7
+
+// <experimental/simd>
+//
+// [simd.class]
+// template<class U, class Flags> void copy_from(const U* mem, Flags);
+// template<class U, class Flags> void copy_to(U* mem, Flags) const;
+
+#include "../test_utils.h"
+
+namespace ex = std::experimental::parallelism_v2;
+
+template <class T, class SimdAbi, std::size_t array_size>
+struct ElementAlignedCopyFromHelper {
+ template <class U>
+ void operator()() const {
+ U buffer[array_size];
+ for (size_t i = 0; i < array_size; ++i)
+ buffer[i] = static_cast<U>(i);
+ ex::simd<T, SimdAbi> origin_simd;
+ origin_simd.copy_from(buffer, ex::element_aligned_tag());
+ assert_simd_values_equal(origin_simd, buffer);
+ }
+};
+
+template <class T, class SimdAbi, std::size_t array_size>
+struct VectorAlignedCopyFromHelper {
+ template <class U>
+ void operator()() const {
+ alignas(ex::memory_alignment_v<ex::simd<T, SimdAbi>, U>) U buffer[array_size];
+ for (size_t i = 0; i < array_size; ++i)
+ buffer[i] = static_cast<U>(i);
+ ex::simd<T, SimdAbi> origin_simd;
+ origin_simd.copy_from(buffer, ex::vector_aligned_tag());
+ assert_simd_values_equal(origin_simd, buffer);
+ }
+};
+
+template <class T, class SimdAbi, std::size_t array_size>
+struct OveralignedCopyFromHelper {
+ template <class U>
+ void operator()() const {
+ alignas(bit_ceil(sizeof(U) + 1)) U buffer[array_size];
+ for (size_t i = 0; i < array_size; ++i)
+ buffer[i] = static_cast<U>(i);
+ ex::simd<T, SimdAbi> origin_simd;
+ origin_simd.copy_from(buffer, ex::overaligned_tag<bit_ceil(sizeof(U) + 1)>());
+ assert_simd_values_equal(origin_simd, buffer);
+ }
+};
+
+template <class T, std::size_t>
+struct CheckSimdCopyFrom {
+ template <class SimdAbi>
+ void operator()() {
+ constexpr std::size_t array_size = ex::simd_size_v<T, SimdAbi>;
+
+ types::for_each(simd_test_types(), ElementAlignedCopyFromHelper<T, SimdAbi, array_size>());
+ types::for_each(simd_test_types(), VectorAlignedCopyFromHelper<T, SimdAbi, array_size>());
+ types::for_each(simd_test_types(), OveralignedCopyFromHelper<T, SimdAbi, array_size>());
+ }
+};
+
+template <class T, class SimdAbi, std::size_t array_size>
+struct ElementAlignedCopyToHelper {
+ template <class U>
+ void operator()() const {
+ U buffer[array_size];
+ ex::simd<T, SimdAbi> origin_simd([](T i) { return i; });
+ origin_simd.copy_to(buffer, ex::element_aligned_tag());
+ assert_simd_values_equal(origin_simd, buffer);
+ }
+};
+
+template <class T, class SimdAbi, std::size_t array_size>
+struct VectorAlignedCopyToHelper {
+ template <class U>
+ void operator()() const {
+ alignas(ex::memory_alignment_v<ex::simd<T, SimdAbi>, U>) U buffer[array_size];
+ ex::simd<T, SimdAbi> origin_simd([](T i) { return i; });
+ origin_simd.copy_to(buffer, ex::vector_aligned_tag());
+ assert_simd_values_equal(origin_simd, buffer);
+ }
+};
+
+template <class T, class SimdAbi, std::size_t array_size>
+struct OveralignedCopyToHelper {
+ template <class U>
+ void operator()() const {
+ alignas(bit_ceil(sizeof(U) + 1)) U buffer[array_size];
+ ex::simd<T, SimdAbi> origin_simd([](T i) { return i; });
+ origin_simd.copy_to(buffer, ex::overaligned_tag<bit_ceil(sizeof(U) + 1)>());
+ assert_simd_values_equal(origin_simd, buffer);
+ }
+};
+
+template <class T, std::size_t>
+struct CheckSimdCopyTo {
+ template <class SimdAbi>
+ void operator()() {
+ constexpr std::size_t array_size = ex::simd_size_v<T, SimdAbi>;
+
+ types::for_each(simd_test_types(), ElementAlignedCopyToHelper<T, SimdAbi, array_size>());
+ types::for_each(simd_test_types(), VectorAlignedCopyToHelper<T, SimdAbi, array_size>());
+ types::for_each(simd_test_types(), OveralignedCopyToHelper<T, SimdAbi, array_size>());
+ }
+};
+
+template <class U, class T, class Flags, class SimdAbi = ex::simd_abi::compatible<T>, class = void>
+struct has_copy_from : std::false_type {};
+
+template <class U, class T, class Flags, class SimdAbi>
+struct has_copy_from<U,
+ T,
+ Flags,
+ SimdAbi,
+ std::void_t<decltype(std::declval<ex::simd<T, SimdAbi>>().copy_from(
+ std::declval<const U*>(), std::declval<Flags>()))>> : std::true_type {};
+
+template <class U, class T, class Flags, class SimdAbi = ex::simd_abi::compatible<T>, class = void>
+struct has_copy_to : std::false_type {};
+
+template <class U, class T, class Flags, class SimdAbi>
+struct has_copy_to<
+ U,
+ T,
+ Flags,
+ SimdAbi,
+ std::void_t<decltype(std::declval<ex::simd<T, SimdAbi>>().copy_to(std::declval<U*>(), std::declval<Flags>()))>>
+ : std::true_type {};
+
+template <class T, std::size_t>
+struct CheckSimdCopyTraits {
+ template <class SimdAbi>
+ void operator()() {
+ // These functions shall not participate in overload resolution unless
+ // is_simd_flag_type_v<Flags> is true, and
+ // U is a vectorizable type.
+ static_assert(has_copy_from<int, T, ex::element_aligned_tag, SimdAbi>::value);
+ static_assert(has_copy_to<int, T, ex::element_aligned_tag, SimdAbi>::value);
+
+ // is_simd_flag_type_v<Flags> is false
+ static_assert(!has_copy_from<int, T, T, SimdAbi>::value);
+ static_assert(!has_copy_to<int, T, T, SimdAbi>::value);
+ static_assert(!has_copy_from<int, T, SimdAbi, SimdAbi>::value);
+ static_assert(!has_copy_to<int, T, SimdAbi, SimdAbi>::value);
+
+ // U is not a vectorizable type.
+ static_assert(!has_copy_from<SimdAbi, T, ex::element_aligned_tag, SimdAbi>::value);
+ static_assert(!has_copy_to<SimdAbi, T, ex::element_aligned_tag, SimdAbi>::value);
+ static_assert(!has_copy_from<ex::element_aligned_tag, T, ex::element_aligned_tag, SimdAbi>::value);
+ static_assert(!has_copy_to<ex::element_aligned_tag, T, ex::element_aligned_tag, SimdAbi>::value);
+ }
+};
+
+int main(int, char**) {
+ test_all_simd_abi<CheckSimdCopyFrom>();
+ test_all_simd_abi<CheckSimdCopyTo>();
+ test_all_simd_abi<CheckSimdCopyTraits>();
+ return 0;
+}
diff --git a/libcxx/test/std/experimental/simd/simd.mask.class/simd_mask_copy.pass.cpp b/libcxx/test/std/experimental/simd/simd.mask.class/simd_mask_copy.pass.cpp
new file mode 100644
index 0000000..0c3b4c9
--- /dev/null
+++ b/libcxx/test/std/experimental/simd/simd.mask.class/simd_mask_copy.pass.cpp
@@ -0,0 +1,127 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14
+
+// <experimental/simd>
+//
+// [simd.class]
+// template<class Flags> void copy_from(const value_type* mem, Flags);
+// template<class Flags> void copy_to(value_type* mem, Flags);
+
+#include "../test_utils.h"
+
+namespace ex = std::experimental::parallelism_v2;
+
+template <class T, std::size_t>
+struct CheckSimdMaskCopyFrom {
+ template <class SimdAbi>
+ void operator()() {
+ constexpr std::size_t array_size = ex::simd_size_v<T, SimdAbi>;
+
+ // element aligned tag
+ constexpr std::size_t element_alignas_size = alignof(bool);
+ alignas(element_alignas_size) bool element_buffer[array_size];
+ for (size_t i = 0; i < array_size; ++i)
+ element_buffer[i] = static_cast<bool>(i % 2);
+ ex::simd_mask<T, SimdAbi> element_mask;
+ element_mask.copy_from(element_buffer, ex::element_aligned_tag());
+ assert_simd_mask_values_equal(element_mask, element_buffer);
+
+ // vector aligned tag
+ constexpr std::size_t vector_alignas_size = ex::memory_alignment_v<ex::simd_mask<T, SimdAbi>>;
+ alignas(vector_alignas_size) bool vector_buffer[array_size];
+ for (size_t i = 0; i < array_size; ++i)
+ vector_buffer[i] = static_cast<bool>(i % 2);
+ ex::simd_mask<T, SimdAbi> vector_mask;
+ vector_mask.copy_from(vector_buffer, ex::vector_aligned_tag());
+ assert_simd_mask_values_equal(vector_mask, vector_buffer);
+
+ // overaligned tag
+ constexpr std::size_t over_alignas_size = bit_ceil(sizeof(bool) + 1);
+ alignas(over_alignas_size) bool overaligned_buffer[array_size];
+ for (size_t i = 0; i < array_size; ++i)
+ overaligned_buffer[i] = static_cast<bool>(i % 2);
+ ex::simd_mask<T, SimdAbi> overaligned_mask;
+ overaligned_mask.copy_from(overaligned_buffer, ex::overaligned_tag<over_alignas_size>());
+ assert_simd_mask_values_equal(overaligned_mask, overaligned_buffer);
+ }
+};
+
+template <class T, std::size_t>
+struct CheckSimdMaskCopyTo {
+ template <class SimdAbi>
+ void operator()() {
+ constexpr std::size_t array_size = ex::simd_size_v<T, SimdAbi>;
+
+ // element aligned tag
+ constexpr std::size_t element_alignas_size = alignof(bool);
+ alignas(element_alignas_size) bool element_buffer[array_size];
+ ex::simd_mask<T, SimdAbi> element_mask(true);
+ element_mask.copy_to(element_buffer, ex::element_aligned_tag());
+ assert_simd_mask_values_equal(element_mask, element_buffer);
+
+ // vector aligned tag
+ constexpr std::size_t vector_alignas_size = ex::memory_alignment_v<ex::simd_mask<T, SimdAbi>>;
+ alignas(vector_alignas_size) bool vector_buffer[array_size];
+ ex::simd_mask<T, SimdAbi> vector_mask(false);
+ vector_mask.copy_to(vector_buffer, ex::vector_aligned_tag());
+ assert_simd_mask_values_equal(vector_mask, vector_buffer);
+
+ // overaligned tag
+ constexpr std::size_t over_alignas_size = bit_ceil(sizeof(bool) + 1);
+ alignas(over_alignas_size) bool overaligned_buffer[array_size];
+ ex::simd_mask<T, SimdAbi> overaligned_mask(true);
+ overaligned_mask.copy_to(overaligned_buffer, ex::overaligned_tag<over_alignas_size>());
+ assert_simd_mask_values_equal(overaligned_mask, overaligned_buffer);
+ }
+};
+
+template <class T, class Flags, class SimdAbi = ex::simd_abi::compatible<T>, class = void>
+struct has_copy_from : std::false_type {};
+
+template <class T, class Flags, class SimdAbi>
+struct has_copy_from<T,
+ Flags,
+ SimdAbi,
+ std::void_t<decltype(std::declval<ex::simd_mask<T, SimdAbi>>().copy_from(
+ std::declval<const bool*>(), std::declval<Flags>()))>> : std::true_type {};
+
+template <class T, class Flags, class SimdAbi = ex::simd_abi::compatible<T>, class = void>
+struct has_copy_to : std::false_type {};
+
+template <class T, class Flags, class SimdAbi>
+struct has_copy_to<T,
+ Flags,
+ SimdAbi,
+ std::void_t<decltype(std::declval<ex::simd_mask<T, SimdAbi>>().copy_to(
+ std::declval<bool*>(), std::declval<Flags>()))>> : std::true_type {};
+
+template <class T, std::size_t>
+struct CheckSimdMaskCopyTraits {
+ template <class SimdAbi>
+ void operator()() {
+ // These functions shall not participate in overload resolution unless
+ // is_simd_flag_type_v<Flags> is true
+ static_assert(has_copy_from<T, ex::element_aligned_tag, SimdAbi>::value);
+ static_assert(has_copy_to<T, ex::element_aligned_tag, SimdAbi>::value);
+
+ // is_simd_flag_type_v<Flags> is false
+ static_assert(!has_copy_from<T, T, SimdAbi>::value);
+ static_assert(!has_copy_to<T, T, SimdAbi>::value);
+ static_assert(!has_copy_from<T, SimdAbi, SimdAbi>::value);
+ static_assert(!has_copy_to<T, SimdAbi, SimdAbi>::value);
+ }
+};
+
+int main(int, char**) {
+ test_all_simd_abi<CheckSimdMaskCopyFrom>();
+ test_all_simd_abi<CheckSimdMaskCopyTo>();
+ test_all_simd_abi<CheckSimdMaskCopyTraits>();
+ return 0;
+}
diff --git a/libcxx/test/std/iterators/predef.iterators/counted.iterator/implicit_ctad.pass.cpp b/libcxx/test/std/iterators/predef.iterators/counted.iterator/implicit_ctad.pass.cpp
index 2786dfb..5b4853a 100644
--- a/libcxx/test/std/iterators/predef.iterators/counted.iterator/implicit_ctad.pass.cpp
+++ b/libcxx/test/std/iterators/predef.iterators/counted.iterator/implicit_ctad.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// UNSUPPORTED: c++98, c++03, c++11, c++14, c++17
+// UNSUPPORTED: c++03, c++11, c++14, c++17
// counted_iterator
diff --git a/libcxx/test/std/iterators/predef.iterators/insert.iterators/back.insert.iterator/implicit_ctad.pass.cpp b/libcxx/test/std/iterators/predef.iterators/insert.iterators/back.insert.iterator/implicit_ctad.pass.cpp
index 10729e0..3c2e6af9 100644
--- a/libcxx/test/std/iterators/predef.iterators/insert.iterators/back.insert.iterator/implicit_ctad.pass.cpp
+++ b/libcxx/test/std/iterators/predef.iterators/insert.iterators/back.insert.iterator/implicit_ctad.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// UNSUPPORTED: c++98, c++03, c++11, c++14
+// UNSUPPORTED: c++03, c++11, c++14
// <iterator>
diff --git a/libcxx/test/std/iterators/predef.iterators/insert.iterators/front.insert.iterator/implicit_ctad.pass.cpp b/libcxx/test/std/iterators/predef.iterators/insert.iterators/front.insert.iterator/implicit_ctad.pass.cpp
index f91d472..f9b086a 100644
--- a/libcxx/test/std/iterators/predef.iterators/insert.iterators/front.insert.iterator/implicit_ctad.pass.cpp
+++ b/libcxx/test/std/iterators/predef.iterators/insert.iterators/front.insert.iterator/implicit_ctad.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// UNSUPPORTED: c++98, c++03, c++11, c++14
+// UNSUPPORTED: c++03, c++11, c++14
// <iterator>
diff --git a/libcxx/test/std/iterators/predef.iterators/move.iterators/move.iterator/implicit_ctad.pass.cpp b/libcxx/test/std/iterators/predef.iterators/move.iterators/move.iterator/implicit_ctad.pass.cpp
index e574446..b84a070 100644
--- a/libcxx/test/std/iterators/predef.iterators/move.iterators/move.iterator/implicit_ctad.pass.cpp
+++ b/libcxx/test/std/iterators/predef.iterators/move.iterators/move.iterator/implicit_ctad.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// UNSUPPORTED: c++98, c++03, c++11, c++14
+// UNSUPPORTED: c++03, c++11, c++14
// <iterator>
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array14.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array14.pass.cpp
index 21663cd..0241e7c 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array14.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array14.pass.cpp
@@ -8,11 +8,11 @@
// test sized operator delete[] replacement.
+// TODO(mordante) fix this test after updating clang in Docker
+// UNSUPPORTED: clang-15, clang-16, clang-17, clang-18, clang-19
// UNSUPPORTED: sanitizer-new-delete, c++03, c++11
-
-// NOTE: Clang does not enable sized-deallocation in C++14 and beyond by
-// default. It is only enabled when -fsized-deallocation is given.
-// XFAIL: clang, apple-clang
+// XFAIL: apple-clang
+// XFAIL: using-built-library-before-llvm-11
#include <new>
#include <cstddef>
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete14.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete14.pass.cpp
index a8701ce..2ab6916 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete14.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete14.pass.cpp
@@ -8,11 +8,11 @@
// test sized operator delete replacement.
+// TODO(mordante) fix this test after updating clang in Docker
+// UNSUPPORTED: clang-15, clang-16, clang-17, clang-18, clang-19
// UNSUPPORTED: sanitizer-new-delete, c++03, c++11
-
-// NOTE: Clang does not enable sized-deallocation in C++14 and beyond by
-// default. It is only enabled when -fsized-deallocation is given.
-// XFAIL: clang, apple-clang
+// XFAIL: apple-clang
+// XFAIL: using-built-library-before-llvm-11
#include <new>
#include <cstddef>
diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_double.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_double.pass.cpp
index c802ab7..fbd1c7c 100644
--- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_double.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_double.pass.cpp
@@ -6,6 +6,11 @@
//
//===----------------------------------------------------------------------===//
+// The fix for LWG2381 (https://github.com/llvm/llvm-project/pull/77948) changed
+// the behavior of FP parsing, while Apple back-deployment targets remain broken
+// because they run against an older dylib without the fix.
+// UNSUPPORTED: using-built-library-before-llvm-19
+
// <locale>
// class num_get<charT, InputIterator>
@@ -116,9 +121,9 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(v == INFINITY);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
}
{
const char str[] = "INF";
@@ -128,9 +133,9 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(v == INFINITY);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
}
{
const char str[] = "-inf";
@@ -140,9 +145,9 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(v == -INFINITY);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
}
{
const char str[] = "-INF";
@@ -152,9 +157,9 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(v == -INFINITY);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
}
{
const char str[] = "nan";
@@ -164,9 +169,9 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(std::isnan(v));
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
}
{
const char str[] = "NAN";
@@ -176,9 +181,129 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(std::isnan(v));
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
+ }
+ {
+ const char str[] = "p00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
+ }
+ {
+ const char str[] = "P00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
+ }
+ {
+ const char str[] = "+p00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
+ }
+ {
+ const char str[] = "+P00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
+ }
+ {
+ const char str[] = "-p00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
+ }
+ {
+ const char str[] = "-P00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
+ }
+ {
+ const char str[] = "e00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
+ }
+ {
+ const char str[] = "E00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
+ }
+ {
+ const char str[] = "+e00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
+ }
+ {
+ const char str[] = "+E00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
+ }
+ {
+ const char str[] = "-e00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
+ }
+ {
+ const char str[] = "-E00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
}
{
v = -1;
diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float.pass.cpp
index 79c8480..b5ac7d8 100644
--- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float.pass.cpp
@@ -6,6 +6,11 @@
//
//===----------------------------------------------------------------------===//
+// The fix for LWG2381 (https://github.com/llvm/llvm-project/pull/77948) changed
+// the behavior of FP parsing, while Apple back-deployment targets remain broken
+// because they run against an older dylib without the fix.
+// UNSUPPORTED: using-built-library-before-llvm-19
+
// <locale>
// class num_get<charT, InputIterator>
@@ -105,9 +110,9 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(v == INFINITY);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
}
{
const char str[] = "INF";
@@ -117,9 +122,9 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(v == INFINITY);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
}
{
const char str[] = "-inf";
@@ -129,9 +134,9 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(v == -INFINITY);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
}
{
const char str[] = "-INF";
@@ -141,9 +146,9 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(v == -INFINITY);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
}
{
const char str[] = "nan";
@@ -153,9 +158,9 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(std::isnan(v));
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
}
{
const char str[] = "NAN";
@@ -165,9 +170,129 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(std::isnan(v));
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
+ }
+ {
+ const char str[] = "p00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
+ }
+ {
+ const char str[] = "P00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
+ }
+ {
+ const char str[] = "+p00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
+ }
+ {
+ const char str[] = "+P00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
+ }
+ {
+ const char str[] = "-p00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
+ }
+ {
+ const char str[] = "-P00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
+ }
+ {
+ const char str[] = "e00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
+ }
+ {
+ const char str[] = "E00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
+ }
+ {
+ const char str[] = "+e00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
+ }
+ {
+ const char str[] = "+E00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
+ }
+ {
+ const char str[] = "-e00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
+ }
+ {
+ const char str[] = "-E00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
}
{
v = -1;
diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long_double.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long_double.pass.cpp
index e2b2aeaf..9617899 100644
--- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long_double.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long_double.pass.cpp
@@ -6,6 +6,11 @@
//
//===----------------------------------------------------------------------===//
+// The fix for LWG2381 (https://github.com/llvm/llvm-project/pull/77948) changed
+// the behavior of FP parsing, while Apple back-deployment targets remain broken
+// because they run against an older dylib without the fix.
+// UNSUPPORTED: using-built-library-before-llvm-19
+
// <locale>
// class num_get<charT, InputIterator>
@@ -105,9 +110,9 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(v == INFINITY);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
}
{
const char str[] = "INF";
@@ -117,9 +122,9 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(v == INFINITY);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
}
{
const char str[] = "-inf";
@@ -129,9 +134,9 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(v == -INFINITY);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
}
{
const char str[] = "-INF";
@@ -141,9 +146,9 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(v == -INFINITY);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
}
{
const char str[] = "nan";
@@ -153,9 +158,9 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(std::isnan(v));
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
}
{
const char str[] = "NAN";
@@ -165,9 +170,129 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(std::isnan(v));
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
+ }
+ {
+ const char str[] = "p00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
+ }
+ {
+ const char str[] = "P00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
+ }
+ {
+ const char str[] = "+p00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
+ }
+ {
+ const char str[] = "+P00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
+ }
+ {
+ const char str[] = "-p00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
+ }
+ {
+ const char str[] = "-P00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
+ }
+ {
+ const char str[] = "e00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
+ }
+ {
+ const char str[] = "E00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
+ }
+ {
+ const char str[] = "+e00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
+ }
+ {
+ const char str[] = "+E00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
+ }
+ {
+ const char str[] = "-e00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
+ }
+ {
+ const char str[] = "-E00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
}
{
const char str[] = "1.189731495357231765021264e+49321";
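For context on the assertion changes above: after the LWG2381 fix, num_get treats "inf"/"nan" spellings and bare exponent markers as invalid numeric fields, so extraction consumes at most a leading sign, sets failbit, and stores 0. Below is a minimal standalone sketch (not part of the patch; the use of std::istringstream and the variable names are illustrative, assuming a libc++ built with the LLVM 19 fix) showing the same observable behavior:

#include <cassert>
#include <sstream>

int main() {
  std::istringstream in("inf");
  long double v = 0.0L;
  in >> v; // goes through num_get<char>::get
  // Post-LWG2381, "inf" is not a valid numeric field: extraction fails,
  // failbit is set, and the stored value is 0.
  assert(in.fail());
  assert(v == 0.0L);
  return 0;
}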
diff --git a/libcxx/test/std/algorithms/numeric.ops/reduce/pstl.reduce.pass.cpp b/libcxx/test/std/numerics/numeric.ops/reduce/pstl.reduce.pass.cpp
index b083c4f..f5748d7 100644
--- a/libcxx/test/std/algorithms/numeric.ops/reduce/pstl.reduce.pass.cpp
+++ b/libcxx/test/std/numerics/numeric.ops/reduce/pstl.reduce.pass.cpp
@@ -10,7 +10,7 @@
// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-// <algorithm>
+// <numeric>
// template<class ExecutionPolicy, class ForwardIterator>
// typename iterator_traits<ForwardIterator>::value_type
diff --git a/libcxx/test/std/algorithms/numeric.ops/transform.reduce/pstl.transform_reduce.binary.pass.cpp b/libcxx/test/std/numerics/numeric.ops/transform.reduce/pstl.transform_reduce.binary.pass.cpp
index 18b56f2..6d8bb47 100644
--- a/libcxx/test/std/algorithms/numeric.ops/transform.reduce/pstl.transform_reduce.binary.pass.cpp
+++ b/libcxx/test/std/numerics/numeric.ops/transform.reduce/pstl.transform_reduce.binary.pass.cpp
@@ -10,7 +10,7 @@
// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-// <algorithm>
+// <numeric>
// template<class ExecutionPolicy,
// class ForwardIterator1, class ForwardIterator2, class T>
diff --git a/libcxx/test/std/algorithms/numeric.ops/transform.reduce/pstl.transform_reduce.unary.pass.cpp b/libcxx/test/std/numerics/numeric.ops/transform.reduce/pstl.transform_reduce.unary.pass.cpp
index a32a4f8..4cea3d4 100644
--- a/libcxx/test/std/algorithms/numeric.ops/transform.reduce/pstl.transform_reduce.unary.pass.cpp
+++ b/libcxx/test/std/numerics/numeric.ops/transform.reduce/pstl.transform_reduce.unary.pass.cpp
@@ -10,7 +10,7 @@
// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-// <algorithm>
+// <numeric>
// template<class ExecutionPolicy,
// class ForwardIterator, class T,
diff --git a/libcxx/test/std/strings/string.view/string.view.deduct/implicit.pass.cpp b/libcxx/test/std/strings/string.view/string.view.deduct/implicit.pass.cpp
index c76c4a0..36584f7 100644
--- a/libcxx/test/std/strings/string.view/string.view.deduct/implicit.pass.cpp
+++ b/libcxx/test/std/strings/string.view/string.view.deduct/implicit.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// UNSUPPORTED: c++98, c++03, c++11, c++14
+// UNSUPPORTED: c++03, c++11, c++14
// <string_view>
diff --git a/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_for.pass.cpp b/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_for.pass.cpp
index 4215020..6a054f7 100644
--- a/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_for.pass.cpp
+++ b/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_for.pass.cpp
@@ -5,9 +5,8 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
-// UNSUPPORTED: no-threads
-// ALLOW_RETRIES: 2
+
+// UNSUPPORTED: no-threads, c++03
// <condition_variable>
@@ -19,77 +18,92 @@
// const chrono::duration<Rep, Period>& rel_time);
#include <condition_variable>
+#include <atomic>
+#include <cassert>
+#include <chrono>
#include <mutex>
#include <thread>
-#include <chrono>
-#include <cassert>
#include "make_test_thread.h"
#include "test_macros.h"
-std::condition_variable cv;
-std::mutex mut;
-
-int test1 = 0;
-int test2 = 0;
-
-bool expect_timeout = false;
-
-void f()
-{
- typedef std::chrono::system_clock Clock;
- typedef std::chrono::milliseconds milliseconds;
- std::unique_lock<std::mutex> lk(mut);
- assert(test2 == 0);
- test1 = 1;
- cv.notify_one();
- Clock::time_point t0 = Clock::now();
- Clock::time_point wait_end = t0 + milliseconds(250);
- Clock::duration d;
- do {
- d = wait_end - Clock::now();
- if (d <= milliseconds(0)) break;
- } while (test2 == 0 && cv.wait_for(lk, d) == std::cv_status::no_timeout);
- Clock::time_point t1 = Clock::now();
- if (!expect_timeout)
- {
- assert(t1 - t0 < milliseconds(250));
- assert(test2 != 0);
- }
- else
- {
- assert(t1 - t0 - milliseconds(250) < milliseconds(50));
- assert(test2 == 0);
- }
+template <class Function>
+std::chrono::microseconds measure(Function f) {
+ std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now();
+ f();
+ std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now();
+ return std::chrono::duration_cast<std::chrono::microseconds>(end - start);
}
-int main(int, char**)
-{
- {
- std::unique_lock<std::mutex> lk(mut);
- std::thread t = support::make_test_thread(f);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- test2 = 1;
- lk.unlock();
- cv.notify_one();
- t.join();
- }
- test1 = 0;
- test2 = 0;
- expect_timeout = true;
- {
- std::unique_lock<std::mutex> lk(mut);
- std::thread t = support::make_test_thread(f);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- lk.unlock();
- t.join();
- }
+int main(int, char**) {
+ // Test unblocking via a call to notify_one() in another thread.
+ //
+ // To test this, we set a very long timeout in wait_for() and we wait
+ // again in case we get awoken spuriously. Note that it can actually
+ // happen that we get awoken spuriously and fail to recognize it
+ // (making this test useless), but the likelihood should be small.
+ {
+ std::atomic<bool> ready(false);
+ std::atomic<bool> likely_spurious(true);
+ auto timeout = std::chrono::seconds(3600);
+ std::condition_variable cv;
+ std::mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ std::unique_lock<std::mutex> lock(mutex);
+ auto elapsed = measure([&] {
+ ready = true;
+ do {
+ std::cv_status result = cv.wait_for(lock, timeout);
+ assert(result == std::cv_status::no_timeout);
+ } while (likely_spurious);
+ });
+
+ // This can technically fail if we have many spurious awakenings, but in practice the
+ // tolerance is so high that it shouldn't be a problem.
+ assert(elapsed < timeout);
+ });
+
+ std::thread t2 = support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
+
+ // Acquire the same mutex as t1. This blocks the condition variable inside its wait call
+ // so we can notify it while it is waiting.
+ std::unique_lock<std::mutex> lock(mutex);
+ cv.notify_one();
+ likely_spurious = false;
+ lock.unlock();
+ });
+
+ t2.join();
+ t1.join();
+ }
+
+ // Test unblocking via a timeout.
+ //
+ // To test this, we create a thread that waits on a condition variable
+ // with a certain timeout, and we never awaken it. To guard against
+ // spurious wakeups, we wait again whenever we are awoken for a reason
+ // other than a timeout.
+ {
+ auto timeout = std::chrono::milliseconds(250);
+ std::condition_variable cv;
+ std::mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ std::unique_lock<std::mutex> lock(mutex);
+ std::cv_status result;
+ do {
+ auto elapsed = measure([&] { result = cv.wait_for(lock, timeout); });
+ if (result == std::cv_status::timeout)
+ assert(elapsed >= timeout);
+ } while (result != std::cv_status::timeout);
+ });
+
+ t1.join();
+ }
return 0;
}
diff --git a/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_for_pred.pass.cpp b/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_for_pred.pass.cpp
index 872bcb6d..76fc739 100644
--- a/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_for_pred.pass.cpp
+++ b/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_for_pred.pass.cpp
@@ -5,9 +5,8 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
-// UNSUPPORTED: no-threads
-// ALLOW_RETRIES: 2
+
+// UNSUPPORTED: no-threads, c++03
// <condition_variable>
@@ -20,82 +19,141 @@
// Predicate pred);
#include <condition_variable>
+#include <atomic>
+#include <cassert>
+#include <chrono>
#include <mutex>
#include <thread>
-#include <chrono>
-#include <cassert>
#include "make_test_thread.h"
#include "test_macros.h"
-class Pred
-{
- int& i_;
-public:
- explicit Pred(int& i) : i_(i) {}
-
- bool operator()() {return i_ != 0;}
-};
-
-std::condition_variable cv;
-std::mutex mut;
-
-int test1 = 0;
-int test2 = 0;
-
-int runs = 0;
-
-void f()
-{
- typedef std::chrono::system_clock Clock;
- typedef std::chrono::milliseconds milliseconds;
- std::unique_lock<std::mutex> lk(mut);
- assert(test2 == 0);
- test1 = 1;
- cv.notify_one();
- Clock::time_point t0 = Clock::now();
- bool r = cv.wait_for(lk, milliseconds(250), Pred(test2));
- ((void)r); // Prevent unused warning
- Clock::time_point t1 = Clock::now();
- if (runs == 0)
- {
- assert(t1 - t0 < milliseconds(250));
- assert(test2 != 0);
- }
- else
- {
- assert(t1 - t0 - milliseconds(250) < milliseconds(50));
- assert(test2 == 0);
- }
- ++runs;
+template <class Function>
+std::chrono::microseconds measure(Function f) {
+ std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now();
+ f();
+ std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now();
+ return std::chrono::duration_cast<std::chrono::microseconds>(end - start);
}
-int main(int, char**)
-{
- {
- std::unique_lock<std::mutex>lk(mut);
- std::thread t = support::make_test_thread(f);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- test2 = 1;
- lk.unlock();
- cv.notify_one();
- t.join();
- }
- test1 = 0;
- test2 = 0;
- {
- std::unique_lock<std::mutex>lk(mut);
- std::thread t = support::make_test_thread(f);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- lk.unlock();
- t.join();
- }
+int main(int, char**) {
+ // Test unblocking via a call to notify_one() in another thread.
+ //
+ // To test this, we set a very long timeout in wait_for() and we try to minimize
+ // the likelihood that we got awoken by a spurious wakeup by updating the
+ // likely_spurious flag only immediately before we perform the notification.
+ {
+ std::atomic<bool> ready(false);
+ std::atomic<bool> likely_spurious(true);
+ auto timeout = std::chrono::seconds(3600);
+ std::condition_variable cv;
+ std::mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ std::unique_lock<std::mutex> lock(mutex);
+ auto elapsed = measure([&] {
+ ready = true;
+ bool result = cv.wait_for(lock, timeout, [&] { return !likely_spurious; });
+ assert(result); // return value should be true since we didn't time out
+ });
+ assert(elapsed < timeout);
+ });
+
+ std::thread t2 = support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
+
+ // Acquire the same mutex as t1. This ensures that the condition variable has started
+ // waiting (and hence released that mutex).
+ std::unique_lock<std::mutex> lock(mutex);
+
+ likely_spurious = false;
+ lock.unlock();
+ cv.notify_one();
+ });
+
+ t2.join();
+ t1.join();
+ }
+
+ // Test unblocking via a timeout.
+ //
+ // To test this, we create a thread that waits on a condition variable with a certain
+ // timeout, and we never awaken it. The "stop waiting" predicate always returns false,
+ // which means that we can't get out of the wait via a spurious wakeup.
+ {
+ auto timeout = std::chrono::milliseconds(250);
+ std::condition_variable cv;
+ std::mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ std::unique_lock<std::mutex> lock(mutex);
+ auto elapsed = measure([&] {
+ bool result = cv.wait_for(lock, timeout, [] { return false; }); // never stop waiting (until timeout)
+ assert(!result); // return value should be false since the predicate returns false after the timeout
+ });
+ assert(elapsed >= timeout);
+ });
+
+ t1.join();
+ }
+
+ // Test unblocking via a spurious wakeup.
+ //
+ // To test this, we set a fairly long timeout in wait_for() and we basically never
+ // wake up the condition variable. This way, we are hoping to get out of the wait
+ // via a spurious wakeup.
+ //
+ // However, since spurious wakeups are not required to even happen, this test is
+ // only trying to trigger that code path, but not actually asserting that it is
+ // taken. In particular, we do need to eventually ensure we get out of the wait
+ // by standard means, so we actually wake up the thread at the end.
+ {
+ std::atomic<bool> ready(false);
+ std::atomic<bool> awoken(false);
+ auto timeout = std::chrono::seconds(3600);
+ std::condition_variable cv;
+ std::mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ std::unique_lock<std::mutex> lock(mutex);
+ auto elapsed = measure([&] {
+ ready = true;
+ bool result = cv.wait_for(lock, timeout, [&] { return true; });
+ awoken = true;
+ assert(result); // return value should be true since we didn't time out
+ });
+ assert(elapsed < timeout); // can technically fail if t2 never executes and we timeout, but very unlikely
+ });
+
+ std::thread t2 = support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
+
+ // Acquire the same mutex as t1. This ensures that the condition variable has started
+ // waiting (and hence released that mutex).
+ std::unique_lock<std::mutex> lock(mutex);
+ lock.unlock();
+
+ // Give some time for t1 to be awoken spuriously so that code path is used.
+ std::this_thread::sleep_for(std::chrono::seconds(1));
+
+ // We would want to assert that the thread has been awoken after this time,
+ // however nothing guarantees us that it ever gets spuriously awoken, so
+ // we can't really check anything. This is still left here as documentation.
+ bool woke = awoken.load();
+ assert(woke || !woke);
+
+ // Whatever happened, actually awaken the condition variable to ensure the test
+ // doesn't keep running until the timeout.
+ cv.notify_one();
+ });
+
+ t2.join();
+ t1.join();
+ }
return 0;
}
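The rewritten condition_variable tests above all rely on the same ready-flag handshake: the waiting thread publishes `ready` while still holding the mutex, and the notifying thread acquires that same mutex before signalling, which guarantees the wait has actually started and the notification cannot be lost. Below is a minimal standalone sketch of that pattern (not part of the patch; thread and variable names are illustrative):

#include <atomic>
#include <cassert>
#include <condition_variable>
#include <mutex>
#include <thread>

int main() {
  std::atomic<bool> ready(false);
  std::atomic<bool> done(false);
  std::condition_variable cv;
  std::mutex mutex;

  std::thread waiter([&] {
    std::unique_lock<std::mutex> lock(mutex);
    ready = true; // published while holding the mutex, just before waiting
    cv.wait(lock, [&] { return done.load(); });
    assert(done);
  });

  std::thread notifier([&] {
    while (!ready) {
      // spin until the waiter owns the mutex and is about to wait
    }
    // This lock can only be acquired once the waiter has released the mutex
    // inside cv.wait(), so the wait has provably started.
    std::unique_lock<std::mutex> lock(mutex);
    done = true;
    lock.unlock();
    cv.notify_one();
  });

  notifier.join();
  waiter.join();
  return 0;
}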
diff --git a/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_pred.pass.cpp b/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_pred.pass.cpp
index 15feba5..5ce5bcc 100644
--- a/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_pred.pass.cpp
+++ b/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_pred.pass.cpp
@@ -5,9 +5,8 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
-// UNSUPPORTED: no-threads
-// ALLOW_RETRIES: 2
+
+// UNSUPPORTED: no-threads, c++03
// <condition_variable>
@@ -17,51 +16,98 @@
// void wait(unique_lock<mutex>& lock, Predicate pred);
#include <condition_variable>
+#include <atomic>
+#include <cassert>
#include <mutex>
#include <thread>
-#include <functional>
-#include <cassert>
#include "make_test_thread.h"
#include "test_macros.h"
-std::condition_variable cv;
-std::mutex mut;
-
-int test1 = 0;
-int test2 = 0;
-
-class Pred
-{
- int& i_;
-public:
- explicit Pred(int& i) : i_(i) {}
-
- bool operator()() {return i_ != 0;}
-};
-
-void f()
-{
- std::unique_lock<std::mutex> lk(mut);
- assert(test2 == 0);
- test1 = 1;
- cv.notify_one();
- cv.wait(lk, Pred(test2));
- assert(test2 != 0);
-}
+int main(int, char**) {
+ // Test unblocking via a call to notify_one() in another thread.
+ //
+ // To test this, we try to minimize the likelihood that we got awoken by a
+ // spurious wakeup by updating the likely_spurious flag only immediately
+ // before we perform the notification.
+ {
+ std::atomic<bool> ready(false);
+ std::atomic<bool> likely_spurious(true);
+ std::condition_variable cv;
+ std::mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ std::unique_lock<std::mutex> lock(mutex);
+ ready = true;
+ cv.wait(lock, [&] { return !likely_spurious; });
+ });
+
+ std::thread t2 = support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
+
+ // Acquire the same mutex as t1. This ensures that the condition variable has started
+ // waiting (and hence released that mutex).
+ std::unique_lock<std::mutex> lock(mutex);
+
+ likely_spurious = false;
+ lock.unlock();
+ cv.notify_one();
+ });
+
+ t2.join();
+ t1.join();
+ }
+
+ // Test unblocking via a spurious wakeup.
+ //
+ // To test this, we basically never wake up the condition variable. This way, we
+ // are hoping to get out of the wait via a spurious wakeup.
+ //
+ // However, since spurious wakeups are not required to even happen, this test is
+ // only trying to trigger that code path, but not actually asserting that it is
+ // taken. In particular, we do need to eventually ensure we get out of the wait
+ // by standard means, so we actually wake up the thread at the end.
+ {
+ std::atomic<bool> ready(false);
+ std::atomic<bool> awoken(false);
+ std::condition_variable cv;
+ std::mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ std::unique_lock<std::mutex> lock(mutex);
+ ready = true;
+ cv.wait(lock, [&] { return true; });
+ awoken = true;
+ });
+
+ std::thread t2 = support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
+
+ // Acquire the same mutex as t1. This ensures that the condition variable has started
+ // waiting (and hence released that mutex).
+ std::unique_lock<std::mutex> lock(mutex);
+ lock.unlock();
+
+ // Give some time for t1 to be awoken spuriously so that code path is used.
+ std::this_thread::sleep_for(std::chrono::seconds(1));
+
+ // We would want to assert that the thread has been awoken after this time,
+ // however nothing guarantees us that it ever gets spuriously awoken, so
+ // we can't really check anything. This is still left here as documentation.
+ bool woke = awoken.load();
+ assert(woke || !woke);
+
+ // Whatever happened, actually awaken the condition variable to ensure the test finishes.
+ cv.notify_one();
+ });
-int main(int, char**)
-{
- std::unique_lock<std::mutex>lk(mut);
- std::thread t = support::make_test_thread(f);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- test2 = 1;
- lk.unlock();
- cv.notify_one();
- t.join();
+ t2.join();
+ t1.join();
+ }
return 0;
}
diff --git a/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_until.pass.cpp b/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_until.pass.cpp
index 03205e6..6f3a5a0 100644
--- a/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_until.pass.cpp
+++ b/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_until.pass.cpp
@@ -5,9 +5,8 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
-// UNSUPPORTED: no-threads
-// ALLOW_RETRIES: 2
+
+// UNSUPPORTED: no-threads, c++03
// <condition_variable>
@@ -19,100 +18,100 @@
// const chrono::time_point<Clock, Duration>& abs_time);
#include <condition_variable>
+#include <atomic>
+#include <cassert>
+#include <chrono>
#include <mutex>
#include <thread>
-#include <chrono>
-#include <cassert>
#include "make_test_thread.h"
#include "test_macros.h"
-struct TestClock
-{
- typedef std::chrono::milliseconds duration;
- typedef duration::rep rep;
- typedef duration::period period;
- typedef std::chrono::time_point<TestClock> time_point;
- static const bool is_steady = true;
-
- static time_point now()
- {
- using namespace std::chrono;
- return time_point(duration_cast<duration>(
- steady_clock::now().time_since_epoch()
- ));
- }
+struct TestClock {
+ typedef std::chrono::milliseconds duration;
+ typedef duration::rep rep;
+ typedef duration::period period;
+ typedef std::chrono::time_point<TestClock> time_point;
+ static const bool is_steady = true;
+
+ static time_point now() {
+ using namespace std::chrono;
+ return time_point(duration_cast<duration>(steady_clock::now().time_since_epoch()));
+ }
};
-std::condition_variable cv;
-std::mutex mut;
-
-int test1 = 0;
-int test2 = 0;
-
-int runs = 0;
-
-template <typename Clock>
-void f()
-{
- std::unique_lock<std::mutex> lk(mut);
- assert(test2 == 0);
- test1 = 1;
- cv.notify_one();
- typename Clock::time_point t0 = Clock::now();
- typename Clock::time_point t = t0 + std::chrono::milliseconds(250);
- while (test2 == 0 && cv.wait_until(lk, t) == std::cv_status::no_timeout)
- ;
- typename Clock::time_point t1 = Clock::now();
- if (runs == 0)
- {
- assert(t1 - t0 < std::chrono::milliseconds(250));
- assert(test2 != 0);
- }
- else
- {
- assert(t1 - t0 - std::chrono::milliseconds(250) < std::chrono::milliseconds(50));
- assert(test2 == 0);
- }
- ++runs;
-}
+template <class Clock>
+void test() {
+ // Test unblocking via a call to notify_one() in another thread.
+ //
+ // To test this, we set a very long timeout in wait_until() and we wait
+ // again in case we get awoken spuriously. Note that it can actually
+ // happen that we get awoken spuriously and fail to recognize it
+ // (making this test useless), but the likelihood should be small.
+ {
+ std::atomic<bool> ready(false);
+ std::atomic<bool> likely_spurious(true);
+ auto timeout = Clock::now() + std::chrono::seconds(3600);
+ std::condition_variable cv;
+ std::mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ std::unique_lock<std::mutex> lock(mutex);
+ ready = true;
+ do {
+ std::cv_status result = cv.wait_until(lock, timeout);
+ assert(result == std::cv_status::no_timeout);
+ } while (likely_spurious);
+
+ // This can technically fail if we have many spurious awakenings, but in practice the
+ // tolerance is so high that it shouldn't be a problem.
+ assert(Clock::now() < timeout);
+ });
+
+ std::thread t2 = support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
+
+ // Acquire the same mutex as t1. This blocks the condition variable inside its wait call
+ // so we can notify it while it is waiting.
+ std::unique_lock<std::mutex> lock(mutex);
+ cv.notify_one();
+ likely_spurious = false;
+ lock.unlock();
+ });
+
+ t2.join();
+ t1.join();
+ }
+
+ // Test unblocking via a timeout.
+ //
+ // To test this, we create a thread that waits on a condition variable
+ // with a certain timeout, and we never awaken it. To guard against
+ // spurious wakeups, we wait again whenever we are awoken for a reason
+ // other than a timeout.
+ {
+ auto timeout = Clock::now() + std::chrono::milliseconds(250);
+ std::condition_variable cv;
+ std::mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ std::unique_lock<std::mutex> lock(mutex);
+ std::cv_status result;
+ do {
+ result = cv.wait_until(lock, timeout);
+ if (result == std::cv_status::timeout)
+ assert(Clock::now() >= timeout);
+ } while (result != std::cv_status::timeout);
+ });
-template <typename Clock>
-void run_test()
-{
- runs = 0;
- test1 = 0;
- test2 = 0;
- {
- std::unique_lock<std::mutex>lk(mut);
- std::thread t = support::make_test_thread(f<Clock>);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- test2 = 1;
- lk.unlock();
- cv.notify_one();
- t.join();
- }
- test1 = 0;
- test2 = 0;
- {
- std::unique_lock<std::mutex>lk(mut);
- std::thread t = support::make_test_thread(f<Clock>);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- lk.unlock();
- t.join();
- }
+ t1.join();
+ }
}
-int main(int, char**)
-{
- run_test<TestClock>();
- run_test<std::chrono::steady_clock>();
- run_test<std::chrono::system_clock>();
- return 0;
+int main(int, char**) {
+ test<TestClock>();
+ test<std::chrono::steady_clock>();
+ return 0;
}
diff --git a/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_until_pred.pass.cpp b/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_until_pred.pass.cpp
index fb8bd6e..847d0c1 100644
--- a/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_until_pred.pass.cpp
+++ b/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_until_pred.pass.cpp
@@ -6,8 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
-// UNSUPPORTED: no-threads
-// ALLOW_RETRIES: 2
+// UNSUPPORTED: no-threads, c++03
// <condition_variable>
@@ -20,99 +19,145 @@
// Predicate pred);
#include <condition_variable>
+#include <atomic>
+#include <cassert>
+#include <chrono>
#include <mutex>
#include <thread>
-#include <chrono>
-#include <cassert>
#include "make_test_thread.h"
#include "test_macros.h"
-struct Clock
-{
- typedef std::chrono::milliseconds duration;
- typedef duration::rep rep;
- typedef duration::period period;
- typedef std::chrono::time_point<Clock> time_point;
- static const bool is_steady = true;
-
- static time_point now()
- {
- using namespace std::chrono;
- return time_point(duration_cast<duration>(
- steady_clock::now().time_since_epoch()
- ));
- }
+struct TestClock {
+ typedef std::chrono::milliseconds duration;
+ typedef duration::rep rep;
+ typedef duration::period period;
+ typedef std::chrono::time_point<TestClock> time_point;
+ static const bool is_steady = true;
+
+ static time_point now() {
+ using namespace std::chrono;
+ return time_point(duration_cast<duration>(steady_clock::now().time_since_epoch()));
+ }
};
-class Pred
-{
- int& i_;
-public:
- explicit Pred(int& i) : i_(i) {}
+template <class Clock>
+void test() {
+ // Test unblocking via a call to notify_one() in another thread.
+ //
+ // To test this, we set a very long timeout in wait_until() and we try to minimize
+ // the likelihood that we got awoken by a spurious wakeup by updating the
+ // likely_spurious flag only immediately before we perform the notification.
+ {
+ std::atomic<bool> ready(false);
+ std::atomic<bool> likely_spurious(true);
+ auto timeout = Clock::now() + std::chrono::seconds(3600);
+ std::condition_variable cv;
+ std::mutex mutex;
- bool operator()() {return i_ != 0;}
-};
+ std::thread t1 = support::make_test_thread([&] {
+ std::unique_lock<std::mutex> lock(mutex);
+ ready = true;
+ bool result = cv.wait_until(lock, timeout, [&] { return !likely_spurious; });
+ assert(result); // return value should be true since we didn't time out
+ assert(Clock::now() < timeout);
+ });
-std::condition_variable cv;
-std::mutex mut;
-
-int test1 = 0;
-int test2 = 0;
-
-int runs = 0;
-
-void f()
-{
- std::unique_lock<std::mutex> lk(mut);
- assert(test2 == 0);
- test1 = 1;
- cv.notify_one();
- Clock::time_point t0 = Clock::now();
- Clock::time_point t = t0 + Clock::duration(250);
- bool r = cv.wait_until(lk, t, Pred(test2));
- Clock::time_point t1 = Clock::now();
- if (runs == 0)
- {
- assert(t1 - t0 < Clock::duration(250));
- assert(test2 != 0);
- assert(r);
- }
- else
- {
- assert(t1 - t0 - Clock::duration(250) < Clock::duration(50));
- assert(test2 == 0);
- assert(!r);
- }
- ++runs;
-}
+ std::thread t2 = support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
+
+ // Acquire the same mutex as t1. This ensures that the condition variable has started
+ // waiting (and hence released that mutex).
+ std::unique_lock<std::mutex> lock(mutex);
+
+ likely_spurious = false;
+ lock.unlock();
+ cv.notify_one();
+ });
+
+ t2.join();
+ t1.join();
+ }
+
+ // Test unblocking via a timeout.
+ //
+ // To test this, we create a thread that waits on a condition variable with a certain
+ // timeout, and we never awaken it. The "stop waiting" predicate always returns false,
+ // which means that we can't get out of the wait via a spurious wakeup.
+ {
+ auto timeout = Clock::now() + std::chrono::milliseconds(250);
+ std::condition_variable cv;
+ std::mutex mutex;
-int main(int, char**)
-{
- {
- std::unique_lock<std::mutex> lk(mut);
- std::thread t = support::make_test_thread(f);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- test2 = 1;
- lk.unlock();
- cv.notify_one();
- t.join();
- }
- test1 = 0;
- test2 = 0;
- {
- std::unique_lock<std::mutex> lk(mut);
- std::thread t = support::make_test_thread(f);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- lk.unlock();
- t.join();
- }
+ std::thread t1 = support::make_test_thread([&] {
+ std::unique_lock<std::mutex> lock(mutex);
+ bool result = cv.wait_until(lock, timeout, [] { return false; }); // never stop waiting (until timeout)
+ assert(!result); // return value should be false since the predicate returns false after the timeout
+ assert(Clock::now() >= timeout);
+ });
+
+ t1.join();
+ }
+
+ // Test unblocking via a spurious wakeup.
+ //
+ // To test this, we set a fairly long timeout in wait_until() and we basically never
+ // wake up the condition variable. This way, we are hoping to get out of the wait
+ // via a spurious wakeup.
+ //
+ // However, since spurious wakeups are not required to even happen, this test is
+ // only trying to trigger that code path, but not actually asserting that it is
+ // taken. In particular, we do need to eventually ensure we get out of the wait
+ // by standard means, so we actually wake up the thread at the end.
+ {
+ std::atomic<bool> ready(false);
+ std::atomic<bool> awoken(false);
+ auto timeout = Clock::now() + std::chrono::seconds(3600);
+ std::condition_variable cv;
+ std::mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ std::unique_lock<std::mutex> lock(mutex);
+ ready = true;
+ bool result = cv.wait_until(lock, timeout, [&] { return true; });
+ awoken = true;
+ assert(result); // return value should be true since we didn't time out
+ assert(Clock::now() < timeout); // can technically fail if t2 never executes and we timeout, but very unlikely
+ });
+
+ std::thread t2 = support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
+
+ // Acquire the same mutex as t1. This ensures that the condition variable has started
+ // waiting (and hence released that mutex).
+ std::unique_lock<std::mutex> lock(mutex);
+ lock.unlock();
+
+ // Give some time for t1 to be awoken spuriously so that code path is used.
+ std::this_thread::sleep_for(std::chrono::seconds(1));
+
+ // We would want to assert that the thread has been awoken after this time,
+ // however nothing guarantees us that it ever gets spuriously awoken, so
+ // we can't really check anything. This is still left here as documentation.
+ bool woke = awoken.load();
+ assert(woke || !woke);
+
+ // Whatever happened, actually awaken the condition variable to ensure the test
+ // doesn't keep running until the timeout.
+ cv.notify_one();
+ });
+
+ t2.join();
+ t1.join();
+ }
+}
+int main(int, char**) {
+ test<TestClock>();
+ test<std::chrono::steady_clock>();
return 0;
}
diff --git a/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_for.pass.cpp b/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_for.pass.cpp
index 95acef9..eab3808 100644
--- a/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_for.pass.cpp
+++ b/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_for.pass.cpp
@@ -6,8 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
-// UNSUPPORTED: no-threads
-// ALLOW_RETRIES: 2
+// UNSUPPORTED: no-threads, c++03
// <condition_variable>
@@ -18,81 +17,105 @@
// wait_for(Lock& lock, const chrono::duration<Rep, Period>& rel_time);
#include <condition_variable>
+#include <atomic>
+#include <cassert>
+#include <chrono>
#include <mutex>
#include <thread>
-#include <chrono>
-#include <cassert>
#include "make_test_thread.h"
#include "test_macros.h"
-std::condition_variable_any cv;
-
-typedef std::timed_mutex L0;
-typedef std::unique_lock<L0> L1;
-
-L0 m0;
-
-int test1 = 0;
-int test2 = 0;
-
-bool expect_timeout = false;
-
-void f()
-{
- typedef std::chrono::system_clock Clock;
- typedef std::chrono::milliseconds milliseconds;
- L1 lk(m0);
- assert(test2 == 0);
- test1 = 1;
- cv.notify_one();
- Clock::time_point t0 = Clock::now();
- Clock::time_point wait_end = t0 + milliseconds(250);
- Clock::duration d;
- do {
- d = wait_end - Clock::now();
- if (d <= milliseconds(0)) break;
- } while (test2 == 0 && cv.wait_for(lk, d) == std::cv_status::no_timeout);
- Clock::time_point t1 = Clock::now();
- if (!expect_timeout)
- {
- assert(t1 - t0 < milliseconds(250));
- assert(test2 != 0);
- }
- else
- {
- assert(t1 - t0 - milliseconds(250) < milliseconds(50));
- assert(test2 == 0);
- }
+template <class Mutex>
+struct MyLock : std::unique_lock<Mutex> {
+ using std::unique_lock<Mutex>::unique_lock;
+};
+
+template <class Function>
+std::chrono::microseconds measure(Function f) {
+ std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now();
+ f();
+ std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now();
+ return std::chrono::duration_cast<std::chrono::microseconds>(end - start);
}
-int main(int, char**)
-{
- {
- L1 lk(m0);
- std::thread t = support::make_test_thread(f);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- test2 = 1;
- lk.unlock();
- cv.notify_one();
- t.join();
- }
- test1 = 0;
- test2 = 0;
- expect_timeout = true;
- {
- L1 lk(m0);
- std::thread t = support::make_test_thread(f);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- lk.unlock();
- t.join();
- }
+template <class Lock>
+void test() {
+ using Mutex = typename Lock::mutex_type;
+ // Test unblocking via a call to notify_one() in another thread.
+ //
+ // To test this, we set a very long timeout in wait_for() and we wait
+ // again in case we get awoken spuriously. Note that it can actually
+ // happen that we get awoken spuriously and fail to recognize it
+ // (making this test useless), but the likelihood should be small.
+ {
+ std::atomic<bool> ready(false);
+ std::atomic<bool> likely_spurious(true);
+ auto timeout = std::chrono::seconds(3600);
+ std::condition_variable_any cv;
+ Mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ Lock lock(mutex);
+ auto elapsed = measure([&] {
+ ready = true;
+ do {
+ std::cv_status result = cv.wait_for(lock, timeout);
+ assert(result == std::cv_status::no_timeout);
+ } while (likely_spurious);
+ });
+
+ // This can technically fail if we have many spurious awakenings, but in practice the
+ // tolerance is so high that it shouldn't be a problem.
+ assert(elapsed < timeout);
+ });
+
+ std::thread t2 = support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
+
+ // Acquire the same mutex as t1. This blocks the condition variable inside its wait call
+ // so we can notify it while it is waiting.
+ Lock lock(mutex);
+ cv.notify_one();
+ likely_spurious = false;
+ lock.unlock();
+ });
+
+ t2.join();
+ t1.join();
+ }
+
+ // Test unblocking via a timeout.
+ //
+ // To test this, we create a thread that waits on a condition variable
+ // with a certain timeout, and we never awaken it. To guard against
+ // spurious wakeups, we wait again whenever we are awoken for a reason
+ // other than a timeout.
+ {
+ auto timeout = std::chrono::milliseconds(250);
+ std::condition_variable_any cv;
+ Mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ Lock lock(mutex);
+ std::cv_status result;
+ do {
+ auto elapsed = measure([&] { result = cv.wait_for(lock, timeout); });
+ if (result == std::cv_status::timeout)
+ assert(elapsed >= timeout);
+ } while (result != std::cv_status::timeout);
+ });
+
+ t1.join();
+ }
+}
+int main(int, char**) {
+ test<std::unique_lock<std::mutex>>();
+ test<std::unique_lock<std::timed_mutex>>();
+ test<MyLock<std::mutex>>();
+ test<MyLock<std::timed_mutex>>();
return 0;
}
diff --git a/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_for_pred.pass.cpp b/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_for_pred.pass.cpp
index 0b56002..2dc3693 100644
--- a/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_for_pred.pass.cpp
+++ b/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_for_pred.pass.cpp
@@ -6,8 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
-// UNSUPPORTED: no-threads
-// ALLOW_RETRIES: 2
+// UNSUPPORTED: no-threads, c++03
// <condition_variable>
@@ -19,89 +18,148 @@
// Predicate pred);
#include <condition_variable>
+#include <atomic>
+#include <cassert>
+#include <chrono>
#include <mutex>
#include <thread>
-#include <chrono>
-#include <cassert>
#include "make_test_thread.h"
#include "test_macros.h"
-class Pred
-{
- int& i_;
-public:
- explicit Pred(int& i) : i_(i) {}
-
- bool operator()() {return i_ != 0;}
+template <class Mutex>
+struct MyLock : std::unique_lock<Mutex> {
+ using std::unique_lock<Mutex>::unique_lock;
};
-std::condition_variable_any cv;
-
-typedef std::timed_mutex L0;
-typedef std::unique_lock<L0> L1;
-
-L0 m0;
-
-int test1 = 0;
-int test2 = 0;
-
-int runs = 0;
-bool expect_result = false;
-
-void f()
-{
- typedef std::chrono::system_clock Clock;
- typedef std::chrono::milliseconds milliseconds;
- L1 lk(m0);
- assert(test2 == 0);
- test1 = 1;
- cv.notify_one();
- Clock::time_point t0 = Clock::now();
- bool result = cv.wait_for(lk, milliseconds(250), Pred(test2));
- assert(result == expect_result);
- Clock::time_point t1 = Clock::now();
- if (runs == 0)
- {
- assert(t1 - t0 < milliseconds(250));
- assert(test2 != 0);
- }
- else
- {
- assert(t1 - t0 - milliseconds(250) < milliseconds(50));
- assert(test2 == 0);
- }
- ++runs;
+template <class Function>
+std::chrono::microseconds measure(Function f) {
+ std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now();
+ f();
+ std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now();
+ return std::chrono::duration_cast<std::chrono::microseconds>(end - start);
}
-int main(int, char**)
-{
- {
- expect_result = true;
- L1 lk(m0);
- std::thread t = support::make_test_thread(f);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- test2 = 1;
- lk.unlock();
- cv.notify_one();
- t.join();
- }
- test1 = 0;
- test2 = 0;
- {
- expect_result = false;
- L1 lk(m0);
- std::thread t = support::make_test_thread(f);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- lk.unlock();
- t.join();
- }
-
- return 0;
+template <class Lock>
+void test() {
+ using Mutex = typename Lock::mutex_type;
+ // Test unblocking via a call to notify_one() in another thread.
+ //
+ // To test this, we set a very long timeout in wait_for() and we try to minimize
+ // the likelihood that we got awoken by a spurious wakeup by updating the
+ // likely_spurious flag only immediately before we perform the notification.
+ {
+ std::atomic<bool> ready(false);
+ std::atomic<bool> likely_spurious(true);
+ auto timeout = std::chrono::seconds(3600);
+ std::condition_variable_any cv;
+ Mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ Lock lock(mutex);
+ auto elapsed = measure([&] {
+ ready = true;
+ bool result = cv.wait_for(lock, timeout, [&] { return !likely_spurious; });
+ assert(result); // return value should be true since we didn't time out
+ });
+ assert(elapsed < timeout);
+ });
+
+ std::thread t2 = support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
+
+ // Acquire the same mutex as t1. This ensures that the condition variable has started
+ // waiting (and hence released that mutex).
+ Lock lock(mutex);
+
+ likely_spurious = false;
+ lock.unlock();
+ cv.notify_one();
+ });
+
+ t2.join();
+ t1.join();
+ }
+
+ // Test unblocking via a timeout.
+ //
+ // To test this, we create a thread that waits on a condition variable with a certain
+ // timeout, and we never awaken it. The "stop waiting" predicate always returns false,
+ // which means that we can't get out of the wait via a spurious wakeup.
+ {
+ auto timeout = std::chrono::milliseconds(250);
+ std::condition_variable_any cv;
+ Mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ Lock lock(mutex);
+ auto elapsed = measure([&] {
+ bool result = cv.wait_for(lock, timeout, [] { return false; }); // never stop waiting (until timeout)
+ assert(!result); // return value should be false since the predicate returns false after the timeout
+ });
+ assert(elapsed >= timeout);
+ });
+
+ t1.join();
+ }
+
+ // Test unblocking via a spurious wakeup.
+ //
+ // To test this, we set a fairly long timeout in wait_for() and we basically never
+ // wake up the condition variable. This way, we are hoping to get out of the wait
+ // via a spurious wakeup.
+ //
+ // However, since spurious wakeups are not required to even happen, this test is
+ // only trying to trigger that code path, but not actually asserting that it is
+ // taken. In particular, we do need to eventually ensure we get out of the wait
+ // by standard means, so we actually wake up the thread at the end.
+ {
+ std::atomic<bool> ready(false);
+ std::atomic<bool> awoken(false);
+ auto timeout = std::chrono::seconds(3600);
+ std::condition_variable_any cv;
+ Mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ Lock lock(mutex);
+ auto elapsed = measure([&] {
+ ready = true;
+ bool result = cv.wait_for(lock, timeout, [&] { return true; });
+ awoken = true;
+ assert(result); // return value should be true since we didn't time out
+ });
+ assert(elapsed < timeout); // can technically fail if t2 never executes and we timeout, but very unlikely
+ });
+
+ std::thread t2 = support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
+
+ // Acquire the same mutex as t1. This ensures that the condition variable has started
+ // waiting (and hence released that mutex).
+ Lock lock(mutex);
+ lock.unlock();
+
+ // Give some time for t1 to be awoken spuriously so that code path is used.
+ std::this_thread::sleep_for(std::chrono::seconds(1));
+
+ // We would want to assert that the thread has been awoken after this time,
+ // however nothing guarantees us that it ever gets spuriously awoken, so
+ // we can't really check anything. This is still left here as documentation.
+ bool woke = awoken.load();
+ assert(woke || !woke);
+
+ // Whatever happened, actually awaken the condition variable to ensure the test
+ // doesn't keep running until the timeout.
+ cv.notify_one();
+ });
+
+ t2.join();
+ t1.join();
+ }
}
+
+int main(int, char**) {
+  test<std::unique_lock<std::mutex>>();
+  test<std::unique_lock<std::timed_mutex>>();
+  test<MyLock<std::mutex>>();
+  test<MyLock<std::timed_mutex>>();
+  return 0;
+}
diff --git a/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_pred.pass.cpp b/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_pred.pass.cpp
index a5e2813..48efbf1 100644
--- a/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_pred.pass.cpp
+++ b/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_pred.pass.cpp
@@ -5,9 +5,8 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
-// UNSUPPORTED: no-threads
-// ALLOW_RETRIES: 2
+
+// UNSUPPORTED: no-threads, c++03
// <condition_variable>
@@ -17,55 +16,113 @@
// void wait(Lock& lock, Predicate pred);
#include <condition_variable>
+#include <atomic>
+#include <cassert>
#include <mutex>
#include <thread>
-#include <functional>
-#include <cassert>
#include "make_test_thread.h"
#include "test_macros.h"
-std::condition_variable_any cv;
+template <class Mutex>
+struct MyLock : std::unique_lock<Mutex> {
+ using std::unique_lock<Mutex>::unique_lock;
+};
-typedef std::timed_mutex L0;
-typedef std::unique_lock<L0> L1;
+template <class Lock>
+void test() {
+ using Mutex = typename Lock::mutex_type;
-L0 m0;
+ // Test unblocking via a call to notify_one() in another thread.
+ //
+ // To test this, we try to minimize the likelihood that we got awoken by a
+ // spurious wakeup by updating the likely_spurious flag only immediately
+ // before we perform the notification.
+ {
+ std::atomic<bool> ready(false);
+ std::atomic<bool> likely_spurious(true);
+ std::condition_variable_any cv;
+ Mutex mutex;
-int test1 = 0;
-int test2 = 0;
+ std::thread t1 = support::make_test_thread([&] {
+ Lock lock(mutex);
+ ready = true;
+ cv.wait(lock, [&] { return !likely_spurious; });
+ });
-class Pred
-{
- int& i_;
-public:
- explicit Pred(int& i) : i_(i) {}
+ std::thread t2 = support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
- bool operator()() {return i_ != 0;}
-};
+ // Acquire the same mutex as t1. This ensures that the condition variable has started
+ // waiting (and hence released that mutex).
+ Lock lock(mutex);
+
+ likely_spurious = false;
+ lock.unlock();
+ cv.notify_one();
+ });
+
+ t2.join();
+ t1.join();
+ }
+
+ // Test unblocking via a spurious wakeup.
+ //
+ // To test this, we basically never wake up the condition variable. This way, we
+ // are hoping to get out of the wait via a spurious wakeup.
+ //
+ // However, since spurious wakeups are not required to even happen, this test is
+ // only trying to trigger that code path, but not actually asserting that it is
+ // taken. In particular, we do need to eventually ensure we get out of the wait
+ // by standard means, so we actually wake up the thread at the end.
+ {
+ std::atomic<bool> ready(false);
+ std::atomic<bool> awoken(false);
+ std::condition_variable_any cv;
+ Mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ Lock lock(mutex);
+ ready = true;
+ cv.wait(lock, [&] { return true; });
+ awoken = true;
+ });
+
+ std::thread t2 = support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
+
+ // Acquire the same mutex as t1. This ensures that the condition variable has started
+ // waiting (and hence released that mutex).
+ Lock lock(mutex);
+ lock.unlock();
+
+ // Give some time for t1 to be awoken spuriously so that code path is used.
+ std::this_thread::sleep_for(std::chrono::seconds(1));
+
+ // We would want to assert that the thread has been awoken after this time,
+ // however nothing guarantees us that it ever gets spuriously awoken, so
+ // we can't really check anything. This is still left here as documentation.
+ bool woke = awoken.load();
+ assert(woke || !woke);
+
+ // Whatever happened, actually awaken the condition variable to ensure the test finishes.
+ cv.notify_one();
+ });
-void f()
-{
- L1 lk(m0);
- assert(test2 == 0);
- test1 = 1;
- cv.notify_one();
- cv.wait(lk, Pred(test2));
- assert(test2 != 0);
+ t2.join();
+ t1.join();
+ }
}
-int main(int, char**)
-{
- L1 lk(m0);
- std::thread t = support::make_test_thread(f);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- test2 = 1;
- lk.unlock();
- cv.notify_one();
- t.join();
+int main(int, char**) {
+ test<std::unique_lock<std::mutex>>();
+ test<std::unique_lock<std::timed_mutex>>();
+ test<MyLock<std::mutex>>();
+ test<MyLock<std::timed_mutex>>();
return 0;
}
diff --git a/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_until.pass.cpp b/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_until.pass.cpp
index 0f23343..6494bcd 100644
--- a/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_until.pass.cpp
+++ b/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_until.pass.cpp
@@ -6,8 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
-// UNSUPPORTED: no-threads
-// ALLOW_RETRIES: 2
+// UNSUPPORTED: no-threads, c++03
// <condition_variable>
@@ -18,93 +17,115 @@
// wait_until(Lock& lock, const chrono::time_point<Clock, Duration>& abs_time);
#include <condition_variable>
+#include <atomic>
+#include <cassert>
+#include <chrono>
#include <mutex>
#include <thread>
-#include <chrono>
-#include <cassert>
#include "make_test_thread.h"
#include "test_macros.h"
-struct Clock
-{
- typedef std::chrono::milliseconds duration;
- typedef duration::rep rep;
- typedef duration::period period;
- typedef std::chrono::time_point<Clock> time_point;
- static const bool is_steady = true;
-
- static time_point now()
- {
- using namespace std::chrono;
- return time_point(duration_cast<duration>(
- steady_clock::now().time_since_epoch()
- ));
- }
+struct TestClock {
+ typedef std::chrono::milliseconds duration;
+ typedef duration::rep rep;
+ typedef duration::period period;
+ typedef std::chrono::time_point<TestClock> time_point;
+ static const bool is_steady = true;
+
+ static time_point now() {
+ using namespace std::chrono;
+ return time_point(duration_cast<duration>(steady_clock::now().time_since_epoch()));
+ }
};
-std::condition_variable_any cv;
-
-typedef std::timed_mutex L0;
-typedef std::unique_lock<L0> L1;
-
-L0 m0;
-
-int test1 = 0;
-int test2 = 0;
-
-int runs = 0;
-
-void f()
-{
- L1 lk(m0);
- assert(test2 == 0);
- test1 = 1;
- cv.notify_one();
- Clock::time_point t0 = Clock::now();
- Clock::time_point t = t0 + Clock::duration(250);
- while (test2 == 0 && cv.wait_until(lk, t) == std::cv_status::no_timeout)
- ;
- Clock::time_point t1 = Clock::now();
- if (runs == 0)
- {
- assert(t1 - t0 < Clock::duration(250));
- assert(test2 != 0);
- }
- else
- {
- assert(t1 - t0 - Clock::duration(250) < Clock::duration(50));
- assert(test2 == 0);
- }
- ++runs;
+template <class Mutex>
+struct MyLock : std::unique_lock<Mutex> {
+ using std::unique_lock<Mutex>::unique_lock;
+};
+
+template <class Lock, class Clock>
+void test() {
+ using Mutex = typename Lock::mutex_type;
+ // Test unblocking via a call to notify_one() in another thread.
+ //
+ // To test this, we set a very long timeout in wait_until() and we wait
+ // again in case we get awoken spuriously. Note that it can actually
+ // happen that we get awoken spuriously and fail to recognize it
+ // (making this test useless), but the likelihood should be small.
+ {
+ std::atomic<bool> ready(false);
+ std::atomic<bool> likely_spurious(true);
+ auto timeout = Clock::now() + std::chrono::seconds(3600);
+ std::condition_variable_any cv;
+ Mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ Lock lock(mutex);
+ ready = true;
+ do {
+ std::cv_status result = cv.wait_until(lock, timeout);
+ assert(result == std::cv_status::no_timeout);
+ } while (likely_spurious);
+
+ // This can technically fail if we have many spurious awakenings, but in practice the
+ // tolerance is so high that it shouldn't be a problem.
+ assert(Clock::now() < timeout);
+ });
+
+ std::thread t2 = support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
+
+ // Acquire the same mutex as t1. This blocks the condition variable inside its wait call
+ // so we can notify it while it is waiting.
+ Lock lock(mutex);
+ cv.notify_one();
+ likely_spurious = false;
+ lock.unlock();
+ });
+
+ t2.join();
+ t1.join();
+ }
+
+ // Test unblocking via a timeout.
+ //
+ // To test this, we create a thread that waits on a condition variable
+ // with a certain timeout, and we never awaken it. To guard against
+ // spurious wakeups, we wait again whenever we are awoken for a reason
+ // other than a timeout.
+ {
+ auto timeout = Clock::now() + std::chrono::milliseconds(250);
+ std::condition_variable_any cv;
+ Mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ Lock lock(mutex);
+ std::cv_status result;
+ do {
+ result = cv.wait_until(lock, timeout);
+ if (result == std::cv_status::timeout)
+ assert(Clock::now() >= timeout);
+ } while (result != std::cv_status::timeout);
+ });
+
+ t1.join();
+ }
}
-int main(int, char**)
-{
- {
- L1 lk(m0);
- std::thread t = support::make_test_thread(f);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- test2 = 1;
- lk.unlock();
- cv.notify_one();
- t.join();
- }
- test1 = 0;
- test2 = 0;
- {
- L1 lk(m0);
- std::thread t = support::make_test_thread(f);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- lk.unlock();
- t.join();
- }
+int main(int, char**) {
+ test<std::unique_lock<std::mutex>, TestClock>();
+ test<std::unique_lock<std::mutex>, std::chrono::steady_clock>();
+
+ test<std::unique_lock<std::timed_mutex>, TestClock>();
+ test<std::unique_lock<std::timed_mutex>, std::chrono::steady_clock>();
+
+ test<MyLock<std::mutex>, TestClock>();
+ test<MyLock<std::mutex>, std::chrono::steady_clock>();
+ test<MyLock<std::timed_mutex>, TestClock>();
+ test<MyLock<std::timed_mutex>, std::chrono::steady_clock>();
return 0;
}
diff --git a/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_until_pred.pass.cpp b/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_until_pred.pass.cpp
index aa60ae4..ee7c172 100644
--- a/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_until_pred.pass.cpp
+++ b/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_until_pred.pass.cpp
@@ -6,8 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
-// UNSUPPORTED: no-threads
-// ALLOW_RETRIES: 2
+// UNSUPPORTED: no-threads, c++03
// <condition_variable>
@@ -20,103 +19,171 @@
// Predicate pred);
#include <condition_variable>
+#include <atomic>
+#include <cassert>
+#include <chrono>
#include <mutex>
#include <thread>
-#include <chrono>
-#include <cassert>
#include "make_test_thread.h"
#include "test_macros.h"
-struct Clock
-{
- typedef std::chrono::milliseconds duration;
- typedef duration::rep rep;
- typedef duration::period period;
- typedef std::chrono::time_point<Clock> time_point;
- static const bool is_steady = true;
-
- static time_point now()
- {
- using namespace std::chrono;
- return time_point(duration_cast<duration>(
- steady_clock::now().time_since_epoch()
- ));
- }
+struct TestClock {
+ typedef std::chrono::milliseconds duration;
+ typedef duration::rep rep;
+ typedef duration::period period;
+ typedef std::chrono::time_point<TestClock> time_point;
+ static const bool is_steady = true;
+
+ static time_point now() {
+ using namespace std::chrono;
+ return time_point(duration_cast<duration>(steady_clock::now().time_since_epoch()));
+ }
};
-class Pred
-{
- int& i_;
-public:
- explicit Pred(int& i) : i_(i) {}
-
- bool operator()() {return i_ != 0;}
+template <class Mutex>
+struct MyLock : std::unique_lock<Mutex> {
+ using std::unique_lock<Mutex>::unique_lock;
};
-std::condition_variable_any cv;
-
-typedef std::timed_mutex L0;
-typedef std::unique_lock<L0> L1;
-
-L0 m0;
-
-int test1 = 0;
-int test2 = 0;
-
-int runs = 0;
-
-void f()
-{
- L1 lk(m0);
- assert(test2 == 0);
- test1 = 1;
- cv.notify_one();
- Clock::time_point t0 = Clock::now();
- Clock::time_point t = t0 + Clock::duration(250);
- bool r = cv.wait_until(lk, t, Pred(test2));
- Clock::time_point t1 = Clock::now();
- if (runs == 0)
- {
- assert(t1 - t0 < Clock::duration(250));
- assert(test2 != 0);
- assert(r);
- }
- else
- {
- assert(t1 - t0 - Clock::duration(250) < Clock::duration(50));
- assert(test2 == 0);
- assert(!r);
- }
- ++runs;
+template <class Lock, class Clock>
+void test() {
+ using Mutex = typename Lock::mutex_type;
+ // Test unblocking via a call to notify_one() in another thread.
+ //
+ // To test this, we set a very long timeout in wait_until() and we try to minimize
+ // the likelihood that we got awoken by a spurious wakeup by updating the
+ // likely_spurious flag only immediately before we perform the notification.
+ {
+ std::atomic<bool> ready(false);
+ std::atomic<bool> likely_spurious(true);
+ auto timeout = Clock::now() + std::chrono::seconds(3600);
+ std::condition_variable_any cv;
+ Mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ Lock lock(mutex);
+ ready = true;
+ bool result = cv.wait_until(lock, timeout, [&] { return !likely_spurious; });
+ assert(result); // return value should be true since we didn't time out
+ assert(Clock::now() < timeout);
+ });
+
+ std::thread t2 = support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
+
+ // Acquire the same mutex as t1. This ensures that the condition variable has started
+ // waiting (and hence released that mutex).
+ Lock lock(mutex);
+
+ likely_spurious = false;
+ lock.unlock();
+ cv.notify_one();
+ });
+
+ t2.join();
+ t1.join();
+ }
+
+ // Test unblocking via a timeout.
+ //
+ // To test this, we create a thread that waits on a condition variable with a certain
+ // timeout, and we never awaken it. The "stop waiting" predicate always returns false,
+ // which means that we can't get out of the wait via a spurious wakeup.
+ {
+ auto timeout = Clock::now() + std::chrono::milliseconds(250);
+ std::condition_variable_any cv;
+ Mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ Lock lock(mutex);
+ bool result = cv.wait_until(lock, timeout, [] { return false; }); // never stop waiting (until timeout)
+ assert(!result); // return value should be false since the predicate returns false after the timeout
+ assert(Clock::now() >= timeout);
+ });
+
+ t1.join();
+ }
+
+ // Test unblocking via a spurious wakeup.
+ //
+ // To test this, we set a fairly long timeout in wait_until() and we basically never
+ // wake up the condition variable. This way, we are hoping to get out of the wait
+ // via a spurious wakeup.
+ //
+ // However, since spurious wakeups are not required to even happen, this test is
+ // only trying to trigger that code path, but not actually asserting that it is
+ // taken. In particular, we do need to eventually ensure we get out of the wait
+ // by standard means, so we actually wake up the thread at the end.
+ {
+ std::atomic<bool> ready(false);
+ std::atomic<bool> awoken(false);
+ auto timeout = Clock::now() + std::chrono::seconds(3600);
+ std::condition_variable_any cv;
+ Mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ Lock lock(mutex);
+ ready = true;
+ bool result = cv.wait_until(lock, timeout, [&] { return true; });
+ awoken = true;
+ assert(result); // return value should be true since we didn't time out
+ assert(Clock::now() < timeout); // can technically fail if t2 never executes and we timeout, but very unlikely
+ });
+
+ std::thread t2 = support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
+
+ // Acquire the same mutex as t1. This ensures that the condition variable has started
+ // waiting (and hence released that mutex).
+ Lock lock(mutex);
+ lock.unlock();
+
+ // Give some time for t1 to be awoken spuriously so that code path is used.
+ std::this_thread::sleep_for(std::chrono::seconds(1));
+
+ // We would want to assert that the thread has been awoken after this time,
+ // however nothing guarantees us that it ever gets spuriously awoken, so
+ // we can't really check anything. This is still left here as documentation.
+ bool woke = awoken.load();
+ assert(woke || !woke);
+
+ // Whatever happened, actually awaken the condition variable to ensure the test
+ // doesn't keep running until the timeout.
+ cv.notify_one();
+ });
+
+ t2.join();
+ t1.join();
+ }
}
-int main(int, char**)
-{
- {
- L1 lk(m0);
- std::thread t = support::make_test_thread(f);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- test2 = 1;
- lk.unlock();
- cv.notify_one();
- t.join();
- }
- test1 = 0;
- test2 = 0;
- {
- L1 lk(m0);
- std::thread t = support::make_test_thread(f);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- lk.unlock();
- t.join();
- }
+int main(int, char**) {
+ // Run on multiple threads to speed up the test, and because it ought to work anyways.
+ std::thread tests[] = {
+ support::make_test_thread([] {
+ test<std::unique_lock<std::mutex>, TestClock>();
+ test<std::unique_lock<std::mutex>, std::chrono::steady_clock>();
+ }),
+ support::make_test_thread([] {
+ test<std::unique_lock<std::timed_mutex>, TestClock>();
+ test<std::unique_lock<std::timed_mutex>, std::chrono::steady_clock>();
+ }),
+ support::make_test_thread([] {
+ test<MyLock<std::mutex>, TestClock>();
+ test<MyLock<std::mutex>, std::chrono::steady_clock>();
+ }),
+ support::make_test_thread([] {
+ test<MyLock<std::timed_mutex>, TestClock>();
+ test<MyLock<std::timed_mutex>, std::chrono::steady_clock>();
+ })};
+
+ for (std::thread& t : tests)
+ t.join();
return 0;
}
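
The predicate overload exercised above has return-value semantics the new assertions depend on: it returns whatever the predicate returns, so a timeout yields false only while the predicate is still false, and spurious wakeups can never leak out of the call. A small self-contained sketch of both outcomes, not part of the patch:

#include <cassert>
#include <chrono>
#include <condition_variable>
#include <mutex>

int main() {
  std::mutex m;
  std::condition_variable_any cv;
  std::unique_lock<std::mutex> lock(m);

  // Predicate never becomes true: the call returns false exactly when the
  // deadline is reached, no matter how many spurious wakeups occurred.
  auto deadline = std::chrono::steady_clock::now() + std::chrono::milliseconds(50);
  bool r = cv.wait_until(lock, deadline, [] { return false; });
  assert(!r);
  assert(std::chrono::steady_clock::now() >= deadline);

  // Predicate already true: the call returns true immediately, even though the
  // deadline has passed by now.
  r = cv.wait_until(lock, deadline, [] { return true; });
  assert(r);
  return 0;
}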
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/implicit_ctad.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/implicit_ctad.pass.cpp
index b754417..9319ec0 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/implicit_ctad.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/implicit_ctad.pass.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
// UNSUPPORTED: no-threads
-// UNSUPPORTED: c++98, c++03, c++11, c++14
+// UNSUPPORTED: c++03, c++11, c++14
// <mutex>
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/implicit_ctad.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/implicit_ctad.pass.cpp
index 7305b48..86bda3a 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/implicit_ctad.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/implicit_ctad.pass.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
// UNSUPPORTED: no-threads
-// UNSUPPORTED: c++98, c++03, c++11, c++14
+// UNSUPPORTED: c++03, c++11, c++14
// <mutex>
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/implicit_ctad.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/implicit_ctad.pass.cpp
index 9a595f9..826ec2b 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/implicit_ctad.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/implicit_ctad.pass.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
// UNSUPPORTED: no-threads
-// UNSUPPORTED: c++98, c++03, c++11, c++14
+// UNSUPPORTED: c++03, c++11, c++14
// <shared_mutex>
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.cons/mutex.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.cons/mutex.pass.cpp
index 4940041..ece3301 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.cons/mutex.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.cons/mutex.pass.cpp
@@ -5,10 +5,9 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
+
// UNSUPPORTED: no-threads
// UNSUPPORTED: c++03, c++11
-// ALLOW_RETRIES: 2
// <shared_mutex>
@@ -19,9 +18,8 @@
// template<class _Mutex> shared_lock(shared_lock<_Mutex>)
// -> shared_lock<_Mutex>; // C++17
+#include <atomic>
#include <cassert>
-#include <chrono>
-#include <cstdlib>
#include <shared_mutex>
#include <thread>
#include <vector>
@@ -29,77 +27,77 @@
#include "make_test_thread.h"
#include "test_macros.h"
-typedef std::chrono::system_clock Clock;
-typedef Clock::time_point time_point;
-typedef Clock::duration duration;
-typedef std::chrono::milliseconds ms;
-typedef std::chrono::nanoseconds ns;
-
-ms WaitTime = ms(250);
-
-// Thread sanitizer causes more overhead and will sometimes cause this test
-// to fail. To prevent this we give Thread sanitizer more time to complete the
-// test.
-#if !defined(TEST_IS_EXECUTED_IN_A_SLOW_ENVIRONMENT)
-ms Tolerance = ms(50);
-#else
-ms Tolerance = ms(50 * 5);
-#endif
+struct Monitor {
+ bool lock_shared_called = false;
+ bool unlock_shared_called = false;
+};
-std::shared_timed_mutex m;
+struct TrackedMutex {
+ Monitor* monitor = nullptr;
-void f()
-{
- time_point t0 = Clock::now();
- time_point t1;
- {
- std::shared_lock<std::shared_timed_mutex> ul(m);
- t1 = Clock::now();
- }
- ns d = t1 - t0 - WaitTime;
- assert(d < Tolerance); // within tolerance
-}
+ void lock_shared() {
+ if (monitor != nullptr)
+ monitor->lock_shared_called = true;
+ }
+ void unlock_shared() {
+ if (monitor != nullptr)
+ monitor->unlock_shared_called = true;
+ }
+};
-void g()
-{
- time_point t0 = Clock::now();
- time_point t1;
- {
- std::shared_lock<std::shared_timed_mutex> ul(m);
- t1 = Clock::now();
- }
- ns d = t1 - t0;
- assert(d < Tolerance); // within tolerance
-}
+template <class Mutex>
+void test() {
+ // Basic sanity test
+ {
+ Mutex mutex;
+ std::vector<std::thread> threads;
+ std::atomic<bool> ready(false);
+ for (int i = 0; i != 5; ++i) {
+ threads.push_back(support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
-int main(int, char**)
-{
- std::vector<std::thread> v;
- {
- m.lock();
- for (int i = 0; i < 5; ++i)
- v.push_back(support::make_test_thread(f));
- std::this_thread::sleep_for(WaitTime);
- m.unlock();
- for (auto& t : v)
- t.join();
- }
- {
- m.lock_shared();
- for (auto& t : v)
- t = support::make_test_thread(g);
- std::thread q = support::make_test_thread(f);
- std::this_thread::sleep_for(WaitTime);
- m.unlock_shared();
- for (auto& t : v)
- t.join();
- q.join();
+ std::shared_lock<Mutex> lock(mutex);
+ assert(lock.owns_lock());
+ }));
}
+ ready = true;
+ for (auto& t : threads)
+ t.join();
+ }
+
+ // Test CTAD
+ {
+#if TEST_STD_VER >= 17
+ Mutex mutex;
+ std::shared_lock lock(mutex);
+ static_assert(std::is_same<decltype(lock), std::shared_lock<Mutex>>::value);
+#endif
+ }
+}
+
+int main(int, char**) {
#if TEST_STD_VER >= 17
- std::shared_lock sl(m);
- static_assert((std::is_same<decltype(sl), std::shared_lock<decltype(m)>>::value), "" );
+ test<std::shared_mutex>();
#endif
+ test<std::shared_timed_mutex>();
+ test<TrackedMutex>();
+
+ // Use shared_lock with a dummy mutex class that tracks whether each
+ // operation has been called or not.
+ {
+ Monitor monitor;
+ TrackedMutex mutex{&monitor};
+
+ std::shared_lock<TrackedMutex> lock(mutex);
+ assert(monitor.lock_shared_called);
+ assert(lock.owns_lock());
+
+ lock.unlock();
+ assert(monitor.unlock_shared_called);
+ }
return 0;
}
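
The Monitor/TrackedMutex pair introduced above works because std::shared_lock only instantiates the mutex operations it actually calls, so a stub that merely records those calls is enough to verify forwarding without any real synchronization. A reduced sketch of the same idea, not part of the patch (CountingSharedMutex is a hypothetical name):

#include <cassert>
#include <shared_mutex>

struct CountingSharedMutex {
  int locks = 0;
  int unlocks = 0;
  void lock_shared() { ++locks; }
  void unlock_shared() { ++unlocks; }
};

int main() {
  CountingSharedMutex m;
  {
    std::shared_lock<CountingSharedMutex> lock(m); // forwards to lock_shared()
    assert(m.locks == 1);
    assert(lock.owns_lock());
  } // destructor forwards to unlock_shared()
  assert(m.unlocks == 1);
  return 0;
}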
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/lock.pass.cpp
index edb7c42..d36ca1d 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/lock.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/lock.pass.cpp
@@ -5,10 +5,9 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
+
// UNSUPPORTED: no-threads
// UNSUPPORTED: c++03, c++11
-// ALLOW_RETRIES: 2
// <shared_mutex>
@@ -16,10 +15,9 @@
// void lock();
+#include <atomic>
#include <cassert>
-#include <chrono>
-#include <cstdlib>
-#include <mutex>
+#include <mutex> // std::defer_lock
#include <shared_mutex>
#include <system_error>
#include <thread>
@@ -28,71 +26,99 @@
#include "make_test_thread.h"
#include "test_macros.h"
-std::shared_timed_mutex m;
+struct Monitor {
+ bool lock_shared_called = false;
+ bool unlock_shared_called = false;
+};
-typedef std::chrono::system_clock Clock;
-typedef Clock::time_point time_point;
-typedef Clock::duration duration;
-typedef std::chrono::milliseconds ms;
-typedef std::chrono::nanoseconds ns;
+struct TrackedMutex {
+ Monitor* monitor = nullptr;
-ms WaitTime = ms(250);
+ void lock_shared() {
+ if (monitor != nullptr)
+ monitor->lock_shared_called = true;
+ }
+ void unlock_shared() {
+ if (monitor != nullptr)
+ monitor->unlock_shared_called = true;
+ }
+};
-// Thread sanitizer causes more overhead and will sometimes cause this test
-// to fail. To prevent this we give Thread sanitizer more time to complete the
-// test.
-#if !defined(TEST_IS_EXECUTED_IN_A_SLOW_ENVIRONMENT)
-ms Tolerance = ms(25);
-#else
-ms Tolerance = ms(25 * 5);
-#endif
+template <class Mutex>
+void test() {
+ // Basic sanity test
+ {
+ Mutex mutex;
+ std::vector<std::thread> threads;
+ std::atomic<bool> ready(false);
+ for (int i = 0; i != 5; ++i) {
+ threads.push_back(support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
+ std::shared_lock<Mutex> lock(mutex, std::defer_lock);
+ lock.lock();
+ assert(lock.owns_lock());
+ }));
+ }
+
+ ready = true;
+ for (auto& t : threads)
+ t.join();
+ }
-void f()
-{
- std::shared_lock<std::shared_timed_mutex> lk(m, std::defer_lock);
- time_point t0 = Clock::now();
- lk.lock();
- time_point t1 = Clock::now();
- assert(lk.owns_lock() == true);
- ns d = t1 - t0 - WaitTime;
- assert(d < Tolerance); // within tolerance
+ // Try locking the same shared_lock again in the same thread. This should throw an exception.
+ {
+ Mutex mutex;
+ std::shared_lock<Mutex> lock(mutex, std::defer_lock);
+ lock.lock();
+ assert(lock.owns_lock());
#ifndef TEST_HAS_NO_EXCEPTIONS
- try
- {
- lk.lock();
- assert(false);
- }
- catch (std::system_error& e)
- {
- assert(e.code().value() == EDEADLK);
+ try {
+ lock.lock();
+ assert(false);
+ } catch (std::system_error const& e) {
+ assert(e.code() == std::errc::resource_deadlock_would_occur);
}
#endif
- lk.unlock();
- lk.release();
+ }
+
+ // Try locking a shared_lock that isn't associated to any mutex. This should throw an exception.
+ {
+ std::shared_lock<Mutex> lock; // no associated mutex
#ifndef TEST_HAS_NO_EXCEPTIONS
- try
- {
- lk.lock();
- assert(false);
- }
- catch (std::system_error& e)
- {
- assert(e.code().value() == EPERM);
+ try {
+ lock.lock();
+ assert(false);
+ } catch (std::system_error const& e) {
+ assert(e.code() == std::errc::operation_not_permitted);
}
#endif
+ }
}
-int main(int, char**)
-{
- m.lock();
- std::vector<std::thread> v;
- for (int i = 0; i < 5; ++i)
- v.push_back(support::make_test_thread(f));
- std::this_thread::sleep_for(WaitTime);
- m.unlock();
- for (auto& t : v)
- t.join();
+int main(int, char**) {
+#if TEST_STD_VER >= 17
+ test<std::shared_mutex>();
+#endif
+ test<std::shared_timed_mutex>();
+ test<TrackedMutex>();
+
+ // Use shared_lock with a dummy mutex class that tracks whether each
+ // operation has been called or not.
+ {
+ Monitor monitor;
+ TrackedMutex mutex{&monitor};
+
+ std::shared_lock<TrackedMutex> lock(mutex, std::defer_lock);
+ lock.lock();
+ assert(monitor.lock_shared_called);
+ assert(lock.owns_lock());
+
+ lock.unlock();
+ assert(monitor.unlock_shared_called);
+ }
return 0;
}
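
The updated error checks above compare against std::errc values rather than raw EDEADLK/EPERM, which is the portable way to test the two failure modes of shared_lock::lock(). A compact sketch of both, not part of the patch and assuming exceptions are enabled:

#include <cassert>
#include <shared_mutex>
#include <system_error>

int main() {
  std::shared_timed_mutex m;

  std::shared_lock<std::shared_timed_mutex> owned(m); // already owns its lock
  try {
    owned.lock();
    assert(false);
  } catch (std::system_error const& e) {
    assert(e.code() == std::errc::resource_deadlock_would_occur); // EDEADLK on POSIX
  }

  std::shared_lock<std::shared_timed_mutex> empty; // no associated mutex
  try {
    empty.lock();
    assert(false);
  } catch (std::system_error const& e) {
    assert(e.code() == std::errc::operation_not_permitted); // EPERM on POSIX
  }
  return 0;
}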
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/try_lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/try_lock.pass.cpp
index 0e707fc..b614668 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/try_lock.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/try_lock.pass.cpp
@@ -5,11 +5,9 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
+
// UNSUPPORTED: no-threads
// UNSUPPORTED: c++03, c++11
-//
-// ALLOW_RETRIES: 2
// <shared_mutex>
@@ -17,60 +15,115 @@
// bool try_lock();
+#include <atomic>
#include <cassert>
-#include <mutex>
+#include <mutex> // std::defer_lock
#include <shared_mutex>
#include <system_error>
+#include <thread>
+#include <vector>
+#include "make_test_thread.h"
#include "test_macros.h"
-bool try_lock_called = false;
+struct Monitor {
+ bool try_lock_shared_called = false;
+ bool unlock_shared_called = false;
+};
-struct mutex
-{
- bool try_lock_shared()
- {
- try_lock_called = !try_lock_called;
- return try_lock_called;
- }
- void unlock_shared() {}
+struct TrackedMutex {
+ Monitor* monitor = nullptr;
+
+ bool try_lock_shared() {
+ if (monitor != nullptr)
+ monitor->try_lock_shared_called = true;
+ return true;
+ }
+ void unlock_shared() {
+ if (monitor != nullptr)
+ monitor->unlock_shared_called = true;
+ }
};
-mutex m;
+template <class Mutex>
+void test() {
+ // Basic sanity test
+ {
+ Mutex mutex;
+ std::vector<std::thread> threads;
+ std::atomic<bool> ready(false);
+ for (int i = 0; i != 5; ++i) {
+ threads.push_back(support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
-int main(int, char**)
-{
- std::shared_lock<mutex> lk(m, std::defer_lock);
- assert(lk.try_lock() == true);
- assert(try_lock_called == true);
- assert(lk.owns_lock() == true);
-#ifndef TEST_HAS_NO_EXCEPTIONS
- try
- {
- TEST_IGNORE_NODISCARD lk.try_lock();
- assert(false);
+ std::shared_lock<Mutex> lock(mutex, std::defer_lock);
+ bool result = lock.try_lock();
+ assert(result);
+ assert(lock.owns_lock());
+ }));
}
- catch (std::system_error& e)
- {
- assert(e.code().value() == EDEADLK);
+
+ ready = true;
+ for (auto& t : threads)
+ t.join();
+ }
+
+ // Make sure that we throw an exception if we try to re-lock a mutex that is
+ // already locked by the current thread.
+ {
+ Mutex mutex;
+
+ std::shared_lock<Mutex> lock(mutex, std::defer_lock);
+ assert(lock.try_lock());
+ assert(lock.owns_lock());
+#ifndef TEST_HAS_NO_EXCEPTIONS
+ try {
+ TEST_IGNORE_NODISCARD lock.try_lock();
+ assert(false);
+ } catch (std::system_error const& e) {
+ assert(e.code() == std::errc::resource_deadlock_would_occur);
}
#endif
- lk.unlock();
- assert(lk.try_lock() == false);
- assert(try_lock_called == false);
- assert(lk.owns_lock() == false);
- lk.release();
+ }
+
+ // Make sure that we throw an exception if we try to lock a shared_lock
+ // that is not associated to any mutex.
+ {
+ std::shared_lock<Mutex> lock; // not associated to a mutex
#ifndef TEST_HAS_NO_EXCEPTIONS
- try
- {
- TEST_IGNORE_NODISCARD lk.try_lock();
- assert(false);
- }
- catch (std::system_error& e)
- {
- assert(e.code().value() == EPERM);
+ try {
+ TEST_IGNORE_NODISCARD lock.try_lock();
+ assert(false);
+ } catch (std::system_error const& e) {
+ assert(e.code() == std::errc::operation_not_permitted);
}
#endif
+ }
+}
+
+int main(int, char**) {
+#if TEST_STD_VER >= 17
+ test<std::shared_mutex>();
+#endif
+ test<std::shared_timed_mutex>();
+ test<TrackedMutex>();
+
+ // Use shared_lock with a dummy mutex class that tracks whether each
+ // operation has been called or not.
+ {
+ Monitor monitor;
+ TrackedMutex mutex{&monitor};
+
+ std::shared_lock<TrackedMutex> lock(mutex, std::defer_lock);
+ bool result = lock.try_lock();
+ assert(result);
+ assert(monitor.try_lock_shared_called);
+ assert(lock.owns_lock());
+ lock.unlock();
+ assert(monitor.unlock_shared_called);
+ }
return 0;
}
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/implicit_ctad.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/implicit_ctad.pass.cpp
index ffe651c..337ad4c 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/implicit_ctad.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/implicit_ctad.pass.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
// UNSUPPORTED: no-threads
-// UNSUPPORTED: c++98, c++03, c++11, c++14
+// UNSUPPORTED: c++03, c++11, c++14
// <mutex>
diff --git a/libcxx/test/std/utilities/function.objects/func.search/func.search.bm/implicit_ctad.pass.cpp b/libcxx/test/std/utilities/function.objects/func.search/func.search.bm/implicit_ctad.pass.cpp
index 863b4a5..50c89d6 100644
--- a/libcxx/test/std/utilities/function.objects/func.search/func.search.bm/implicit_ctad.pass.cpp
+++ b/libcxx/test/std/utilities/function.objects/func.search/func.search.bm/implicit_ctad.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// UNSUPPORTED: c++98, c++03, c++11, c++14
+// UNSUPPORTED: c++03, c++11, c++14
// <functional>
diff --git a/libcxx/test/std/utilities/function.objects/func.search/func.search.bmh/implicit_ctad.pass.cpp b/libcxx/test/std/utilities/function.objects/func.search/func.search.bmh/implicit_ctad.pass.cpp
index 778f6d3..9cb4ef5 100644
--- a/libcxx/test/std/utilities/function.objects/func.search/func.search.bmh/implicit_ctad.pass.cpp
+++ b/libcxx/test/std/utilities/function.objects/func.search/func.search.bmh/implicit_ctad.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// UNSUPPORTED: c++98, c++03, c++11, c++14
+// UNSUPPORTED: c++03, c++11, c++14
// <functional>
diff --git a/libcxx/test/std/utilities/function.objects/func.search/func.search.default/implicit_ctad.pass.cpp b/libcxx/test/std/utilities/function.objects/func.search/func.search.default/implicit_ctad.pass.cpp
index 3c90295..6334ed1 100644
--- a/libcxx/test/std/utilities/function.objects/func.search/func.search.default/implicit_ctad.pass.cpp
+++ b/libcxx/test/std/utilities/function.objects/func.search/func.search.default/implicit_ctad.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// UNSUPPORTED: c++98, c++03, c++11, c++14
+// UNSUPPORTED: c++03, c++11, c++14
// <functional>
diff --git a/libcxx/test/std/utilities/function.objects/operations.implicit_ctad.pass.cpp b/libcxx/test/std/utilities/function.objects/operations.implicit_ctad.pass.cpp
index 03c46d2..bb4fb4b 100644
--- a/libcxx/test/std/utilities/function.objects/operations.implicit_ctad.pass.cpp
+++ b/libcxx/test/std/utilities/function.objects/operations.implicit_ctad.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// UNSUPPORTED: c++98, c++03, c++11, c++14
+// UNSUPPORTED: c++03, c++11, c++14
// <functional>
diff --git a/libcxx/test/std/utilities/utility/mem.res/mem.res.global/new_delete_resource.pass.cpp b/libcxx/test/std/utilities/utility/mem.res/mem.res.global/new_delete_resource.pass.cpp
index 68a82f6..7b31070 100644
--- a/libcxx/test/std/utilities/utility/mem.res/mem.res.global/new_delete_resource.pass.cpp
+++ b/libcxx/test/std/utilities/utility/mem.res/mem.res.global/new_delete_resource.pass.cpp
@@ -76,7 +76,7 @@ void test_allocate_deallocate() {
ASSERT_WITH_LIBRARY_INTERNAL_ALLOCATIONS(globalMemCounter.checkOutstandingNewEq(1));
ASSERT_WITH_LIBRARY_INTERNAL_ALLOCATIONS(globalMemCounter.checkLastNewSizeEq(50));
- r1.deallocate(ret, 1);
+ r1.deallocate(ret, 50);
assert(globalMemCounter.checkOutstandingNewEq(0));
ASSERT_WITH_LIBRARY_INTERNAL_ALLOCATIONS(globalMemCounter.checkDeleteCalledEq(1));
}
diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py
index c81b56b..093cd39 100644
--- a/libcxx/utils/libcxx/test/features.py
+++ b/libcxx/utils/libcxx/test/features.py
@@ -38,6 +38,39 @@ def _getAndroidDeviceApi(cfg):
)
)
+
+def _mingwSupportsModules(cfg):
+ # Only mingw headers are known to work with libc++ built as a module,
+ # at the moment.
+ if not "__MINGW32__" in compilerMacros(cfg):
+ return False
+ # For mingw headers, check for a version known to support being built
+ # as a module.
+ return sourceBuilds(
+ cfg,
+ """
+ #include <_mingw_mac.h>
+ #if __MINGW64_VERSION_MAJOR < 12
+ #error Headers known to be incompatible
+ #elif __MINGW64_VERSION_MAJOR == 12
+ // The headers were fixed to work with libc++ modules during
+ // __MINGW64_VERSION_MAJOR == 12. The headers became compatible
+ // with libc++ built as a module in
+ // 1652e9241b5d8a5a779c6582b1c3c4f4a7cc66e5 (Apr 2024), but the
+ // following commit 8c13b28ace68f2c0094d45121d59a4b951b533ed
+ // removed the now unused __mingw_static_ovr define. Use this
+ // as indicator for whether we've got new enough headers.
+ #ifdef __mingw_static_ovr
+ #error Headers too old
+ #endif
+ #else
+ // __MINGW64_VERSION_MAJOR > 12 should be ok.
+ #endif
+ int main() { return 0; }
+ """,
+ )
+
+
# Lit features are evaluated in order. Some checks may require the compiler detection to have
# run first in order to work properly.
DEFAULT_FEATURES = [
@@ -281,7 +314,7 @@ DEFAULT_FEATURES = [
# Any declaration of a library function shall have external linkage.
when=lambda cfg: "__ANDROID__" in compilerMacros(cfg)
or "__FreeBSD__" in compilerMacros(cfg)
- or "_WIN32" in compilerMacros(cfg)
+ or ("_WIN32" in compilerMacros(cfg) and not _mingwSupportsModules(cfg))
or platform.system().lower().startswith("aix")
# Avoid building on platforms that don't support modules properly.
or not hasCompileFlag(cfg, "-Wno-reserved-module-identifier"),
diff --git a/libcxxabi/include/cxxabi.h b/libcxxabi/include/cxxabi.h
index d070118..0e39690 100644
--- a/libcxxabi/include/cxxabi.h
+++ b/libcxxabi/include/cxxabi.h
@@ -48,13 +48,17 @@ extern _LIBCXXABI_FUNC_VIS void
__cxa_free_exception(void *thrown_exception) throw();
// This function is an LLVM extension, which mirrors the same extension in libsupc++ and libcxxrt
extern _LIBCXXABI_FUNC_VIS __cxa_exception*
+#ifdef __wasm__
+// In Wasm, a destructor returns its argument
+__cxa_init_primary_exception(void* object, std::type_info* tinfo, void*(_LIBCXXABI_DTOR_FUNC* dest)(void*)) throw();
+#else
__cxa_init_primary_exception(void* object, std::type_info* tinfo, void(_LIBCXXABI_DTOR_FUNC* dest)(void*)) throw();
+#endif
// 2.4.3 Throwing the Exception Object
extern _LIBCXXABI_FUNC_VIS _LIBCXXABI_NORETURN void
__cxa_throw(void *thrown_exception, std::type_info *tinfo,
-#ifdef __USING_WASM_EXCEPTIONS__
- // In Wasm, a destructor returns its argument
+#ifdef __wasm__
void *(_LIBCXXABI_DTOR_FUNC *dest)(void *));
#else
void (_LIBCXXABI_DTOR_FUNC *dest)(void *));
diff --git a/libcxxabi/src/cxa_exception.cpp b/libcxxabi/src/cxa_exception.cpp
index 65e9f45..ff69a4c 100644
--- a/libcxxabi/src/cxa_exception.cpp
+++ b/libcxxabi/src/cxa_exception.cpp
@@ -207,7 +207,12 @@ void __cxa_free_exception(void *thrown_object) throw() {
}
__cxa_exception* __cxa_init_primary_exception(void* object, std::type_info* tinfo,
+#ifdef __wasm__
+// In Wasm, a destructor returns its argument
+ void *(_LIBCXXABI_DTOR_FUNC* dest)(void*)) throw() {
+#else
void(_LIBCXXABI_DTOR_FUNC* dest)(void*)) throw() {
+#endif
__cxa_exception* exception_header = cxa_exception_from_thrown_object(object);
exception_header->referenceCount = 0;
exception_header->unexpectedHandler = std::get_unexpected();
@@ -267,7 +272,7 @@ will call terminate, assuming that there was no handler for the
exception.
*/
void
-#ifdef __USING_WASM_EXCEPTIONS__
+#ifdef __wasm__
// In Wasm, a destructor returns its argument
__cxa_throw(void *thrown_object, std::type_info *tinfo, void *(_LIBCXXABI_DTOR_FUNC *dest)(void *)) {
#else
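
The __wasm__ branches above all encode the same fact: with Wasm exceptions a destructor returns its argument, so the destructor pointer type changes while the call sites stay the same. A hypothetical sketch of how that difference can be isolated behind a single typedef (none of these names are libc++abi's):

#include <cstdio>

#ifdef __wasm__
typedef void* (*exception_dtor_t)(void*); // Wasm: destructor returns its argument
#else
typedef void (*exception_dtor_t)(void*);  // Itanium ABI elsewhere
#endif

struct FakeException {
  exception_dtor_t dtor;
  void* object;
};

void destroy(FakeException& e) {
  if (e.dtor)
    e.dtor(e.object); // identical call site on every target
}

#ifdef __wasm__
void* noop_dtor(void* p) { return p; }
#else
void noop_dtor(void*) {}
#endif

int main() {
  FakeException e{&noop_dtor, nullptr};
  destroy(e);
  std::puts("destructor invoked");
  return 0;
}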
diff --git a/libcxxabi/src/cxa_exception.h b/libcxxabi/src/cxa_exception.h
index 10712f6..aba08f2 100644
--- a/libcxxabi/src/cxa_exception.h
+++ b/libcxxabi/src/cxa_exception.h
@@ -43,7 +43,7 @@ struct _LIBCXXABI_HIDDEN __cxa_exception {
// Manage the exception object itself.
std::type_info *exceptionType;
-#ifdef __USING_WASM_EXCEPTIONS__
+#ifdef __wasm__
// In Wasm, a destructor returns its argument
void *(_LIBCXXABI_DTOR_FUNC *exceptionDestructor)(void *);
#else
diff --git a/libcxxabi/src/cxa_personality.cpp b/libcxxabi/src/cxa_personality.cpp
index d95d781..843a18a 100644
--- a/libcxxabi/src/cxa_personality.cpp
+++ b/libcxxabi/src/cxa_personality.cpp
@@ -70,7 +70,7 @@ extern "C" EXCEPTION_DISPOSITION _GCC_specific_handler(PEXCEPTION_RECORD,
+------------------+--+-----+-----+------------------------+--------------------------+
| callSiteTableLength | (ULEB128) | Call Site Table length, used to find Action table |
+---------------------+-----------+---------------------------------------------------+
-#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__USING_WASM_EXCEPTIONS__)
+#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__WASM_EXCEPTIONS__)
+---------------------+-----------+------------------------------------------------+
| Beginning of Call Site Table The current ip lies within the |
| ... (start, length) range of one of these |
@@ -84,7 +84,7 @@ extern "C" EXCEPTION_DISPOSITION _GCC_specific_handler(PEXCEPTION_RECORD,
| +-------------+---------------------------------+------------------------------+ |
| ... |
+----------------------------------------------------------------------------------+
-#else // __USING_SJLJ_EXCEPTIONS__ || __USING_WASM_EXCEPTIONS__
+#else // __USING_SJLJ_EXCEPTIONS__ || __WASM_EXCEPTIONS__
+---------------------+-----------+------------------------------------------------+
| Beginning of Call Site Table The current ip is a 1-based index into |
| ... this table. Or it is -1 meaning no |
@@ -97,7 +97,7 @@ extern "C" EXCEPTION_DISPOSITION _GCC_specific_handler(PEXCEPTION_RECORD,
| +-------------+---------------------------------+------------------------------+ |
| ... |
+----------------------------------------------------------------------------------+
-#endif // __USING_SJLJ_EXCEPTIONS__ || __USING_WASM_EXCEPTIONS__
+#endif // __USING_SJLJ_EXCEPTIONS__ || __WASM_EXCEPTIONS__
+---------------------------------------------------------------------+
| Beginning of Action Table ttypeIndex == 0 : cleanup |
| ... ttypeIndex > 0 : catch |
@@ -547,7 +547,7 @@ void
set_registers(_Unwind_Exception* unwind_exception, _Unwind_Context* context,
const scan_results& results)
{
-#if defined(__USING_SJLJ_EXCEPTIONS__) || defined(__USING_WASM_EXCEPTIONS__)
+#if defined(__USING_SJLJ_EXCEPTIONS__) || defined(__WASM_EXCEPTIONS__)
#define __builtin_eh_return_data_regno(regno) regno
#elif defined(__ibmxl__)
// IBM xlclang++ compiler does not support __builtin_eh_return_data_regno.
@@ -642,7 +642,7 @@ static void scan_eh_tab(scan_results &results, _Unwind_Action actions,
// Get beginning current frame's code (as defined by the
// emitted dwarf code)
uintptr_t funcStart = _Unwind_GetRegionStart(context);
-#if defined(__USING_SJLJ_EXCEPTIONS__) || defined(__USING_WASM_EXCEPTIONS__)
+#if defined(__USING_SJLJ_EXCEPTIONS__) || defined(__WASM_EXCEPTIONS__)
if (ip == uintptr_t(-1))
{
// no action
@@ -652,9 +652,9 @@ static void scan_eh_tab(scan_results &results, _Unwind_Action actions,
else if (ip == 0)
call_terminate(native_exception, unwind_exception);
// ip is 1-based index into call site table
-#else // !__USING_SJLJ_EXCEPTIONS__ && !__USING_WASM_EXCEPTIONS__
+#else // !__USING_SJLJ_EXCEPTIONS__ && !__WASM_EXCEPTIONS__
uintptr_t ipOffset = ip - funcStart;
-#endif // !__USING_SJLJ_EXCEPTIONS__ && !__USING_WASM_EXCEPTIONS__
+#endif // !__USING_SJLJ_EXCEPTIONS__ && !__WASM_EXCEPTIONS__
const uint8_t* classInfo = NULL;
// Note: See JITDwarfEmitter::EmitExceptionTable(...) for corresponding
// dwarf emission
@@ -675,7 +675,7 @@ static void scan_eh_tab(scan_results &results, _Unwind_Action actions,
// Walk call-site table looking for range that
// includes current PC.
uint8_t callSiteEncoding = *lsda++;
-#if defined(__USING_SJLJ_EXCEPTIONS__) || defined(__USING_WASM_EXCEPTIONS__)
+#if defined(__USING_SJLJ_EXCEPTIONS__) || defined(__WASM_EXCEPTIONS__)
(void)callSiteEncoding; // When using SjLj/Wasm exceptions, callSiteEncoding is never used
#endif
uint32_t callSiteTableLength = static_cast<uint32_t>(readULEB128(&lsda));
@@ -686,7 +686,7 @@ static void scan_eh_tab(scan_results &results, _Unwind_Action actions,
while (callSitePtr < callSiteTableEnd)
{
// There is one entry per call site.
-#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__USING_WASM_EXCEPTIONS__)
+#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__WASM_EXCEPTIONS__)
// The call sites are non-overlapping in [start, start+length)
// The call sites are ordered in increasing value of start
uintptr_t start = readEncodedPointer(&callSitePtr, callSiteEncoding);
@@ -694,15 +694,15 @@ static void scan_eh_tab(scan_results &results, _Unwind_Action actions,
uintptr_t landingPad = readEncodedPointer(&callSitePtr, callSiteEncoding);
uintptr_t actionEntry = readULEB128(&callSitePtr);
if ((start <= ipOffset) && (ipOffset < (start + length)))
-#else // __USING_SJLJ_EXCEPTIONS__ || __USING_WASM_EXCEPTIONS__
+#else // __USING_SJLJ_EXCEPTIONS__ || __WASM_EXCEPTIONS__
// ip is 1-based index into this table
uintptr_t landingPad = readULEB128(&callSitePtr);
uintptr_t actionEntry = readULEB128(&callSitePtr);
if (--ip == 0)
-#endif // __USING_SJLJ_EXCEPTIONS__ || __USING_WASM_EXCEPTIONS__
+#endif // __USING_SJLJ_EXCEPTIONS__ || __WASM_EXCEPTIONS__
{
// Found the call site containing ip.
-#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__USING_WASM_EXCEPTIONS__)
+#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__WASM_EXCEPTIONS__)
if (landingPad == 0)
{
// No handler here
@@ -710,9 +710,9 @@ static void scan_eh_tab(scan_results &results, _Unwind_Action actions,
return;
}
landingPad = (uintptr_t)lpStart + landingPad;
-#else // __USING_SJLJ_EXCEPTIONS__ || __USING_WASM_EXCEPTIONS__
+#else // __USING_SJLJ_EXCEPTIONS__ || __WASM_EXCEPTIONS__
++landingPad;
-#endif // __USING_SJLJ_EXCEPTIONS__ || __USING_WASM_EXCEPTIONS__
+#endif // __USING_SJLJ_EXCEPTIONS__ || __WASM_EXCEPTIONS__
results.landingPad = landingPad;
if (actionEntry == 0)
{
@@ -838,7 +838,7 @@ static void scan_eh_tab(scan_results &results, _Unwind_Action actions,
action += actionOffset;
} // there is no break out of this loop, only return
}
-#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__USING_WASM_EXCEPTIONS__)
+#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__WASM_EXCEPTIONS__)
else if (ipOffset < start)
{
// There is no call site for this ip
@@ -846,7 +846,7 @@ static void scan_eh_tab(scan_results &results, _Unwind_Action actions,
// Possible stack corruption.
call_terminate(native_exception, unwind_exception);
}
-#endif // !__USING_SJLJ_EXCEPTIONS__ && !__USING_WASM_EXCEPTIONS__
+#endif // !__USING_SJLJ_EXCEPTIONS__ && !__WASM_EXCEPTIONS__
} // there might be some tricky cases which break out of this loop
// It is possible that no eh table entry specify how to handle
@@ -903,7 +903,7 @@ _UA_CLEANUP_PHASE
*/
#if !defined(_LIBCXXABI_ARM_EHABI)
-#ifdef __USING_WASM_EXCEPTIONS__
+#ifdef __WASM_EXCEPTIONS__
_Unwind_Reason_Code __gxx_personality_wasm0
#elif defined(__SEH__) && !defined(__USING_SJLJ_EXCEPTIONS__)
static _Unwind_Reason_Code __gxx_personality_imp
@@ -972,7 +972,7 @@ __gxx_personality_v0
exc->languageSpecificData = results.languageSpecificData;
exc->catchTemp = reinterpret_cast<void*>(results.landingPad);
exc->adjustedPtr = results.adjustedPtr;
-#ifdef __USING_WASM_EXCEPTIONS__
+#ifdef __WASM_EXCEPTIONS__
// Wasm only uses a single phase (_UA_SEARCH_PHASE), so save the
// results here.
set_registers(unwind_exception, context, results);
diff --git a/libunwind/include/__libunwind_config.h b/libunwind/include/__libunwind_config.h
index 8db336b..028b9e3 100644
--- a/libunwind/include/__libunwind_config.h
+++ b/libunwind/include/__libunwind_config.h
@@ -180,6 +180,10 @@
#endif
#define _LIBUNWIND_HIGHEST_DWARF_REGISTER \
_LIBUNWIND_HIGHEST_DWARF_REGISTER_LOONGARCH
+#elif defined(__wasm__)
+// Unused
+#define _LIBUNWIND_CONTEXT_SIZE 0
+#define _LIBUNWIND_CURSOR_SIZE 0
# else
# error "Unsupported architecture."
# endif
diff --git a/libunwind/src/Unwind-wasm.c b/libunwind/src/Unwind-wasm.c
index f7f39d3..b18b32c 100644
--- a/libunwind/src/Unwind-wasm.c
+++ b/libunwind/src/Unwind-wasm.c
@@ -14,7 +14,7 @@
#include "config.h"
-#ifdef __USING_WASM_EXCEPTIONS__
+#ifdef __WASM_EXCEPTIONS__
#include "unwind.h"
#include <threads.h>
@@ -120,4 +120,4 @@ _Unwind_GetRegionStart(struct _Unwind_Context *context) {
return 0;
}
-#endif // defined(__USING_WASM_EXCEPTIONS__)
+#endif // defined(__WASM_EXCEPTIONS__)
diff --git a/libunwind/src/UnwindCursor.hpp b/libunwind/src/UnwindCursor.hpp
index 7753936..66fe8e2 100644
--- a/libunwind/src/UnwindCursor.hpp
+++ b/libunwind/src/UnwindCursor.hpp
@@ -2416,7 +2416,7 @@ int UnwindCursor<A, R>::stepWithTBTable(pint_t pc, tbtable *TBTable,
}
// Reset LR in the current context.
- newRegisters.setLR(NULL);
+ newRegisters.setLR(static_cast<uintptr_t>(NULL));
_LIBUNWIND_TRACE_UNWINDING(
"Extract info from lastStack=%p, returnAddress=%p",
diff --git a/libunwind/src/UnwindLevel1.c b/libunwind/src/UnwindLevel1.c
index 05d0f2c..48e7bc3 100644
--- a/libunwind/src/UnwindLevel1.c
+++ b/libunwind/src/UnwindLevel1.c
@@ -31,7 +31,8 @@
#include "libunwind_ext.h"
#include "unwind.h"
-#if !defined(_LIBUNWIND_ARM_EHABI) && !defined(__USING_SJLJ_EXCEPTIONS__)
+#if !defined(_LIBUNWIND_ARM_EHABI) && !defined(__USING_SJLJ_EXCEPTIONS__) && \
+ !defined(__wasm__)
#ifndef _LIBUNWIND_SUPPORT_SEH_UNWIND
diff --git a/libunwind/src/UnwindRegistersRestore.S b/libunwind/src/UnwindRegistersRestore.S
index 42c2488..67d9e05 100644
--- a/libunwind/src/UnwindRegistersRestore.S
+++ b/libunwind/src/UnwindRegistersRestore.S
@@ -20,7 +20,7 @@
.text
#endif
-#if !defined(__USING_SJLJ_EXCEPTIONS__)
+#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__wasm__)
#if defined(__i386__)
DEFINE_LIBUNWIND_FUNCTION(__libunwind_Registers_x86_jumpto)
@@ -1232,7 +1232,7 @@ DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind19Registers_loongarch6jumptoEv)
#endif
-#endif /* !defined(__USING_SJLJ_EXCEPTIONS__) */
+#endif /* !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__wasm__) */
NO_EXEC_STACK_DIRECTIVE
diff --git a/libunwind/src/UnwindRegistersSave.S b/libunwind/src/UnwindRegistersSave.S
index 19a0e87..5bf6055 100644
--- a/libunwind/src/UnwindRegistersSave.S
+++ b/libunwind/src/UnwindRegistersSave.S
@@ -20,7 +20,7 @@
.text
#endif
-#if !defined(__USING_SJLJ_EXCEPTIONS__)
+#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__wasm__)
#if defined(__i386__)
@@ -1177,6 +1177,6 @@ DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext)
WEAK_ALIAS(__unw_getcontext, unw_getcontext)
-#endif /* !defined(__USING_SJLJ_EXCEPTIONS__) */
+#endif /* !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__wasm__) */
NO_EXEC_STACK_DIRECTIVE
diff --git a/libunwind/src/libunwind.cpp b/libunwind/src/libunwind.cpp
index 217dde9..cf39ec5 100644
--- a/libunwind/src/libunwind.cpp
+++ b/libunwind/src/libunwind.cpp
@@ -26,7 +26,7 @@
#include <sanitizer/asan_interface.h>
#endif
-#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__USING_WASM_EXCEPTIONS__)
+#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__wasm__)
#include "AddressSpace.hpp"
#include "UnwindCursor.hpp"
@@ -347,8 +347,7 @@ void __unw_remove_dynamic_eh_frame_section(unw_word_t eh_frame_start) {
}
#endif // defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND)
-#endif // !defined(__USING_SJLJ_EXCEPTIONS__) &&
- // !defined(__USING_WASM_EXCEPTIONS__)
+#endif // !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__wasm__)
#ifdef __APPLE__
diff --git a/lld/ELF/Arch/AVR.cpp b/lld/ELF/Arch/AVR.cpp
index 9211eab..2275f86 100644
--- a/lld/ELF/Arch/AVR.cpp
+++ b/lld/ELF/Arch/AVR.cpp
@@ -231,14 +231,13 @@ void AVR::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const {
// Since every jump destination is word aligned we gain an extra bit
case R_AVR_7_PCREL: {
- checkInt(loc, val - 2, 7, rel);
+ checkInt(loc, val - 2, 8, rel);
checkAlignment(loc, val, 2, rel);
const uint16_t target = (val - 2) >> 1;
write16le(loc, (read16le(loc) & 0xfc07) | ((target & 0x7f) << 3));
break;
}
case R_AVR_13_PCREL: {
- checkInt(loc, val - 2, 13, rel);
checkAlignment(loc, val, 2, rel);
const uint16_t target = (val - 2) >> 1;
write16le(loc, (read16le(loc) & 0xf000) | (target & 0xfff));
diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
index dbb8141..f0dfe7f 100644
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -102,6 +102,9 @@ enum class GnuStackKind { None, Exec, NoExec };
// For --lto=
enum LtoKind : uint8_t {UnifiedThin, UnifiedRegular, Default};
+// For -z gcs=
+enum class GcsPolicy { Implicit, Never, Always };
+
struct SymbolVersion {
llvm::StringRef name;
bool isExternCpp;
@@ -188,6 +191,7 @@ struct Config {
StringRef zBtiReport = "none";
StringRef zCetReport = "none";
StringRef zPauthReport = "none";
+ StringRef zGcsReport = "none";
bool ltoBBAddrMap;
llvm::StringRef ltoBasicBlockSections;
std::pair<llvm::StringRef, llvm::StringRef> thinLTOObjectSuffixReplace;
@@ -341,6 +345,7 @@ struct Config {
UnresolvedPolicy unresolvedSymbols;
UnresolvedPolicy unresolvedSymbolsInShlib;
Target2Policy target2;
+ GcsPolicy zGcs;
bool power10Stubs;
ARMVFPArgKind armVFPArgs = ARMVFPArgKind::Default;
BuildIdKind buildId = BuildIdKind::None;
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index 028cdcc..ddc574a 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -466,6 +466,10 @@ static void checkOptions() {
error("-z bti-report only supported on AArch64");
if (config->zPauthReport != "none")
error("-z pauth-report only supported on AArch64");
+ if (config->zGcsReport != "none")
+ error("-z gcs-report only supported on AArch64");
+ if (config->zGcs != GcsPolicy::Implicit)
+ error("-z gcs only supported on AArch64");
}
if (config->emachine != EM_386 && config->emachine != EM_X86_64 &&
@@ -560,6 +564,25 @@ static uint8_t getZStartStopVisibility(opt::InputArgList &args) {
return ret;
}
+static GcsPolicy getZGcs(opt::InputArgList &args) {
+ GcsPolicy ret = GcsPolicy::Implicit;
+ for (auto *arg : args.filtered(OPT_z)) {
+ std::pair<StringRef, StringRef> kv = StringRef(arg->getValue()).split('=');
+ if (kv.first == "gcs") {
+ arg->claim();
+ if (kv.second == "implicit")
+ ret = GcsPolicy::Implicit;
+ else if (kv.second == "never")
+ ret = GcsPolicy::Never;
+ else if (kv.second == "always")
+ ret = GcsPolicy::Always;
+ else
+ error("unknown -z gcs= value: " + kv.second);
+ }
+ }
+ return ret;
+}
+
// Report a warning for an unknown -z option.
static void checkZOptions(opt::InputArgList &args) {
// This function is called before getTarget(), when certain options are not
@@ -1438,6 +1461,7 @@ static void readConfigs(opt::InputArgList &args) {
config->zCopyreloc = getZFlag(args, "copyreloc", "nocopyreloc", true);
config->zForceBti = hasZOption(args, "force-bti");
config->zForceIbt = hasZOption(args, "force-ibt");
+ config->zGcs = getZGcs(args);
config->zGlobal = hasZOption(args, "global");
config->zGnustack = getZGnuStack(args);
config->zHazardplt = hasZOption(args, "hazardplt");
@@ -1510,6 +1534,7 @@ static void readConfigs(opt::InputArgList &args) {
auto reports = {std::make_pair("bti-report", &config->zBtiReport),
std::make_pair("cet-report", &config->zCetReport),
+ std::make_pair("gcs-report", &config->zGcsReport),
std::make_pair("pauth-report", &config->zPauthReport)};
for (opt::Arg *arg : args.filtered(OPT_z)) {
std::pair<StringRef, StringRef> option =
@@ -2678,6 +2703,11 @@ static void readSecurityNotes() {
"GNU_PROPERTY_AARCH64_FEATURE_1_BTI property");
checkAndReportMissingFeature(
+ config->zGcsReport, features, GNU_PROPERTY_AARCH64_FEATURE_1_GCS,
+ toString(f) + ": -z gcs-report: file does not have "
+ "GNU_PROPERTY_AARCH64_FEATURE_1_GCS property");
+
+ checkAndReportMissingFeature(
config->zCetReport, features, GNU_PROPERTY_X86_FEATURE_1_IBT,
toString(f) + ": -z cet-report: file does not have "
"GNU_PROPERTY_X86_FEATURE_1_IBT property");
@@ -2729,6 +2759,12 @@ static void readSecurityNotes() {
// Force enable Shadow Stack.
if (config->zShstk)
config->andFeatures |= GNU_PROPERTY_X86_FEATURE_1_SHSTK;
+
+ // Force enable/disable GCS
+ if (config->zGcs == GcsPolicy::Always)
+ config->andFeatures |= GNU_PROPERTY_AARCH64_FEATURE_1_GCS;
+ else if (config->zGcs == GcsPolicy::Never)
+ config->andFeatures &= ~GNU_PROPERTY_AARCH64_FEATURE_1_GCS;
}
static void initSectionsAndLocalSyms(ELFFileBase *file, bool ignoreComdats) {
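
The new -z gcs handling boils down to a three-state policy applied on top of the AND of the input files' feature properties. A minimal sketch of that mapping, not part of the patch (the bit value 1u << 2 is the GNU_PROPERTY_AARCH64_FEATURE_1_GCS constant also used by the test below):

#include <cassert>
#include <cstdint>

enum class GcsPolicy { Implicit, Never, Always };
constexpr uint32_t GNU_PROPERTY_AARCH64_FEATURE_1_GCS = 1u << 2;

uint32_t applyGcsPolicy(uint32_t andFeatures, GcsPolicy policy) {
  if (policy == GcsPolicy::Always)
    return andFeatures | GNU_PROPERTY_AARCH64_FEATURE_1_GCS;  // force-enable
  if (policy == GcsPolicy::Never)
    return andFeatures & ~GNU_PROPERTY_AARCH64_FEATURE_1_GCS; // force-disable
  return andFeatures; // Implicit: keep whatever the inputs agree on
}

int main() {
  assert(applyGcsPolicy(0, GcsPolicy::Always) == GNU_PROPERTY_AARCH64_FEATURE_1_GCS);
  assert(applyGcsPolicy(GNU_PROPERTY_AARCH64_FEATURE_1_GCS, GcsPolicy::Never) == 0);
  assert(applyGcsPolicy(0, GcsPolicy::Implicit) == 0);
  return 0;
}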
diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td
index 883a607..ff61a56 100644
--- a/lld/ELF/Options.td
+++ b/lld/ELF/Options.td
@@ -69,6 +69,7 @@ defm compress_debug_sections:
defm compress_sections: EEq<"compress-sections",
"Compress output sections that match the glob and do not have the SHF_ALLOC flag. "
+ "The sections remain uncompressed if compressed content would be larger. "
"The compression level is <level> (if specified) or a default speed-focused level">,
MetaVarName<"<section-glob>={none,zlib,zstd}[:level]">;
diff --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp
index fcb4c43..60de100 100644
--- a/lld/ELF/OutputSections.cpp
+++ b/lld/ELF/OutputSections.cpp
@@ -344,9 +344,10 @@ template <class ELFT> void OutputSection::maybeCompress() {
(void)sizeof(Elf_Chdr);
DebugCompressionType ctype = DebugCompressionType::None;
+ size_t compressedSize = sizeof(Elf_Chdr);
unsigned level = 0; // default compression level
if (!(flags & SHF_ALLOC) && config->compressDebugSections &&
- name.starts_with(".debug_") && size)
+ name.starts_with(".debug_"))
ctype = *config->compressDebugSections;
for (auto &[glob, t, l] : config->compressSections)
if (glob.match(name))
@@ -360,7 +361,6 @@ template <class ELFT> void OutputSection::maybeCompress() {
}
llvm::TimeTraceScope timeScope("Compress sections");
- compressed.uncompressedSize = size;
auto buf = std::make_unique<uint8_t[]>(size);
// Write uncompressed data to a temporary zero-initialized buffer.
{
@@ -378,7 +378,6 @@ template <class ELFT> void OutputSection::maybeCompress() {
[[maybe_unused]] constexpr size_t shardSize = 1 << 20;
auto shardsIn = split(ArrayRef<uint8_t>(buf.get(), size), shardSize);
const size_t numShards = shardsIn.size();
- compressed.numShards = numShards;
auto shardsOut = std::make_unique<SmallVector<uint8_t, 0>[]>(numShards);
#if LLVM_ENABLE_ZSTD
@@ -409,9 +408,8 @@ template <class ELFT> void OutputSection::maybeCompress() {
shardsOut[i] = std::move(out);
});
compressed.type = ELFCOMPRESS_ZSTD;
- size = sizeof(Elf_Chdr);
for (size_t i = 0; i != numShards; ++i)
- size += shardsOut[i].size();
+ compressedSize += shardsOut[i].size();
}
#endif
@@ -434,18 +432,23 @@ template <class ELFT> void OutputSection::maybeCompress() {
// Update section size and combine Alder-32 checksums.
uint32_t checksum = 1; // Initial Adler-32 value
- size = sizeof(Elf_Chdr) + 2; // Elf_Chdir and zlib header
+ compressedSize += 2; // Elf_Chdir and zlib header
for (size_t i = 0; i != numShards; ++i) {
- size += shardsOut[i].size();
+ compressedSize += shardsOut[i].size();
checksum = adler32_combine(checksum, shardsAdler[i], shardsIn[i].size());
}
- size += 4; // checksum
+ compressedSize += 4; // checksum
compressed.type = ELFCOMPRESS_ZLIB;
compressed.checksum = checksum;
}
#endif
+ if (compressedSize >= size)
+ return;
+ compressed.uncompressedSize = size;
compressed.shards = std::move(shardsOut);
+ compressed.numShards = numShards;
+ size = compressedSize;
flags |= SHF_COMPRESSED;
}
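
The OutputSections.cpp change above makes compression conditional: the candidate compressed size (header plus shards) is accumulated separately and only committed when it is actually smaller than the original section. A simplified sketch of that decision, not part of the patch and with hypothetical names:

#include <cstddef>
#include <utility>
#include <vector>

struct CompressedOutput {
  std::vector<std::vector<unsigned char>> shards;
  size_t uncompressedSize = 0;
};

// Returns the final section size: the original size when compression is rejected,
// or the compressed size when the shards are committed into `out`.
size_t maybeCompress(size_t originalSize, size_t headerSize,
                     std::vector<std::vector<unsigned char>> candidateShards,
                     CompressedOutput& out) {
  size_t compressedSize = headerSize;
  for (const auto& shard : candidateShards)
    compressedSize += shard.size();
  if (compressedSize >= originalSize)
    return originalSize; // keep the section uncompressed
  out.uncompressedSize = originalSize;
  out.shards = std::move(candidateShards);
  return compressedSize;
}

int main() {
  CompressedOutput out;
  // A 3-byte payload plus a 64-byte header is larger than a 16-byte section,
  // so the section stays uncompressed.
  return maybeCompress(16, 64, {{1, 2, 3}}, out) == 16 ? 0 : 1;
}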
diff --git a/lld/docs/ld.lld.1 b/lld/docs/ld.lld.1
index 0df13f0..da3b926 100644
--- a/lld/docs/ld.lld.1
+++ b/lld/docs/ld.lld.1
@@ -148,6 +148,7 @@ Alias for
.Fl -color-diagnostics Ns = Ns Cm auto .
.It Fl -compress-debug-sections Ns = Ns Ar value
Compress DWARF debug sections.
+The sections remain uncompressed if compressed content would be larger.
.Cm value
may be
.Pp
@@ -163,6 +164,7 @@ Use the default compression level in zstd.
.Pp
.It Fl -compress-sections Ns = Ns Ar section-glob={none,zlib,zstd}[:level]
Compress output sections that match the glob and do not have the SHF_ALLOC flag.
+The matched sections remain uncompressed if compressed content would be larger.
The compression level is
.Cm level
(if specified) or a default speed-focused level.
@@ -420,9 +422,7 @@ Disable string merging.
.It Cm 1
Enable string merging.
.It Cm 2
-Enable string tail merging. If
-.Fl -compress-debug-sections
-is given, compress debug sections at compression level 6 instead of 1.
+Enable string tail merging.
.El
.Pp
.Fl O Ns Cm 1
diff --git a/lld/test/ELF/aarch64-feature-gcs.s b/lld/test/ELF/aarch64-feature-gcs.s
new file mode 100644
index 0000000..7a08673
--- /dev/null
+++ b/lld/test/ELF/aarch64-feature-gcs.s
@@ -0,0 +1,134 @@
+# REQUIRES: aarch64
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu func1-gcs.s -o func1-gcs.o
+# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu func2.s -o func2.o
+# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu func2-gcs.s -o func2-gcs.o
+# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu func3.s -o func3.o
+# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu func3-gcs.s -o func3-gcs.o
+
+## GCS should be enabled when it's enabled in all inputs or when it's forced on.
+
+# RUN: ld.lld func1-gcs.o func2-gcs.o func3-gcs.o -o gcs
+# RUN: llvm-readelf -n gcs | FileCheck --check-prefix GCS %s
+# RUN: ld.lld func1-gcs.o func3-gcs.o --shared -o gcs.so
+# RUN: llvm-readelf -n gcs.so | FileCheck --check-prefix GCS %s
+# RUN: ld.lld func1-gcs.o func2.o func3-gcs.o -o force-gcs -z gcs=always
+# RUN: llvm-readelf -n force-gcs | FileCheck --check-prefix GCS %s
+# RUN: ld.lld func2-gcs.o func3.o --shared -o force-gcs.so -z gcs=always
+# RUN: llvm-readelf -n force-gcs.so | FileCheck --check-prefix GCS %s
+# RUN: ld.lld func2-gcs.o func3.o --shared -o force-gcs2.so -z gcs=never -z gcs=always
+# RUN: llvm-readelf -n force-gcs2.so | FileCheck --check-prefix GCS %s
+
+# GCS: Properties: aarch64 feature: GCS
+
+## GCS should not be enabled if it's not enabled in at least one input.
+
+# RUN: ld.lld func1-gcs.o func2.o func3-gcs.o -o no-gcs
+# RUN: llvm-readelf -n no-gcs | count 0
+# RUN: ld.lld func2-gcs.o func3.o --shared -o no-gcs.so
+
+## GCS should be disabled with gcs=never, even if GCS is present in all inputs.
+
+# RUN: ld.lld func1-gcs.o func2-gcs.o func3-gcs.o -z gcs=never -o never-gcs
+# RUN: llvm-readelf -n never-gcs | count 0
+# RUN: ld.lld func1-gcs.o func2-gcs.o func3-gcs.o -z gcs=always -z gcs=never -o never-gcs2
+# RUN: llvm-readelf -n never-gcs2 | count 0
+
+## gcs-report should report any input files that don't have the gcs property.
+
+# RUN: ld.lld func1-gcs.o func2.o func3-gcs.o -o /dev/null -z gcs-report=warning 2>&1 | FileCheck --check-prefix=REPORT-WARN %s
+# RUN: ld.lld func1-gcs.o func2.o func3-gcs.o -o /dev/null -z gcs-report=warning -z gcs=always 2>&1 | FileCheck --check-prefix=REPORT-WARN %s
+# RUN: ld.lld func1-gcs.o func2.o func3-gcs.o -o /dev/null -z gcs-report=warning -z gcs=never 2>&1 | FileCheck --check-prefix=REPORT-WARN %s
+# RUN: not ld.lld func2-gcs.o func3.o --shared -o /dev/null -z gcs-report=error 2>&1 | FileCheck --check-prefix=REPORT-ERROR %s
+# RUN: not ld.lld func2-gcs.o func3.o --shared -o /dev/null -z gcs-report=error -z gcs=always 2>&1 | FileCheck --check-prefix=REPORT-ERROR %s
+# RUN: not ld.lld func2-gcs.o func3.o --shared -o /dev/null -z gcs-report=error -z gcs=never 2>&1 | FileCheck --check-prefix=REPORT-ERROR %s
+# RUN: ld.lld func1-gcs.o func2-gcs.o func3-gcs.o -o /dev/null -z gcs-report=warning 2>&1 | count 0
+# RUN: ld.lld func1-gcs.o func2-gcs.o func3-gcs.o -o /dev/null -z gcs-report=warning -z gcs=always 2>&1 | count 0
+# RUN: ld.lld func1-gcs.o func2-gcs.o func3-gcs.o -o /dev/null -z gcs-report=warning -z gcs=never 2>&1 | count 0
+
+# REPORT-WARN: warning: func2.o: -z gcs-report: file does not have GNU_PROPERTY_AARCH64_FEATURE_1_GCS property
+# REPORT-ERROR: error: func3.o: -z gcs-report: file does not have GNU_PROPERTY_AARCH64_FEATURE_1_GCS property
+
+## An invalid gcs option should give an error
+# RUN: not ld.lld func1-gcs.o func2-gcs.o func3-gcs.o -z gcs=nonsense 2>&1 | FileCheck --check-prefix=INVALID %s
+
+# INVALID: error: unknown -z gcs= value: nonsense
+
+#--- func1-gcs.s
+.section ".note.gnu.property", "a"
+.long 4
+.long 0x10
+.long 0x5
+.asciz "GNU"
+
+.long 0xc0000000 // GNU_PROPERTY_AARCH64_FEATURE_1_AND
+.long 4
+.long 4 // GNU_PROPERTY_AARCH64_FEATURE_1_GCS
+.long 0
+
+.text
+.globl _start
+.type func1,%function
+func1:
+ bl func2
+ ret
+
+#--- func2.s
+
+.text
+.globl func2
+.type func2,@function
+func2:
+ .globl func3
+ .type func3, @function
+ bl func3
+ ret
+
+#--- func2-gcs.s
+
+.section ".note.gnu.property", "a"
+.long 4
+.long 0x10
+.long 0x5
+.asciz "GNU"
+
+.long 0xc0000000 // GNU_PROPERTY_AARCH64_FEATURE_1_AND
+.long 4
+.long 4 // GNU_PROPERTY_AARCH64_FEATURE_1_GCS
+.long 0
+
+.text
+.globl func2
+.type func2,@function
+func2:
+ .globl func3
+ .type func3, @function
+ bl func3
+ ret
+
+#--- func3.s
+
+.text
+.globl func3
+.type func3,@function
+func3:
+ ret
+
+#--- func3-gcs.s
+
+.section ".note.gnu.property", "a"
+.long 4
+.long 0x10
+.long 0x5
+.asciz "GNU"
+
+.long 0xc0000000 // GNU_PROPERTY_AARCH64_FEATURE_1_AND
+.long 4
+.long 4 // GNU_PROPERTY_AARCH64_FEATURE_1_GCS
+.long 0
+
+.text
+.globl func3
+.type func3,@function
+func3:
+ ret
diff --git a/lld/test/ELF/avr-reloc-error.s b/lld/test/ELF/avr-reloc-error.s
index 0a30f68..f177e44 100644
--- a/lld/test/ELF/avr-reloc-error.s
+++ b/lld/test/ELF/avr-reloc-error.s
@@ -3,7 +3,7 @@
# RUN: rm -rf %t && split-file %s %t && cd %t
# RUN: llvm-mc -filetype=obj -triple=avr -mcpu=atmega328 avr-pcrel-7.s -o avr-pcrel-7.o
-# RUN: not ld.lld avr-pcrel-7.o -o /dev/null -Ttext=0x1000 --defsym=callee0=0x1040 --defsym=callee1=0x1044 --defsym=callee2=0x100f 2>&1 | \
+# RUN: not ld.lld avr-pcrel-7.o -o /dev/null -Ttext=0x1000 --defsym=callee0=0x1040 --defsym=callee1=0x1084 --defsym=callee2=0x100f 2>&1 | \
# RUN: FileCheck %s --check-prefix=PCREL7
# RUN: llvm-mc -filetype=obj -triple=avr -mcpu=atmega328 avr-pcrel-13.s -o avr-pcrel-13.o
# RUN: not ld.lld avr-pcrel-13.o -o /dev/null -Ttext=0x1000 --defsym=callee0=0x2000 --defsym=callee1=0x2004 --defsym=callee2=0x100f 2>&1 | \
@@ -20,7 +20,7 @@
__start:
# PCREL7-NOT: callee0
-# PCREL7: error: {{.*}} relocation R_AVR_7_PCREL out of range: {{.*}} is not in [-64, 63]; references 'callee1'
+# PCREL7: error: {{.*}} relocation R_AVR_7_PCREL out of range: {{.*}} is not in [-128, 127]; references 'callee1'
# PCREL7: error: {{.*}} improper alignment for relocation R_AVR_7_PCREL: {{.*}} is not aligned to 2 bytes
brne callee0
breq callee1
@@ -34,7 +34,6 @@ brlt callee2
__start:
# PCREL13-NOT: callee0
-# PCREL13: error: {{.*}} relocation R_AVR_13_PCREL out of range: {{.*}} is not in [-4096, 4095]; references 'callee1'
# PCREL13: error: {{.*}} improper alignment for relocation R_AVR_13_PCREL: {{.*}} is not aligned to 2 bytes
rjmp callee0
rcall callee1
diff --git a/lld/test/ELF/avr-reloc.s b/lld/test/ELF/avr-reloc.s
index 172c0e0..ec088ea 100644
--- a/lld/test/ELF/avr-reloc.s
+++ b/lld/test/ELF/avr-reloc.s
@@ -82,6 +82,12 @@ sbic b, 1 ; R_AVR_PORT5
; CHECK-NEXT: rjmp .-36
; CHECK-NEXT: breq .+26
; CHECK-NEXT: breq .-40
+; CHECK-NEXT: rjmp .-4096
+; CHECK-NEXT: rjmp .+4094
+; CHECK-NEXT: rjmp .+4094
+; CHECK-NEXT: rjmp .-4096
+; CHECK-NEXT: breq .-128
+; CHECK-NEXT: breq .+126
; HEX-LABEL: section .PCREL:
; HEX-NEXT: 0fc0eecf 69f061f3
foo:
@@ -89,6 +95,12 @@ rjmp foo + 32 ; R_AVR_13_PCREL
rjmp foo - 32 ; R_AVR_13_PCREL
breq foo + 32 ; R_AVR_7_PCREL
breq foo - 32 ; R_AVR_7_PCREL
+rjmp 1f - 4096 $ 1: ; R_AVR_13_PCREL
+rjmp 1f + 4094 $ 1: ; R_AVR_13_PCREL
+rjmp 1f - 4098 $ 1: ; R_AVR_13_PCREL (overflow)
+rjmp 1f + 4096 $ 1: ; R_AVR_13_PCREL (overflow)
+breq 1f - 128 $ 1: ; R_AVR_7_PCREL
+breq 1f + 126 $ 1: ; R_AVR_7_PCREL
.section .LDSSTS,"ax",@progbits
; CHECK-LABEL: section .LDSSTS:
diff --git a/lld/test/ELF/compress-debug-sections-zstd.s b/lld/test/ELF/compress-debug-sections-zstd.s
index 97ab192..d9f29af 100644
--- a/lld/test/ELF/compress-debug-sections-zstd.s
+++ b/lld/test/ELF/compress-debug-sections-zstd.s
@@ -3,22 +3,25 @@
# RUN: llvm-mc -filetype=obj -triple=x86_64 --compress-debug-sections=zstd %s -o %t.o
# RUN: ld.lld %t.o -o %t.so -shared
-# RUN: llvm-readelf -S -x .debug_str %t.so | FileCheck %s
+# RUN: llvm-readelf -S -p .debug_str %t.so | FileCheck %s
# CHECK: .debug_str PROGBITS [[#%x,]] [[#%x,]] [[#%x,]] 01 MS 0 0 1
-# CHECK: Hex dump of section '.debug_str':
-# CHECK-NEXT: 0x00000000 73686f72 7420756e 7369676e 65642069 short unsigned i
-# CHECK-NEXT: 0x00000010 6e740075 6e736967 6e656420 63686172 nt.unsigned char
-# CHECK-NEXT: 0x00000020 00636861 72006c6f 6e672075 6e736967 .char.long unsig
-# CHECK-NEXT: 0x00000030 6e656420 696e7400 756e7369 676e6564 ned int.unsigned
-# CHECK-NEXT: 0x00000040 20696e74 00 int.
+# CHECK: String dump of section '.debug_str':
+# CHECK-NEXT: [ 0] {{A+}}
+# CHECK-NEXT: [ 81] short unsigned int
+# CHECK-NEXT: [ 94] unsigned char
+# CHECK-NEXT: [ a2] char
+# CHECK-NEXT: [ a7] long unsigned int
+# CHECK-NEXT: [ b9] unsigned int
# RUN: ld.lld %t.o -o %t.so -shared --compress-debug-sections=zstd
# RUN: llvm-readelf -S %t.so | FileCheck %s --check-prefix=OUTPUT-SEC
# RUN: llvm-objcopy --decompress-debug-sections %t.so
-# RUN: llvm-readelf -S -x .debug_str %t.so | FileCheck %s
+# RUN: llvm-readelf -S -p .debug_str %t.so | FileCheck %s
-# OUTPUT-SEC: .debug_str PROGBITS [[#%x,]] [[#%x,]] [[#%x,]] 01 MSC 0 0 1
+# OUTPUT-SEC: .debug_str PROGBITS [[#%x,]] [[#%x,]] [[#%x,]] 01 MSC 0 0 1
+# OUTPUT-SEC-NEXT: .debug_frame PROGBITS [[#%x,]] [[#%x,]] 000000 00 0 0 1
+# OUTPUT-SEC-NEXT: .debug_loc PROGBITS [[#%x,]] [[#%x,]] 000010 00 0 0 1
.section .debug_str,"MS",@progbits,1
.LASF2:
@@ -31,3 +34,11 @@
.string "char"
.LASF1:
.string "unsigned char"
+.Lunused:
+ .fill 128, 1, 0x41
+ .byte 0
+
+## Test sections where compressed content would be larger.
+.section .debug_frame,""
+.section .debug_loc,""
+.space 16
diff --git a/lld/test/ELF/compress-sections-special.s b/lld/test/ELF/compress-sections-special.s
index 80c61fe..7e474ac 100644
--- a/lld/test/ELF/compress-sections-special.s
+++ b/lld/test/ELF/compress-sections-special.s
@@ -14,7 +14,7 @@
# CHECK: warning: {{.*}}: unable to get the string table for the SHT_SYMTAB section: SHT_STRTAB string table section
# CHECK: Hex dump of section '.strtab':
-# CHECK-NEXT: 01000000 00000000 1a000000 00000000
+# CHECK-NEXT: 01000000 00000000 5c000000 00000000
# CHECK-NEXT: 01000000 00000000 {{.*}}
# RUN: not ld.lld -shared a.o --compress-sections .dynstr=zlib 2>&1 | FileCheck %s --check-prefix=ERR-ALLOC
@@ -25,6 +25,8 @@ _start:
l0:
g0:
g1:
+.globl ggggggggggggggggggggggggggggggg0
+.globl ggggggggggggggggggggggggggggggg1
.section nonalloc0,""
.quad .text+1
diff --git a/lld/test/ELF/compress-sections.s b/lld/test/ELF/compress-sections.s
index aa30c7a..aaad314 100644
--- a/lld/test/ELF/compress-sections.s
+++ b/lld/test/ELF/compress-sections.s
@@ -11,10 +11,11 @@
# CHECK1-NEXT: .text PROGBITS [[#%x,TEXT:]] [[#%x,]] [[#%x,]] 00 AX 0 0 4
# CHECK1: nonalloc0 PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 8
# CHECK1-NEXT: nonalloc1 PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 8
+# CHECK1-NEXT: smallc0 PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 8
# CHECK1-NEXT: .debug_str PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 01 MS 0 0 1
-# CHECK1: 0000000000000010 0 NOTYPE LOCAL DEFAULT [[#]] (nonalloc0) sym0
-# CHECK1: 0000000000000008 0 NOTYPE LOCAL DEFAULT [[#]] (nonalloc1) sym1
+# CHECK1: 0000000000000090 0 NOTYPE LOCAL DEFAULT [[#]] (nonalloc0) sym0
+# CHECK1: 0000000000000088 0 NOTYPE LOCAL DEFAULT [[#]] (nonalloc1) sym1
# RUN: ld.lld -pie a.o --compress-sections '*c0=zlib' --compress-sections .debug_str=zstd:3 -o out2
# RUN: llvm-readelf -SrsX -x nonalloc0 -x .debug_str out2 | FileCheck %s --check-prefix=CHECK2
@@ -24,15 +25,16 @@
# CHECK2-NEXT: foo1 PROGBITS [[#%x,FOO1:]] [[#%x,]] [[#%x,]] 00 A 0 0 8
# CHECK2-NEXT: .text PROGBITS [[#%x,TEXT:]] [[#%x,]] [[#%x,]] 00 AX 0 0 4
# CHECK2: nonalloc0 PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 C 0 0 1
-# CHECK2-NEXT: nonalloc1 PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 8
+# CHECK2-NEXT: nonalloc1 PROGBITS 0000000000000000 [[#%x,]] 000088 00 0 0 8
+# CHECK2-NEXT: smallc0 PROGBITS 0000000000000000 [[#%x,]] 00000c 00 0 0 1
# CHECK2-NEXT: .debug_str PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 01 MSC 0 0 1
-# CHECK2: 0000000000000010 0 NOTYPE LOCAL DEFAULT [[#]] (nonalloc0) sym0
-# CHECK2: 0000000000000008 0 NOTYPE LOCAL DEFAULT [[#]] (nonalloc1) sym1
+# CHECK2: 0000000000000090 0 NOTYPE LOCAL DEFAULT [[#]] (nonalloc0) sym0
+# CHECK2: 0000000000000088 0 NOTYPE LOCAL DEFAULT [[#]] (nonalloc1) sym1
# CHECK2: Hex dump of section 'nonalloc0':
-## zlib with ch_size=0x10
-# CHECK2-NEXT: 01000000 00000000 10000000 00000000
+## zlib with ch_size=0x90
+# CHECK2-NEXT: 01000000 00000000 90000000 00000000
# CHECK2-NEXT: 01000000 00000000 {{.*}}
# CHECK2: Hex dump of section '.debug_str':
## zstd with ch_size=0x38
@@ -80,20 +82,28 @@ _start:
.balign 8
.quad .text-.
.quad .text-.
+.space 128
.section foo1,"a"
.balign 8
.quad .text-.
.quad .text-.
+.space 128
.section nonalloc0,""
.balign 8
.quad .text+1
.quad .text+2
+.space 128
sym0:
.section nonalloc1,""
.balign 8
.quad 42
+.space 128
sym1:
+.section smallc0,""
+.balign 8
+.space 12
+
.section .debug_str,"MS",@progbits,1
.Linfo_string0:
.asciz "AAAAAAAAAAAAAAAAAAAAAAAAAAA"
diff --git a/lld/test/ELF/compressed-debug-level.test b/lld/test/ELF/compressed-debug-level.test
index ce3a194..5a4d37e 100644
--- a/lld/test/ELF/compressed-debug-level.test
+++ b/lld/test/ELF/compressed-debug-level.test
@@ -18,8 +18,8 @@
# RUN: llvm-readelf --sections %t.6 | FileCheck -check-prefixes=HEADER,LEVEL6 %s
# HEADER: [Nr] Name Type Address Off Size
-# LEVEL1: [ 1] .debug_info PROGBITS 00000000 000094 00001{{[bc]}}
-# LEVEL6: [ 1] .debug_info PROGBITS 00000000 000094 00001a
+# LEVEL1: [ 1] .debug_info PROGBITS 00000000 000094 0000{{1[def]|21}}
+# LEVEL6: [ 1] .debug_info PROGBITS 00000000 000094 00001{{[abc]}}
## A little arbitrary debug section which has a different size after
## applying compression of level 1 and 6.
@@ -33,4 +33,4 @@ FileHeader:
Sections:
- Name: .debug_info
Type: SHT_PROGBITS
- Content: '010101010101010201010201'
+ Content: '010101010101010201010201010101010101010201010201010101010101010201010201'
diff --git a/lld/test/ELF/linkerscript/compress-debug-sections.s b/lld/test/ELF/linkerscript/compress-debug-sections.s
index fe1c66d..8d06689 100644
--- a/lld/test/ELF/linkerscript/compress-debug-sections.s
+++ b/lld/test/ELF/linkerscript/compress-debug-sections.s
@@ -34,3 +34,5 @@
.section .debug_str,"MS",@progbits,1
.asciz "AAA"
.asciz "BBB"
+ .fill 64,1,0x41
+ .byte 0
diff --git a/lld/test/ELF/linkerscript/compress-sections.s b/lld/test/ELF/linkerscript/compress-sections.s
index 9b4574a..5131fa7 100644
--- a/lld/test/ELF/linkerscript/compress-sections.s
+++ b/lld/test/ELF/linkerscript/compress-sections.s
@@ -10,10 +10,11 @@
# CHECK-NEXT: str PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 01 MSC 0 0 1
# CHECK: 0000000000000000 0 NOTYPE GLOBAL DEFAULT [[#]] (nonalloc) nonalloc_start
-# CHECK: 0000000000000023 0 NOTYPE GLOBAL DEFAULT [[#]] (nonalloc) nonalloc_end
+# CHECK: 0000000000000063 0 NOTYPE GLOBAL DEFAULT [[#]] (nonalloc) nonalloc_end
# CHECK: String dump of section 'str':
# CHECK-NEXT: [ 0] AAA
-# CHECK-NEXT: [ 4] BBB
+# CHECK-NEXT: [ 4] {{a+}}
+# CHECK-NEXT: [ 45] BBB
## TODO The uncompressed size of 'nonalloc' is dependent on linker script
## commands, which is not handled. We should report an error.
@@ -28,6 +29,7 @@ _start:
.balign 8
.quad .text
.quad .text
+.space 64
.section nonalloc1,""
.balign 8
.quad 42
@@ -35,6 +37,8 @@ _start:
.section str,"MS",@progbits,1
.asciz "AAA"
.asciz "BBB"
+ .fill 64,1,0x61
+ .byte 0
#--- a.lds
SECTIONS {
diff --git a/lld/test/wasm/shared64.s b/lld/test/wasm/shared64.s
index 3401fae..73f7743 100644
--- a/lld/test/wasm/shared64.s
+++ b/lld/test/wasm/shared64.s
@@ -154,6 +154,7 @@ get_local_func_address:
# CHECK-NEXT: Index: 0
# CHECK-NEXT: ElemType: FUNCREF
# CHECK-NEXT: Limits:
+# CHECK-NEXT: Flags: [ IS_64 ]
# CHECK-NEXT: Minimum: 0x2
# CHECK-NEXT: - Module: env
# CHECK-NEXT: Field: __stack_pointer
@@ -170,11 +171,6 @@ get_local_func_address:
# CHECK-NEXT: Kind: GLOBAL
# CHECK-NEXT: GlobalType: I64
# CHECK-NEXT: GlobalMutable: false
-# CHECK-NEXT: - Module: env
-# CHECK-NEXT: Field: __table_base32
-# CHECK-NEXT: Kind: GLOBAL
-# CHECK-NEXT: GlobalType: I32
-# CHECK-NEXT: GlobalMutable: false
# CHECK-NEXT: - Module: GOT.mem
# CHECK-NEXT: Field: indirect_func
# CHECK-NEXT: Kind: GLOBAL
@@ -209,7 +205,7 @@ get_local_func_address:
# CHECK-NEXT: Segments:
# CHECK-NEXT: - Offset:
# CHECK-NEXT: Opcode: GLOBAL_GET
-# CHECK-NEXT: Index: 3
+# CHECK-NEXT: Index: 2
# CHECK-NEXT: Functions: [ 3, 2 ]
# check the generated code in __wasm_call_ctors and __wasm_apply_data_relocs functions
@@ -223,7 +219,7 @@ get_local_func_address:
# DIS-NEXT: i64.const 4
# DIS-NEXT: global.get 1
# DIS-NEXT: i64.add
-# DIS-NEXT: global.get 5
+# DIS-NEXT: global.get 4
# DIS-NEXT: i64.store 0:p2align=2
# DIS-NEXT: i64.const 12
# DIS-NEXT: global.get 1
@@ -242,12 +238,12 @@ get_local_func_address:
# DIS-NEXT: i64.const 24
# DIS-NEXT: global.get 1
# DIS-NEXT: i64.add
-# DIS-NEXT: global.get 6
+# DIS-NEXT: global.get 5
# DIS-NEXT: i64.store 0:p2align=2
# DIS-NEXT: i64.const 32
# DIS-NEXT: global.get 1
# DIS-NEXT: i64.add
-# DIS-NEXT: global.get 7
+# DIS-NEXT: global.get 6
# DIS-NEXT: i32.const 4
# DIS-NEXT: i32.add
# DIS-NEXT: i32.store 0
diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp
index d5d763b..cc79f80 100644
--- a/lld/wasm/Driver.cpp
+++ b/lld/wasm/Driver.cpp
@@ -870,13 +870,6 @@ static void createSyntheticSymbols() {
WasmSym::tableBase = createUndefinedGlobal("__table_base", globalType);
WasmSym::memoryBase->markLive();
WasmSym::tableBase->markLive();
- if (is64) {
- WasmSym::tableBase32 =
- createUndefinedGlobal("__table_base32", &globalTypeI32);
- WasmSym::tableBase32->markLive();
- } else {
- WasmSym::tableBase32 = nullptr;
- }
} else {
// For non-PIC code
WasmSym::stackPointer = createGlobalVariable("__stack_pointer", true);
@@ -923,9 +916,6 @@ static void createOptionalSymbols() {
WasmSym::heapEnd = symtab->addOptionalDataSymbol("__heap_end");
WasmSym::definedMemoryBase = symtab->addOptionalDataSymbol("__memory_base");
WasmSym::definedTableBase = symtab->addOptionalDataSymbol("__table_base");
- if (config->is64.value_or(false))
- WasmSym::definedTableBase32 =
- symtab->addOptionalDataSymbol("__table_base32");
}
// For non-shared memory programs we still need to define __tls_base since we
diff --git a/lld/wasm/Symbols.cpp b/lld/wasm/Symbols.cpp
index ace6bad..687728d 100644
--- a/lld/wasm/Symbols.cpp
+++ b/lld/wasm/Symbols.cpp
@@ -96,8 +96,6 @@ GlobalSymbol *WasmSym::tlsSize;
GlobalSymbol *WasmSym::tlsAlign;
UndefinedGlobal *WasmSym::tableBase;
DefinedData *WasmSym::definedTableBase;
-UndefinedGlobal *WasmSym::tableBase32;
-DefinedData *WasmSym::definedTableBase32;
UndefinedGlobal *WasmSym::memoryBase;
DefinedData *WasmSym::definedMemoryBase;
TableSymbol *WasmSym::indirectFunctionTable;
diff --git a/lld/wasm/Symbols.h b/lld/wasm/Symbols.h
index 38586bb..65a062b 100644
--- a/lld/wasm/Symbols.h
+++ b/lld/wasm/Symbols.h
@@ -603,11 +603,6 @@ struct WasmSym {
// Used in PIC code for offset of indirect function table
static UndefinedGlobal *tableBase;
static DefinedData *definedTableBase;
- // 32-bit copy in wasm64 to work around init expr limitations.
- // These can potentially be removed again once we have
- // https://github.com/WebAssembly/extended-const
- static UndefinedGlobal *tableBase32;
- static DefinedData *definedTableBase32;
// __memory_base
// Used in PIC code for offset of global data
diff --git a/lld/wasm/SyntheticSections.cpp b/lld/wasm/SyntheticSections.cpp
index 72e2559..b359e0f 100644
--- a/lld/wasm/SyntheticSections.cpp
+++ b/lld/wasm/SyntheticSections.cpp
@@ -584,12 +584,10 @@ void ElemSection::writeBody() {
initExpr.Extended = false;
if (ctx.isPic) {
initExpr.Inst.Opcode = WASM_OPCODE_GLOBAL_GET;
- initExpr.Inst.Value.Global =
- (config->is64.value_or(false) ? WasmSym::tableBase32
- : WasmSym::tableBase)
- ->getGlobalIndex();
+ initExpr.Inst.Value.Global = WasmSym::tableBase->getGlobalIndex();
} else {
- initExpr.Inst.Opcode = WASM_OPCODE_I32_CONST;
+ bool is64 = config->is64.value_or(false);
+ initExpr.Inst.Opcode = is64 ? WASM_OPCODE_I64_CONST : WASM_OPCODE_I32_CONST;
initExpr.Inst.Value.Int32 = config->tableBase;
}
writeInitExpr(os, initExpr);
diff --git a/lld/wasm/Writer.cpp b/lld/wasm/Writer.cpp
index 55eff99..7a01576 100644
--- a/lld/wasm/Writer.cpp
+++ b/lld/wasm/Writer.cpp
@@ -939,6 +939,8 @@ static void finalizeIndirectFunctionTable() {
limits.Flags |= WASM_LIMITS_FLAG_HAS_MAX;
limits.Maximum = limits.Minimum;
}
+ if (config->is64.value_or(false))
+ limits.Flags |= WASM_LIMITS_FLAG_IS_64;
WasmSym::indirectFunctionTable->setLimits(limits);
}
@@ -1691,12 +1693,8 @@ void Writer::createSyntheticSectionsPostLayout() {
void Writer::run() {
// For PIC code the table base is assigned dynamically by the loader.
// For non-PIC, we start at 1 so that accessing table index 0 always traps.
- if (!ctx.isPic) {
- if (WasmSym::definedTableBase)
- WasmSym::definedTableBase->setVA(config->tableBase);
- if (WasmSym::definedTableBase32)
- WasmSym::definedTableBase32->setVA(config->tableBase);
- }
+ if (!ctx.isPic && WasmSym::definedTableBase)
+ WasmSym::definedTableBase->setVA(config->tableBase);
log("-- createOutputSegments");
createOutputSegments();
diff --git a/lldb/cmake/modules/LLDBConfig.cmake b/lldb/cmake/modules/LLDBConfig.cmake
index 3c6223b..6458f2e 100644
--- a/lldb/cmake/modules/LLDBConfig.cmake
+++ b/lldb/cmake/modules/LLDBConfig.cmake
@@ -187,24 +187,18 @@ include_directories("${CMAKE_CURRENT_BINARY_DIR}/../clang/include")
# form -W<foo>, and if supported, add the corresponding -Wno-<foo> option.
# Disable GCC warnings
-check_cxx_compiler_flag("-Wdeprecated-declarations" CXX_SUPPORTS_DEPRECATED_DECLARATIONS)
-append_if(CXX_SUPPORTS_DEPRECATED_DECLARATIONS "-Wno-deprecated-declarations" CMAKE_CXX_FLAGS)
-
-check_cxx_compiler_flag("-Wunknown-pragmas" CXX_SUPPORTS_UNKNOWN_PRAGMAS)
-append_if(CXX_SUPPORTS_UNKNOWN_PRAGMAS "-Wno-unknown-pragmas" CMAKE_CXX_FLAGS)
-
-check_cxx_compiler_flag("-Wstrict-aliasing" CXX_SUPPORTS_STRICT_ALIASING)
-append_if(CXX_SUPPORTS_STRICT_ALIASING "-Wno-strict-aliasing" CMAKE_CXX_FLAGS)
+append("-Wno-deprecated-declarations" CMAKE_CXX_FLAGS)
+append("-Wno-unknown-pragmas" CMAKE_CXX_FLAGS)
+append("-Wno-strict-aliasing" CMAKE_CXX_FLAGS)
check_cxx_compiler_flag("-Wstringop-truncation" CXX_SUPPORTS_STRINGOP_TRUNCATION)
append_if(CXX_SUPPORTS_STRINGOP_TRUNCATION "-Wno-stringop-truncation" CMAKE_CXX_FLAGS)
# Disable Clang warnings
-check_cxx_compiler_flag("-Wdeprecated-register" CXX_SUPPORTS_DEPRECATED_REGISTER)
-append_if(CXX_SUPPORTS_DEPRECATED_REGISTER "-Wno-deprecated-register" CMAKE_CXX_FLAGS)
-
-check_cxx_compiler_flag("-Wvla-extension" CXX_SUPPORTS_VLA_EXTENSION)
-append_if(CXX_SUPPORTS_VLA_EXTENSION "-Wno-vla-extension" CMAKE_CXX_FLAGS)
+if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ append("-Wno-deprecated-register" CMAKE_CXX_FLAGS)
+ append("-Wno-vla-extension" CMAKE_CXX_FLAGS)
+endif()
# Disable MSVC warnings
if( MSVC )
diff --git a/lldb/docs/resources/build.rst b/lldb/docs/resources/build.rst
index 09d3d15..33b6a6f 100644
--- a/lldb/docs/resources/build.rst
+++ b/lldb/docs/resources/build.rst
@@ -477,7 +477,6 @@ further by passing the appropriate cmake options, such as:
-DLLDB_ENABLE_PYTHON=0
-DLLDB_ENABLE_LIBEDIT=0
-DLLDB_ENABLE_CURSES=0
- -DLLVM_ENABLE_TERMINFO=0
(see :ref:`Optional Dependencies` for more)
diff --git a/lldb/include/lldb/Symbol/CompilerType.h b/lldb/include/lldb/Symbol/CompilerType.h
index 28c723a..70dacdc 100644
--- a/lldb/include/lldb/Symbol/CompilerType.h
+++ b/lldb/include/lldb/Symbol/CompilerType.h
@@ -436,7 +436,7 @@ public:
uint32_t *bitfield_bit_size_ptr = nullptr,
bool *is_bitfield_ptr = nullptr) const;
- CompilerType GetChildCompilerTypeAtIndex(
+ llvm::Expected<CompilerType> GetChildCompilerTypeAtIndex(
ExecutionContext *exe_ctx, size_t idx, bool transparent_pointers,
bool omit_empty_base_classes, bool ignore_array_bounds,
std::string &child_name, uint32_t &child_byte_size,
diff --git a/lldb/include/lldb/Symbol/TypeSystem.h b/lldb/include/lldb/Symbol/TypeSystem.h
index 7bcb8d6..b4025c1 100644
--- a/lldb/include/lldb/Symbol/TypeSystem.h
+++ b/lldb/include/lldb/Symbol/TypeSystem.h
@@ -359,7 +359,7 @@ public:
return CompilerDecl();
}
- virtual CompilerType GetChildCompilerTypeAtIndex(
+ virtual llvm::Expected<CompilerType> GetChildCompilerTypeAtIndex(
lldb::opaque_compiler_type_t type, ExecutionContext *exe_ctx, size_t idx,
bool transparent_pointers, bool omit_empty_base_classes,
bool ignore_array_bounds, std::string &child_name,
diff --git a/lldb/include/lldb/Target/Process.h b/lldb/include/lldb/Target/Process.h
index aac0cf5..637d34c 100644
--- a/lldb/include/lldb/Target/Process.h
+++ b/lldb/include/lldb/Target/Process.h
@@ -915,8 +915,8 @@ public:
/// \param[in] force_kill
/// Whether lldb should force a kill (instead of a detach) from
/// the inferior process. Normally if lldb launched a binary and
- /// Destory is called, lldb kills it. If lldb attached to a
- /// running process and Destory is called, lldb detaches. If
+ /// Destroy is called, lldb kills it. If lldb attached to a
+ /// running process and Destroy is called, lldb detaches. If
/// this behavior needs to be over-ridden, this is the bool that
/// can be used.
///
diff --git a/lldb/packages/Python/lldbsuite/test/dotest.py b/lldb/packages/Python/lldbsuite/test/dotest.py
index ebabf34..2e537e3 100644
--- a/lldb/packages/Python/lldbsuite/test/dotest.py
+++ b/lldb/packages/Python/lldbsuite/test/dotest.py
@@ -542,12 +542,6 @@ def setupSysPath():
lldbDAPExec = os.path.join(lldbDir, "lldb-dap")
if is_exe(lldbDAPExec):
os.environ["LLDBDAP_EXEC"] = lldbDAPExec
- else:
- if not configuration.shouldSkipBecauseOfCategories(["lldb-dap"]):
- print(
- "The 'lldb-dap' executable cannot be located. The lldb-dap tests can not be run as a result."
- )
- configuration.skip_categories.append("lldb-dap")
lldbPythonDir = None # The directory that contains 'lldb/__init__.py'
@@ -929,6 +923,24 @@ def checkPexpectSupport():
configuration.skip_categories.append("pexpect")
+def checkDAPSupport():
+ import lldb
+
+ if "LLDBDAP_EXEC" not in os.environ:
+ msg = (
+ "The 'lldb-dap' executable cannot be located and its tests will not be run."
+ )
+ elif lldb.remote_platform:
+ msg = "lldb-dap tests are not compatible with remote platforms and will not be run."
+ else:
+ msg = None
+
+ if msg:
+ if configuration.verbose:
+ print(msg)
+ configuration.skip_categories.append("lldb-dap")
+
+
def run_suite():
# On MacOS X, check to make sure that domain for com.apple.DebugSymbols defaults
# does not exist before proceeding to running the test suite.
@@ -1029,6 +1041,7 @@ def run_suite():
checkObjcSupport()
checkForkVForkSupport()
checkPexpectSupport()
+ checkDAPSupport()
skipped_categories_list = ", ".join(configuration.skip_categories)
print(
diff --git a/lldb/source/Breakpoint/BreakpointResolverFileLine.cpp b/lldb/source/Breakpoint/BreakpointResolverFileLine.cpp
index d7d8c71..16c4ee1 100644
--- a/lldb/source/Breakpoint/BreakpointResolverFileLine.cpp
+++ b/lldb/source/Breakpoint/BreakpointResolverFileLine.cpp
@@ -198,16 +198,16 @@ void BreakpointResolverFileLine::DeduceSourceMapping(
return;
Log *log = GetLog(LLDBLog::Breakpoints);
- const llvm::StringRef path_separator = llvm::sys::path::get_separator(
- m_location_spec.GetFileSpec().GetPathStyle());
// Check if "b" is a suffix of "a".
// And return std::nullopt if not or the new path
// of "a" after consuming "b" from the back.
auto check_suffix =
- [path_separator](llvm::StringRef a, llvm::StringRef b,
- bool case_sensitive) -> std::optional<llvm::StringRef> {
+ [](llvm::StringRef a, llvm::StringRef b,
+ bool case_sensitive) -> std::optional<llvm::StringRef> {
if (case_sensitive ? a.consume_back(b) : a.consume_back_insensitive(b)) {
- if (a.empty() || a.ends_with(path_separator)) {
+ // Note sc_file_dir and request_file_dir below are normalized
+ // and always contain the path separator '/'.
+ if (a.empty() || a.ends_with("/")) {
return a;
}
}
diff --git a/lldb/source/Commands/CommandObjectThread.cpp b/lldb/source/Commands/CommandObjectThread.cpp
index 4397ee1..db96ee2 100644
--- a/lldb/source/Commands/CommandObjectThread.cpp
+++ b/lldb/source/Commands/CommandObjectThread.cpp
@@ -114,8 +114,8 @@ public:
CommandObjectThreadBacktrace(CommandInterpreter &interpreter)
: CommandObjectIterateOverThreads(
interpreter, "thread backtrace",
- "Show thread call stacks. Defaults to the current thread, thread "
- "indexes can be specified as arguments.\n"
+ "Show backtraces of thread call stacks. Defaults to the current "
+ "thread, thread indexes can be specified as arguments.\n"
"Use the thread-index \"all\" to see all threads.\n"
"Use the thread-index \"unique\" to see threads grouped by unique "
"call stacks.\n"
diff --git a/lldb/source/Core/CMakeLists.txt b/lldb/source/Core/CMakeLists.txt
index 10525ac..f24dbbd 100644
--- a/lldb/source/Core/CMakeLists.txt
+++ b/lldb/source/Core/CMakeLists.txt
@@ -11,9 +11,6 @@ set(LLDB_LIBEDIT_LIBS)
if (LLDB_ENABLE_CURSES)
list(APPEND LLDB_CURSES_LIBS ${PANEL_LIBRARIES} ${CURSES_LIBRARIES})
- if(LLVM_ENABLE_TERMINFO)
- list(APPEND LLDB_CURSES_LIBS ${Terminfo_LIBRARIES})
- endif()
if (LLVM_BUILD_STATIC)
list(APPEND LLDB_CURSES_LIBS gpm)
endif()
diff --git a/lldb/source/Core/ValueObject.cpp b/lldb/source/Core/ValueObject.cpp
index f39bd07..1443d9d 100644
--- a/lldb/source/Core/ValueObject.cpp
+++ b/lldb/source/Core/ValueObject.cpp
@@ -505,15 +505,23 @@ ValueObject *ValueObject::CreateChildAtIndex(size_t idx,
uint64_t language_flags = 0;
const bool transparent_pointers = !synthetic_array_member;
- CompilerType child_compiler_type;
ExecutionContext exe_ctx(GetExecutionContextRef());
- child_compiler_type = GetCompilerType().GetChildCompilerTypeAtIndex(
- &exe_ctx, idx, transparent_pointers, omit_empty_base_classes,
- ignore_array_bounds, child_name_str, child_byte_size, child_byte_offset,
- child_bitfield_bit_size, child_bitfield_bit_offset, child_is_base_class,
- child_is_deref_of_parent, this, language_flags);
+ auto child_compiler_type_or_err =
+ GetCompilerType().GetChildCompilerTypeAtIndex(
+ &exe_ctx, idx, transparent_pointers, omit_empty_base_classes,
+ ignore_array_bounds, child_name_str, child_byte_size,
+ child_byte_offset, child_bitfield_bit_size, child_bitfield_bit_offset,
+ child_is_base_class, child_is_deref_of_parent, this, language_flags);
+ CompilerType child_compiler_type;
+ if (!child_compiler_type_or_err)
+ LLDB_LOG_ERROR(GetLog(LLDBLog::Types),
+ child_compiler_type_or_err.takeError(),
+ "could not find child: {0}");
+ else
+ child_compiler_type = *child_compiler_type_or_err;
+
if (child_compiler_type) {
if (synthetic_index)
child_byte_offset += child_byte_size * synthetic_index;
@@ -2624,16 +2632,23 @@ ValueObjectSP ValueObject::Dereference(Status &error) {
bool child_is_deref_of_parent = false;
const bool transparent_pointers = false;
CompilerType compiler_type = GetCompilerType();
- CompilerType child_compiler_type;
uint64_t language_flags = 0;
ExecutionContext exe_ctx(GetExecutionContextRef());
- child_compiler_type = compiler_type.GetChildCompilerTypeAtIndex(
+ CompilerType child_compiler_type;
+ auto child_compiler_type_or_err = compiler_type.GetChildCompilerTypeAtIndex(
&exe_ctx, 0, transparent_pointers, omit_empty_base_classes,
ignore_array_bounds, child_name_str, child_byte_size, child_byte_offset,
child_bitfield_bit_size, child_bitfield_bit_offset, child_is_base_class,
child_is_deref_of_parent, this, language_flags);
+ if (!child_compiler_type_or_err)
+ LLDB_LOG_ERROR(GetLog(LLDBLog::Types),
+ child_compiler_type_or_err.takeError(),
+ "could not find child: {0}");
+ else
+ child_compiler_type = *child_compiler_type_or_err;
+
if (child_compiler_type && child_byte_size) {
ConstString child_name;
if (!child_name_str.empty())
diff --git a/lldb/source/Core/ValueObjectConstResultImpl.cpp b/lldb/source/Core/ValueObjectConstResultImpl.cpp
index e2db3ac..493980d 100644
--- a/lldb/source/Core/ValueObjectConstResultImpl.cpp
+++ b/lldb/source/Core/ValueObjectConstResultImpl.cpp
@@ -17,6 +17,8 @@
#include "lldb/Target/ExecutionContext.h"
#include "lldb/Utility/DataBufferHeap.h"
#include "lldb/Utility/Endian.h"
+#include "lldb/Utility/LLDBLog.h"
+#include "lldb/Utility/Log.h"
#include "lldb/Utility/Scalar.h"
#include <string>
@@ -66,15 +68,21 @@ ValueObject *ValueObjectConstResultImpl::CreateChildAtIndex(
const bool transparent_pointers = !synthetic_array_member;
CompilerType compiler_type = m_impl_backend->GetCompilerType();
- CompilerType child_compiler_type;
ExecutionContext exe_ctx(m_impl_backend->GetExecutionContextRef());
- child_compiler_type = compiler_type.GetChildCompilerTypeAtIndex(
+ auto child_compiler_type_or_err = compiler_type.GetChildCompilerTypeAtIndex(
&exe_ctx, idx, transparent_pointers, omit_empty_base_classes,
ignore_array_bounds, child_name_str, child_byte_size, child_byte_offset,
child_bitfield_bit_size, child_bitfield_bit_offset, child_is_base_class,
child_is_deref_of_parent, m_impl_backend, language_flags);
+ CompilerType child_compiler_type;
+ if (!child_compiler_type_or_err)
+ LLDB_LOG_ERROR(GetLog(LLDBLog::Types),
+ child_compiler_type_or_err.takeError(),
+ "could not find child: {0}");
+ else
+ child_compiler_type = *child_compiler_type_or_err;
// One might think we should check that the size of the children
// is always strictly positive, hence we could avoid creating a
diff --git a/lldb/source/Host/common/Socket.cpp b/lldb/source/Host/common/Socket.cpp
index bd0c127..f9911cf 100644
--- a/lldb/source/Host/common/Socket.cpp
+++ b/lldb/source/Host/common/Socket.cpp
@@ -87,8 +87,7 @@ llvm::Error Socket::Initialize() {
if (err == 0) {
if (wsaData.wVersion < wVersion) {
WSACleanup();
- return llvm::make_error<llvm::StringError>(
- "WSASock version is not expected.", llvm::inconvertibleErrorCode());
+ return llvm::createStringError("WSASock version is not expected.");
}
} else {
return llvm::errorCodeToError(llvm::mapWindowsError(::WSAGetLastError()));
diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp
index 811726e..7f21f38 100644
--- a/lldb/source/Interpreter/CommandInterpreter.cpp
+++ b/lldb/source/Interpreter/CommandInterpreter.cpp
@@ -828,11 +828,11 @@ void CommandInterpreter::LoadCommandDictionary() {
std::unique_ptr<CommandObjectRegexCommand> bt_regex_cmd_up(
new CommandObjectRegexCommand(
*this, "_regexp-bt",
- "Show the current thread's call stack. Any numeric argument "
- "displays at most that many "
- "frames. The argument 'all' displays all threads. Use 'settings"
- " set frame-format' to customize the printing of individual frames "
- "and 'settings set thread-format' to customize the thread header.",
+ "Show backtrace of the current thread's call stack. Any numeric "
+ "argument displays at most that many frames. The argument 'all' "
+ "displays all threads. Use 'settings set frame-format' to customize "
+ "the printing of individual frames and 'settings set thread-format' "
+ "to customize the thread header.",
"bt [<digit> | all]", 0, false));
if (bt_regex_cmd_up) {
// accept but don't document "bt -c <number>" -- before bt was a regex
diff --git a/lldb/source/Interpreter/Options.cpp b/lldb/source/Interpreter/Options.cpp
index 51b7e6b..4e7d074 100644
--- a/lldb/source/Interpreter/Options.cpp
+++ b/lldb/source/Interpreter/Options.cpp
@@ -931,8 +931,7 @@ llvm::Expected<Args> Options::ParseAlias(const Args &args,
Option *long_options = GetLongOptions();
if (long_options == nullptr) {
- return llvm::make_error<llvm::StringError>("Invalid long options",
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError("Invalid long options");
}
std::string short_options = BuildShortOptions(long_options);
@@ -957,8 +956,7 @@ llvm::Expected<Args> Options::ParseAlias(const Args &args,
break;
if (val == '?') {
- return llvm::make_error<llvm::StringError>(
- "Unknown or ambiguous option", llvm::inconvertibleErrorCode());
+ return llvm::createStringError("Unknown or ambiguous option");
}
if (val == 0)
@@ -980,9 +978,8 @@ llvm::Expected<Args> Options::ParseAlias(const Args &args,
// See if the option takes an argument, and see if one was supplied.
if (long_options_index == -1) {
- return llvm::make_error<llvm::StringError>(
- llvm::formatv("Invalid option with value '{0}'.", char(val)).str(),
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError(
+ llvm::formatv("Invalid option with value '{0}'.", char(val)).str());
}
StreamString option_str;
@@ -995,11 +992,10 @@ llvm::Expected<Args> Options::ParseAlias(const Args &args,
switch (has_arg) {
case OptionParser::eRequiredArgument:
if (OptionParser::GetOptionArgument() == nullptr) {
- return llvm::make_error<llvm::StringError>(
+ return llvm::createStringError(
llvm::formatv("Option '{0}' is missing argument specifier.",
option_str.GetString())
- .str(),
- llvm::inconvertibleErrorCode());
+ .str());
}
[[fallthrough]];
case OptionParser::eOptionalArgument:
@@ -1008,12 +1004,11 @@ llvm::Expected<Args> Options::ParseAlias(const Args &args,
case OptionParser::eNoArgument:
break;
default:
- return llvm::make_error<llvm::StringError>(
+ return llvm::createStringError(
llvm::formatv("error with options table; invalid value in has_arg "
"field for option '{0}'.",
char(val))
- .str(),
- llvm::inconvertibleErrorCode());
+ .str());
}
// Find option in the argument list; also see if it was supposed to take an
// argument and if one was supplied. Remove option (and argument, if
@@ -1261,8 +1256,7 @@ llvm::Expected<Args> Options::Parse(const Args &args,
Status error;
Option *long_options = GetLongOptions();
if (long_options == nullptr) {
- return llvm::make_error<llvm::StringError>("Invalid long options.",
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError("Invalid long options.");
}
std::string short_options = BuildShortOptions(long_options);
@@ -1322,9 +1316,8 @@ llvm::Expected<Args> Options::Parse(const Args &args,
if (!platform_sp && require_validation) {
// Caller requires validation but we cannot validate as we don't have
// the mandatory platform against which to validate.
- return llvm::make_error<llvm::StringError>(
- "cannot validate options: no platform available",
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError(
+ "cannot validate options: no platform available");
}
bool validation_failed = false;
diff --git a/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp b/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp
index 173b561..eac0587 100644
--- a/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp
+++ b/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp
@@ -501,14 +501,12 @@ public:
CompilerType &type) {
RegisterContext *reg_ctx = thread.GetRegisterContext().get();
if (!reg_ctx)
- return llvm::make_error<llvm::StringError>(
- LOG_PREFIX "Failed to get RegisterContext",
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError(LOG_PREFIX
+ "Failed to get RegisterContext");
ProcessSP process_sp = thread.GetProcess();
if (!process_sp)
- return llvm::make_error<llvm::StringError>(
- LOG_PREFIX "GetProcess() failed", llvm::inconvertibleErrorCode());
+ return llvm::createStringError(LOG_PREFIX "GetProcess() failed");
return ReturnValueExtractor(thread, type, reg_ctx, process_sp);
}
@@ -836,7 +834,7 @@ private:
for (uint32_t i = 0; i < n; i++) {
std::string name;
uint32_t size;
- GetChildType(i, name, size);
+ (void)GetChildType(i, name, size);
// NOTE: the offset returned by GetChildCompilerTypeAtIndex()
// can't be used because it never considers alignment bytes
// between struct fields.
@@ -903,7 +901,8 @@ private:
}
// get child
- CompilerType GetChildType(uint32_t i, std::string &name, uint32_t &size) {
+ llvm::Expected<CompilerType> GetChildType(uint32_t i, std::string &name,
+ uint32_t &size) {
// GetChild constant inputs
const bool transparent_pointers = false;
const bool omit_empty_base_classes = true;
diff --git a/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp b/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp
index 9a6e135..2c9b3c4 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp
@@ -12,6 +12,7 @@
#include "Plugins/ExpressionParser/Clang/ClangPersistentVariables.h"
#include "Plugins/TypeSystem/Clang/TypeSystemClang.h"
#include "lldb/Core/ValueObject.h"
+#include "lldb/Core/ValueObjectConstResult.h"
#include "lldb/DataFormatters/FormattersHelpers.h"
#include "lldb/Symbol/CompilerType.h"
#include "lldb/Symbol/TypeSystem.h"
@@ -105,13 +106,16 @@ public:
bool child_is_deref_of_parent = false;
uint64_t language_flags = 0;
- const CompilerType child_type =
- m_block_struct_type.GetChildCompilerTypeAtIndex(
- &exe_ctx, idx, transparent_pointers, omit_empty_base_classes,
- ignore_array_bounds, child_name, child_byte_size, child_byte_offset,
- child_bitfield_bit_size, child_bitfield_bit_offset,
- child_is_base_class, child_is_deref_of_parent, value_object,
- language_flags);
+ auto child_type_or_err = m_block_struct_type.GetChildCompilerTypeAtIndex(
+ &exe_ctx, idx, transparent_pointers, omit_empty_base_classes,
+ ignore_array_bounds, child_name, child_byte_size, child_byte_offset,
+ child_bitfield_bit_size, child_bitfield_bit_offset, child_is_base_class,
+ child_is_deref_of_parent, value_object, language_flags);
+ if (!child_type_or_err)
+ return ValueObjectConstResult::Create(
+ exe_ctx.GetBestExecutionContextScope(),
+ Status(child_type_or_err.takeError()));
+ CompilerType child_type = *child_type_or_err;
ValueObjectSP struct_pointer_sp =
m_backend.Cast(m_block_struct_type.GetPointerType());
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp
index ec5b320..0929d49 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp
@@ -295,13 +295,13 @@ void lldb_private::formatters::LibcxxStdMapSyntheticFrontEnd::GetValueOffset(
bool child_is_base_class;
bool child_is_deref_of_parent;
uint64_t language_flags;
- if (tree_node_type
- .GetChildCompilerTypeAtIndex(
- nullptr, 4, true, true, true, child_name, child_byte_size,
- child_byte_offset, child_bitfield_bit_size,
- child_bitfield_bit_offset, child_is_base_class,
- child_is_deref_of_parent, nullptr, language_flags)
- .IsValid())
+ auto child_type =
+ llvm::expectedToStdOptional(tree_node_type.GetChildCompilerTypeAtIndex(
+ nullptr, 4, true, true, true, child_name, child_byte_size,
+ child_byte_offset, child_bitfield_bit_size,
+ child_bitfield_bit_offset, child_is_base_class,
+ child_is_deref_of_parent, nullptr, language_flags));
+ if (child_type && child_type->IsValid())
m_skip_size = (uint32_t)child_byte_offset;
}
}
diff --git a/lldb/source/Plugins/Process/NetBSD/NativeThreadNetBSD.cpp b/lldb/source/Plugins/Process/NetBSD/NativeThreadNetBSD.cpp
index f561c21..77b4301 100644
--- a/lldb/source/Plugins/Process/NetBSD/NativeThreadNetBSD.cpp
+++ b/lldb/source/Plugins/Process/NetBSD/NativeThreadNetBSD.cpp
@@ -180,8 +180,6 @@ void NativeThreadNetBSD::SetStepping() {
}
std::string NativeThreadNetBSD::GetName() {
- Log *log = GetLog(POSIXLog::Thread);
-
#ifdef PT_LWPSTATUS
struct ptrace_lwpstatus info = {};
info.pl_lwpid = m_tid;
@@ -193,6 +191,8 @@ std::string NativeThreadNetBSD::GetName() {
return info.pl_name;
#else
std::vector<struct kinfo_lwp> infos;
+ Log *log = GetLog(POSIXLog::Thread);
+
int mib[5] = {CTL_KERN, KERN_LWP, static_cast<int>(m_process.GetID()),
sizeof(struct kinfo_lwp), 0};
size_t size;
diff --git a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp
index 36812c2..30af934 100644
--- a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp
+++ b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp
@@ -250,6 +250,9 @@ Status ProcessElfCore::DoLoadCore() {
}
}
+ // Try to find the GNU build ID before we load the executable.
+ UpdateBuildIdForNTFileEntries();
+
// Core files are useless without the main executable. See if we can locate
// the main executable using data we found in the core file notes.
lldb::ModuleSP exe_module_sp = GetTarget().GetExecutableModule();
@@ -258,6 +261,7 @@ Status ProcessElfCore::DoLoadCore() {
if (!m_nt_file_entries.empty()) {
ModuleSpec exe_module_spec;
exe_module_spec.GetArchitecture() = arch;
+ exe_module_spec.GetUUID() = m_nt_file_entries[0].uuid;
exe_module_spec.GetFileSpec().SetFile(m_nt_file_entries[0].path,
FileSpec::Style::native);
if (exe_module_spec.GetFileSpec()) {
@@ -271,6 +275,12 @@ Status ProcessElfCore::DoLoadCore() {
return error;
}
+void ProcessElfCore::UpdateBuildIdForNTFileEntries() {
+ for (NT_FILE_Entry &entry : m_nt_file_entries) {
+ entry.uuid = FindBuidIdInCoreMemory(entry.start);
+ }
+}
+
lldb_private::DynamicLoader *ProcessElfCore::GetDynamicLoader() {
if (m_dyld_up.get() == nullptr)
m_dyld_up.reset(DynamicLoader::FindPlugin(
@@ -983,6 +993,67 @@ llvm::Error ProcessElfCore::ParseThreadContextsFromNoteSegment(
}
}
+UUID ProcessElfCore::FindBuidIdInCoreMemory(lldb::addr_t address) {
+ UUID invalid_uuid;
+ const uint32_t addr_size = GetAddressByteSize();
+ const size_t elf_header_size = addr_size == 4 ? sizeof(llvm::ELF::Elf32_Ehdr)
+ : sizeof(llvm::ELF::Elf64_Ehdr);
+
+ std::vector<uint8_t> elf_header_bytes;
+ elf_header_bytes.resize(elf_header_size);
+ Status error;
+ size_t byte_read =
+ ReadMemory(address, elf_header_bytes.data(), elf_header_size, error);
+ if (byte_read != elf_header_size ||
+ !elf::ELFHeader::MagicBytesMatch(elf_header_bytes.data()))
+ return invalid_uuid;
+ DataExtractor elf_header_data(elf_header_bytes.data(), elf_header_size,
+ GetByteOrder(), addr_size);
+ lldb::offset_t offset = 0;
+
+ elf::ELFHeader elf_header;
+ elf_header.Parse(elf_header_data, &offset);
+
+ const lldb::addr_t ph_addr = address + elf_header.e_phoff;
+
+ std::vector<uint8_t> ph_bytes;
+ ph_bytes.resize(elf_header.e_phentsize);
+ for (unsigned int i = 0; i < elf_header.e_phnum; ++i) {
+ byte_read = ReadMemory(ph_addr + i * elf_header.e_phentsize,
+ ph_bytes.data(), elf_header.e_phentsize, error);
+ if (byte_read != elf_header.e_phentsize)
+ break;
+ DataExtractor program_header_data(ph_bytes.data(), elf_header.e_phentsize,
+ GetByteOrder(), addr_size);
+ offset = 0;
+ elf::ELFProgramHeader program_header;
+ program_header.Parse(program_header_data, &offset);
+ if (program_header.p_type != llvm::ELF::PT_NOTE)
+ continue;
+
+ std::vector<uint8_t> note_bytes;
+ note_bytes.resize(program_header.p_memsz);
+
+ byte_read = ReadMemory(program_header.p_vaddr, note_bytes.data(),
+ program_header.p_memsz, error);
+ if (byte_read != program_header.p_memsz)
+ continue;
+ DataExtractor segment_data(note_bytes.data(), note_bytes.size(),
+ GetByteOrder(), addr_size);
+ auto notes_or_error = parseSegment(segment_data);
+ if (!notes_or_error)
+ return invalid_uuid;
+ for (const CoreNote &note : *notes_or_error) {
+ if (note.info.n_namesz == 4 &&
+ note.info.n_type == llvm::ELF::NT_GNU_BUILD_ID &&
+ "GNU" == note.info.n_name &&
+ note.data.ValidOffsetForDataOfSize(0, note.info.n_descsz))
+ return UUID(note.data.GetData().take_front(note.info.n_descsz));
+ }
+ }
+ return invalid_uuid;
+}
+
uint32_t ProcessElfCore::GetNumThreadContexts() {
if (!m_thread_data_valid)
DoLoadCore();
diff --git a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.h b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.h
index 2cec635..668a7c4 100644
--- a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.h
+++ b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.h
@@ -117,6 +117,10 @@ private:
lldb::addr_t end;
lldb::addr_t file_ofs;
std::string path;
+ // Add a UUID member for convenient access. The UUID value is not in the
+ // NT_FILE entries; we find it in core memory and store it here for easy
+ // access.
+ lldb_private::UUID uuid;
};
// For ProcessElfCore only
@@ -158,6 +162,12 @@ private:
// Returns number of thread contexts stored in the core file
uint32_t GetNumThreadContexts();
+ // Populate the GNU build ID UUID for each NT_FILE entry
+ void UpdateBuildIdForNTFileEntries();
+
+ // Returns the build ID from the GNU build ID note of the module loaded at
+ // the given start address
+ lldb_private::UUID FindBuidIdInCoreMemory(lldb::addr_t address);
+
// Parse a contiguous address range of the process from LOAD segment
lldb::addr_t
AddAddressRangeFromLoadSegment(const elf::ELFProgramHeader &header);
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
index ce52f35..6e676de 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
@@ -2494,8 +2494,7 @@ bool ScriptInterpreterPythonImpl::LoadScriptingModule(
auto ExtendSysPath = [&](std::string directory) -> llvm::Error {
if (directory.empty()) {
- return llvm::make_error<llvm::StringError>(
- "invalid directory name", llvm::inconvertibleErrorCode());
+ return llvm::createStringError("invalid directory name");
}
replace_all(directory, "\\", "\\\\");
@@ -2508,10 +2507,8 @@ bool ScriptInterpreterPythonImpl::LoadScriptingModule(
directory.c_str(), directory.c_str());
bool syspath_retval =
ExecuteMultipleLines(command_stream.GetData(), exc_options).Success();
- if (!syspath_retval) {
- return llvm::make_error<llvm::StringError>(
- "Python sys.path handling failed", llvm::inconvertibleErrorCode());
- }
+ if (!syspath_retval)
+ return llvm::createStringError("Python sys.path handling failed");
return llvm::Error::success();
};
diff --git a/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.h b/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.h
index 83215bf..041b388 100644
--- a/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.h
+++ b/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.h
@@ -120,9 +120,8 @@ public:
llvm::Expected<lldb::TypeSystemSP>
GetTypeSystemForLanguage(lldb::LanguageType language) override {
- return llvm::make_error<llvm::StringError>(
- "SymbolFileBreakpad does not support GetTypeSystemForLanguage",
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError(
+ "SymbolFileBreakpad does not support GetTypeSystemForLanguage");
}
CompilerDeclContext FindNamespace(ConstString name,
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp
index 1b0fefe..688a287 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp
@@ -11,6 +11,7 @@
#include <cassert>
#include <algorithm>
+#include <limits>
#include <optional>
#include "llvm/Support/LEB128.h"
@@ -41,13 +42,23 @@ extern int g_verbose;
// Extract a debug info entry for a given DWARFUnit from the data
// starting at the offset in offset_ptr
bool DWARFDebugInfoEntry::Extract(const DWARFDataExtractor &data,
- const DWARFUnit *cu,
+ const DWARFUnit &unit,
lldb::offset_t *offset_ptr) {
m_offset = *offset_ptr;
+ auto report_error = [&](const char *fmt, const auto &...vals) {
+ unit.GetSymbolFileDWARF().GetObjectFile()->GetModule()->ReportError(
+ "[{0:x16}]: {1}, please file a bug and "
+ "attach the file at the start of this error message",
+ static_cast<uint64_t>(m_offset), llvm::formatv(fmt, vals...));
+ *offset_ptr = std::numeric_limits<lldb::offset_t>::max();
+ return false;
+ };
+
m_parent_idx = 0;
m_sibling_idx = 0;
const uint64_t abbr_idx = data.GetULEB128(offset_ptr);
- lldbassert(abbr_idx <= UINT16_MAX);
+ if (abbr_idx > std::numeric_limits<uint16_t>::max())
+ return report_error("abbreviation code {0} too big", abbr_idx);
m_abbr_idx = abbr_idx;
if (m_abbr_idx == 0) {
@@ -56,31 +67,18 @@ bool DWARFDebugInfoEntry::Extract(const DWARFDataExtractor &data,
return true; // NULL debug tag entry
}
- const auto *abbrevDecl = GetAbbreviationDeclarationPtr(cu);
- if (abbrevDecl == nullptr) {
- cu->GetSymbolFileDWARF().GetObjectFile()->GetModule()->ReportError(
- "[{0:x16}]: invalid abbreviation code {1}, "
- "please file a bug and "
- "attach the file at the start of this error message",
- (uint64_t)m_offset, (unsigned)abbr_idx);
- // WE can't parse anymore if the DWARF is borked...
- *offset_ptr = UINT32_MAX;
- return false;
- }
+ const auto *abbrevDecl = GetAbbreviationDeclarationPtr(&unit);
+ if (abbrevDecl == nullptr)
+ return report_error("invalid abbreviation code {0}", abbr_idx);
+
m_tag = abbrevDecl->getTag();
m_has_children = abbrevDecl->hasChildren();
// Skip all data in the .debug_info or .debug_types for the attributes
for (const auto &attribute : abbrevDecl->attributes()) {
- if (DWARFFormValue::SkipValue(attribute.Form, data, offset_ptr, cu))
+ if (DWARFFormValue::SkipValue(attribute.Form, data, offset_ptr, &unit))
continue;
- cu->GetSymbolFileDWARF().GetObjectFile()->GetModule()->ReportError(
- "[{0:x16}]: Unsupported DW_FORM_{1:x}, please file a bug "
- "and "
- "attach the file at the start of this error message",
- (uint64_t)m_offset, (unsigned)attribute.Form);
- *offset_ptr = m_offset;
- return false;
+ return report_error("Unsupported DW_FORM_{1:x}", attribute.Form);
}
return true;
}
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h
index c19fa74..6773b00 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h
@@ -49,7 +49,7 @@ public:
void BuildFunctionAddressRangeTable(DWARFUnit *cu,
DWARFDebugAranges *debug_aranges) const;
- bool Extract(const DWARFDataExtractor &data, const DWARFUnit *cu,
+ bool Extract(const DWARFDataExtractor &data, const DWARFUnit &cu,
lldb::offset_t *offset_ptr);
using Recurse = DWARFBaseDIE::Recurse;
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp
index 3a57ec9..66a762b 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp
@@ -63,7 +63,7 @@ void DWARFUnit::ExtractUnitDIENoDwoIfNeeded() {
// parse
const DWARFDataExtractor &data = GetData();
if (offset < GetNextUnitOffset() &&
- m_first_die.Extract(data, this, &offset)) {
+ m_first_die.Extract(data, *this, &offset)) {
AddUnitDIE(m_first_die);
return;
}
@@ -242,7 +242,7 @@ void DWARFUnit::ExtractDIEsRWLocked() {
die_index_stack.reserve(32);
die_index_stack.push_back(0);
bool prev_die_had_children = false;
- while (offset < next_cu_offset && die.Extract(data, this, &offset)) {
+ while (offset < next_cu_offset && die.Extract(data, *this, &offset)) {
const bool null_die = die.IsNULL();
if (depth == 0) {
assert(m_die_array.empty() && "Compile unit DIE already added");
@@ -670,7 +670,7 @@ DWARFUnit::GetDIE(dw_offset_t die_offset) {
llvm::StringRef DWARFUnit::PeekDIEName(dw_offset_t die_offset) {
DWARFDebugInfoEntry die;
- if (!die.Extract(GetData(), this, &die_offset))
+ if (!die.Extract(GetData(), *this, &die_offset))
return llvm::StringRef();
// Does die contain a DW_AT_Name?
diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
index 582d9ea..369ae46 100644
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
@@ -5272,8 +5272,7 @@ TypeSystemClang::GetNumChildren(lldb::opaque_compiler_type_t type,
bool omit_empty_base_classes,
const ExecutionContext *exe_ctx) {
if (!type)
- return llvm::make_error<llvm::StringError>("invalid clang type",
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError("invalid clang type");
uint32_t num_children = 0;
clang::QualType qual_type(RemoveWrappingTypes(GetQualType(type)));
@@ -5331,9 +5330,8 @@ TypeSystemClang::GetNumChildren(lldb::opaque_compiler_type_t type,
num_children += std::distance(record_decl->field_begin(),
record_decl->field_end());
} else
- return llvm::make_error<llvm::StringError>(
- "incomplete type \"" + GetDisplayTypeName(type).GetString() + "\"",
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError(
+ "incomplete type \"" + GetDisplayTypeName(type).GetString() + "\"");
break;
case clang::Type::ObjCObject:
case clang::Type::ObjCInterface:
@@ -6130,7 +6128,7 @@ uint32_t TypeSystemClang::GetNumPointeeChildren(clang::QualType type) {
return 0;
}
-CompilerType TypeSystemClang::GetChildCompilerTypeAtIndex(
+llvm::Expected<CompilerType> TypeSystemClang::GetChildCompilerTypeAtIndex(
lldb::opaque_compiler_type_t type, ExecutionContext *exe_ctx, size_t idx,
bool transparent_pointers, bool omit_empty_base_classes,
bool ignore_array_bounds, std::string &child_name,
@@ -6156,11 +6154,8 @@ CompilerType TypeSystemClang::GetChildCompilerTypeAtIndex(
auto num_children_or_err =
GetNumChildren(type, omit_empty_base_classes, exe_ctx);
- if (!num_children_or_err) {
- LLDB_LOG_ERRORV(GetLog(LLDBLog::Types), num_children_or_err.takeError(),
- "{0}");
- return {};
- }
+ if (!num_children_or_err)
+ return num_children_or_err.takeError();
const bool idx_is_valid = idx < *num_children_or_err;
int32_t bit_offset;
@@ -6242,7 +6237,8 @@ CompilerType TypeSystemClang::GetChildCompilerTypeAtIndex(
std::optional<uint64_t> size =
base_class_clang_type.GetBitSize(get_exe_scope());
if (!size)
- return {};
+ return llvm::createStringError("no size info for base class");
+
uint64_t base_class_clang_type_bit_size = *size;
// Base classes bit sizes should be a multiple of 8 bits in size
@@ -6274,7 +6270,8 @@ CompilerType TypeSystemClang::GetChildCompilerTypeAtIndex(
std::optional<uint64_t> size =
field_clang_type.GetByteSize(get_exe_scope());
if (!size)
- return {};
+ return llvm::createStringError("no size info for field");
+
child_byte_size = *size;
const uint32_t child_bit_size = child_byte_size * 8;
diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h
index 042379d..d67b7a4 100644
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h
@@ -887,7 +887,7 @@ public:
static uint32_t GetNumPointeeChildren(clang::QualType type);
- CompilerType GetChildCompilerTypeAtIndex(
+ llvm::Expected<CompilerType> GetChildCompilerTypeAtIndex(
lldb::opaque_compiler_type_t type, ExecutionContext *exe_ctx, size_t idx,
bool transparent_pointers, bool omit_empty_base_classes,
bool ignore_array_bounds, std::string &child_name,
diff --git a/lldb/source/Symbol/CompilerType.cpp b/lldb/source/Symbol/CompilerType.cpp
index 072dbcc..f8da9ef 100644
--- a/lldb/source/Symbol/CompilerType.cpp
+++ b/lldb/source/Symbol/CompilerType.cpp
@@ -805,8 +805,7 @@ CompilerType::GetNumChildren(bool omit_empty_base_classes,
if (auto type_system_sp = GetTypeSystem())
return type_system_sp->GetNumChildren(m_type, omit_empty_base_classes,
exe_ctx);
- return llvm::make_error<llvm::StringError>("invalid type",
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError("invalid type");
}
lldb::BasicType CompilerType::GetBasicTypeEnumeration() const {
@@ -902,7 +901,7 @@ uint32_t CompilerType::GetIndexOfFieldWithName(
return UINT32_MAX;
}
-CompilerType CompilerType::GetChildCompilerTypeAtIndex(
+llvm::Expected<CompilerType> CompilerType::GetChildCompilerTypeAtIndex(
ExecutionContext *exe_ctx, size_t idx, bool transparent_pointers,
bool omit_empty_base_classes, bool ignore_array_bounds,
std::string &child_name, uint32_t &child_byte_size,
diff --git a/lldb/source/Symbol/Symbol.cpp b/lldb/source/Symbol/Symbol.cpp
index 1895f29..9b0042f 100644
--- a/lldb/source/Symbol/Symbol.cpp
+++ b/lldb/source/Symbol/Symbol.cpp
@@ -101,18 +101,15 @@ const Symbol &Symbol::operator=(const Symbol &rhs) {
llvm::Expected<Symbol> Symbol::FromJSON(const JSONSymbol &symbol,
SectionList *section_list) {
if (!section_list)
- return llvm::make_error<llvm::StringError>("no section list provided",
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError("no section list provided");
if (!symbol.value && !symbol.address)
- return llvm::make_error<llvm::StringError>(
- "symbol must contain either a value or an address",
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError(
+ "symbol must contain either a value or an address");
if (symbol.value && symbol.address)
- return llvm::make_error<llvm::StringError>(
- "symbol cannot contain both a value and an address",
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError(
+ "symbol cannot contain both a value and an address");
const uint64_t size = symbol.size.value_or(0);
const bool is_artificial = false;
@@ -133,9 +130,8 @@ llvm::Expected<Symbol> Symbol::FromJSON(const JSONSymbol &symbol,
AddressRange(section_sp, offset, size), size_is_valid,
contains_linker_annotations, flags);
}
- return llvm::make_error<llvm::StringError>(
- llvm::formatv("no section found for address: {0:x}", *symbol.address),
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError(
+ llvm::formatv("no section found for address: {0:x}", *symbol.address));
}
// Absolute symbols encode the integer value in the m_offset of the
diff --git a/lldb/source/Symbol/SymbolFileOnDemand.cpp b/lldb/source/Symbol/SymbolFileOnDemand.cpp
index c6d9f00..0cfe9fc 100644
--- a/lldb/source/Symbol/SymbolFileOnDemand.cpp
+++ b/lldb/source/Symbol/SymbolFileOnDemand.cpp
@@ -457,9 +457,8 @@ SymbolFileOnDemand::GetTypeSystemForLanguage(LanguageType language) {
Log *log = GetLog();
LLDB_LOG(log, "[{0}] {1} is skipped for language type {2}",
GetSymbolFileName(), __FUNCTION__, language);
- return llvm::make_error<llvm::StringError>(
- "GetTypeSystemForLanguage is skipped by SymbolFileOnDemand",
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError(
+ "GetTypeSystemForLanguage is skipped by SymbolFileOnDemand");
}
return m_sym_file_impl->GetTypeSystemForLanguage(language);
}
diff --git a/lldb/source/Symbol/TypeSystem.cpp b/lldb/source/Symbol/TypeSystem.cpp
index 3665771..4956f10 100644
--- a/lldb/source/Symbol/TypeSystem.cpp
+++ b/lldb/source/Symbol/TypeSystem.cpp
@@ -267,9 +267,8 @@ llvm::Expected<lldb::TypeSystemSP> TypeSystemMap::GetTypeSystemForLanguage(
std::optional<CreateCallback> create_callback) {
std::lock_guard<std::mutex> guard(m_mutex);
if (m_clear_in_progress)
- return llvm::make_error<llvm::StringError>(
- "Unable to get TypeSystem because TypeSystemMap is being cleared",
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError(
+ "Unable to get TypeSystem because TypeSystemMap is being cleared");
collection::iterator pos = m_map.find(language);
if (pos != m_map.end()) {
@@ -277,11 +276,10 @@ llvm::Expected<lldb::TypeSystemSP> TypeSystemMap::GetTypeSystemForLanguage(
assert(!pos->second->weak_from_this().expired());
return pos->second;
}
- return llvm::make_error<llvm::StringError>(
+ return llvm::createStringError(
"TypeSystem for language " +
- llvm::StringRef(Language::GetNameForLanguageType(language)) +
- " doesn't exist",
- llvm::inconvertibleErrorCode());
+ llvm::StringRef(Language::GetNameForLanguageType(language)) +
+ " doesn't exist");
}
for (const auto &pair : m_map) {
@@ -291,31 +289,27 @@ llvm::Expected<lldb::TypeSystemSP> TypeSystemMap::GetTypeSystemForLanguage(
m_map[language] = pair.second;
if (pair.second)
return pair.second;
- return llvm::make_error<llvm::StringError>(
+ return llvm::createStringError(
"TypeSystem for language " +
- llvm::StringRef(Language::GetNameForLanguageType(language)) +
- " doesn't exist",
- llvm::inconvertibleErrorCode());
+ llvm::StringRef(Language::GetNameForLanguageType(language)) +
+ " doesn't exist");
}
}
if (!create_callback)
- return llvm::make_error<llvm::StringError>(
+ return llvm::createStringError(
"Unable to find type system for language " +
- llvm::StringRef(Language::GetNameForLanguageType(language)),
- llvm::inconvertibleErrorCode());
-
+ llvm::StringRef(Language::GetNameForLanguageType(language)));
// Cache even if we get a shared pointer that contains a null type system
// back.
TypeSystemSP type_system_sp = (*create_callback)();
m_map[language] = type_system_sp;
if (type_system_sp)
return type_system_sp;
- return llvm::make_error<llvm::StringError>(
+ return llvm::createStringError(
"TypeSystem for language " +
- llvm::StringRef(Language::GetNameForLanguageType(language)) +
- " doesn't exist",
- llvm::inconvertibleErrorCode());
+ llvm::StringRef(Language::GetNameForLanguageType(language)) +
+ " doesn't exist");
}
llvm::Expected<lldb::TypeSystemSP>
diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp
index 7773116..ec0da8a 100644
--- a/lldb/source/Target/Target.cpp
+++ b/lldb/source/Target/Target.cpp
@@ -2414,8 +2414,7 @@ llvm::Expected<lldb::TypeSystemSP>
Target::GetScratchTypeSystemForLanguage(lldb::LanguageType language,
bool create_on_demand) {
if (!m_valid)
- return llvm::make_error<llvm::StringError>("Invalid Target",
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError("Invalid Target");
if (language == eLanguageTypeMipsAssembler // GNU AS and LLVM use it for all
// assembly code
@@ -2428,9 +2427,8 @@ Target::GetScratchTypeSystemForLanguage(lldb::LanguageType language,
// target language.
} else {
if (languages_for_expressions.Empty())
- return llvm::make_error<llvm::StringError>(
- "No expression support for any languages",
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError(
+ "No expression support for any languages");
language = (LanguageType)languages_for_expressions.bitvector.find_first();
}
}
@@ -2574,23 +2572,20 @@ Target::CreateUtilityFunction(std::string expression, std::string name,
return type_system_or_err.takeError();
auto ts = *type_system_or_err;
if (!ts)
- return llvm::make_error<llvm::StringError>(
+ return llvm::createStringError(
llvm::StringRef("Type system for language ") +
- Language::GetNameForLanguageType(language) +
- llvm::StringRef(" is no longer live"),
- llvm::inconvertibleErrorCode());
+ Language::GetNameForLanguageType(language) +
+ llvm::StringRef(" is no longer live"));
std::unique_ptr<UtilityFunction> utility_fn =
ts->CreateUtilityFunction(std::move(expression), std::move(name));
if (!utility_fn)
- return llvm::make_error<llvm::StringError>(
+ return llvm::createStringError(
llvm::StringRef("Could not create an expression for language") +
- Language::GetNameForLanguageType(language),
- llvm::inconvertibleErrorCode());
+ Language::GetNameForLanguageType(language));
DiagnosticManager diagnostics;
if (!utility_fn->Install(diagnostics, exe_ctx))
- return llvm::make_error<llvm::StringError>(diagnostics.GetString(),
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError(diagnostics.GetString());
return std::move(utility_fn);
}
@@ -2621,8 +2616,7 @@ void Target::SetDefaultArchitecture(const ArchSpec &arch) {
llvm::Error Target::SetLabel(llvm::StringRef label) {
size_t n = LLDB_INVALID_INDEX32;
if (llvm::to_integer(label, n))
- return llvm::make_error<llvm::StringError>(
- "Cannot use integer as target label.", llvm::inconvertibleErrorCode());
+ return llvm::createStringError("Cannot use integer as target label.");
TargetList &targets = GetDebugger().GetTargetList();
for (size_t i = 0; i < targets.GetNumTargets(); i++) {
TargetSP target_sp = targets.GetTargetAtIndex(i);
@@ -2790,15 +2784,13 @@ llvm::Expected<lldb_private::Address> Target::GetEntryPointAddress() {
// We haven't found the entry point address. Return an appropriate error.
if (!has_primary_executable)
- return llvm::make_error<llvm::StringError>(
+ return llvm::createStringError(
"No primary executable found and could not find entry point address in "
- "any executable module",
- llvm::inconvertibleErrorCode());
+ "any executable module");
- return llvm::make_error<llvm::StringError>(
+ return llvm::createStringError(
"Could not find entry point address for primary executable module \"" +
- exe_module->GetFileSpec().GetFilename().GetStringRef() + "\"",
- llvm::inconvertibleErrorCode());
+ exe_module->GetFileSpec().GetFilename().GetStringRef() + "\"");
}
lldb::addr_t Target::GetCallableLoadAddress(lldb::addr_t load_addr,
diff --git a/lldb/source/Utility/Status.cpp b/lldb/source/Utility/Status.cpp
index 3bd00bb..18312e8 100644
--- a/lldb/source/Utility/Status.cpp
+++ b/lldb/source/Utility/Status.cpp
@@ -92,8 +92,7 @@ llvm::Error Status::ToError() const {
if (m_type == ErrorType::eErrorTypePOSIX)
return llvm::errorCodeToError(
std::error_code(m_code, std::generic_category()));
- return llvm::make_error<llvm::StringError>(AsCString(),
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError(AsCString());
}
Status::~Status() = default;
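
Editor's note: the hunks above (and the earlier LLDB Symbol/Target changes) all apply the same mechanical substitution — the message-only overload of llvm::createStringError already supplies llvm::inconvertibleErrorCode(), so spelling out llvm::make_error<llvm::StringError>(..., llvm::inconvertibleErrorCode()) is redundant. A minimal sketch of the two equivalent spellings follows; ParseWidget and its message text are hypothetical, not taken from the patch.

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Error.h"

llvm::Expected<int> ParseWidget(llvm::StringRef text) {
  if (text.empty()) {
    // Old spelling, as removed throughout the diff:
    //   return llvm::make_error<llvm::StringError>(
    //       "empty widget description", llvm::inconvertibleErrorCode());
    // New, equivalent spelling:
    return llvm::createStringError("empty widget description");
  }
  int value = 0;
  if (text.getAsInteger(10, value))
    return llvm::createStringError("not an integer: " + text);
  return value;
}
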
diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommand.py b/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommand.py
index c219a4e..605561c 100644
--- a/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommand.py
+++ b/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommand.py
@@ -6,7 +6,7 @@ Test lldb breakpoint command add/list/delete.
import lldb
from lldbsuite.test.decorators import *
from lldbsuite.test.lldbtest import *
-from lldbsuite.test import lldbutil
+from lldbsuite.test import lldbutil, lldbplatformutil
import json
import os
import side_effect
@@ -581,7 +581,6 @@ class BreakpointCommandTestCase(TestBase):
self.assertNotEqual(target_stats, None)
self.assertEqual(target_stats["sourceMapDeduceCount"], expected_count)
- @skipIf(oslist=["windows"])
@no_debug_info_test
def test_breakpoints_auto_source_map_relative(self):
"""
@@ -612,8 +611,13 @@ class BreakpointCommandTestCase(TestBase):
self.verify_source_map_deduce_statistics(target, 0)
# Verify auto deduced source map when file path in debug info
- # is a suffix of request breakpoint file path
- path = "/x/y/a/b/c/main.cpp"
+ # is a suffix of request breakpoint file path.
+ # Note the path must be absolute.
+ path = (
+ "/x/y/a/b/c/main.cpp"
+ if lldbplatformutil.getHostPlatform() != "windows"
+ else r"C:\x\y\a\b\c\main.cpp"
+ )
bp = target.BreakpointCreateByLocation(path, 2)
self.assertGreater(
bp.GetNumLocations(),
@@ -625,7 +629,11 @@ class BreakpointCommandTestCase(TestBase):
source_map_json = self.get_source_map_json()
self.assertEqual(len(source_map_json), 1, "source map should not be empty")
- self.verify_source_map_entry_pair(source_map_json[0], ".", "/x/y")
+ self.verify_source_map_entry_pair(
+ source_map_json[0],
+ ".",
+ "/x/y" if lldbplatformutil.getHostPlatform() != "windows" else r"C:\x\y",
+ )
self.verify_source_map_deduce_statistics(target, 1)
# Reset source map.
diff --git a/lldb/test/API/functionalities/thread/exit_during_expression/main.c b/lldb/test/API/functionalities/thread/exit_during_expression/main.c
index eb6d175..f633632 100644
--- a/lldb/test/API/functionalities/thread/exit_during_expression/main.c
+++ b/lldb/test/API/functionalities/thread/exit_during_expression/main.c
@@ -3,7 +3,7 @@
#include <stdio.h>
#include <unistd.h>
-static unsigned int g_timeout = 100000;
+static unsigned int g_timeout = 1000000;
extern int usleep(unsigned int);
diff --git a/lldb/test/API/lang/c/enum_types/TestEnumTypes.py b/lldb/test/API/lang/c/enum_types/TestEnumTypes.py
index 33a846c..0015c8f 100644
--- a/lldb/test/API/lang/c/enum_types/TestEnumTypes.py
+++ b/lldb/test/API/lang/c/enum_types/TestEnumTypes.py
@@ -26,7 +26,9 @@ class EnumTypesTestCase(TestBase):
self.expect("fr var b", DATA_TYPES_DISPLAYED_CORRECTLY, patterns=[" = B$"])
self.expect("fr var c", DATA_TYPES_DISPLAYED_CORRECTLY, patterns=[" = C$"])
self.expect("fr var ab", DATA_TYPES_DISPLAYED_CORRECTLY, patterns=[" = AB$"])
- self.expect("fr var ac", DATA_TYPES_DISPLAYED_CORRECTLY, patterns=[" = A | C$"])
+ self.expect(
+ "fr var ac", DATA_TYPES_DISPLAYED_CORRECTLY, patterns=[" = A \| C$"]
+ )
self.expect("fr var all", DATA_TYPES_DISPLAYED_CORRECTLY, patterns=[" = ALL$"])
# Test that an enum that doesn't match the heuristic we use in
# TypeSystemClang::DumpEnumValue, gets printed as a raw integer.
@@ -37,7 +39,7 @@ class EnumTypesTestCase(TestBase):
self.expect(
"expression (enum bitfield)nonsense",
DATA_TYPES_DISPLAYED_CORRECTLY,
- patterns=[" = B | C | 0x10$"],
+ patterns=[" = B \| C \| 0x10$"],
)
# Break inside the main.
diff --git a/lldb/test/API/python_api/debugger/TestDebuggerAPI.py b/lldb/test/API/python_api/debugger/TestDebuggerAPI.py
index 29b8cfa..a007a87 100644
--- a/lldb/test/API/python_api/debugger/TestDebuggerAPI.py
+++ b/lldb/test/API/python_api/debugger/TestDebuggerAPI.py
@@ -91,6 +91,11 @@ class DebuggerAPITestCase(TestBase):
# Test the local property again, is it set to new_cache_line_size?
self.assertEqual(get_cache_line_size(), new_cache_line_size)
+ @expectedFailureAll(
+ hostoslist=["windows"],
+ remote=True,
+ bugnumber="github.com/llvm/llvm-project/issues/92419",
+ )
def test_CreateTarget_platform(self):
exe = self.getBuildArtifact("a.out")
self.yaml2obj("elf.yaml", exe)
diff --git a/lldb/test/API/tools/lldb-dap/attach/TestDAP_attach.py b/lldb/test/API/tools/lldb-dap/attach/TestDAP_attach.py
index cab0067..b3ba697 100644
--- a/lldb/test/API/tools/lldb-dap/attach/TestDAP_attach.py
+++ b/lldb/test/API/tools/lldb-dap/attach/TestDAP_attach.py
@@ -41,7 +41,6 @@ class TestDAP_attach(lldbdap_testcase.DAPTestCaseBase):
@skipIfWindows
@skipIfNetBSD # Hangs on NetBSD as well
- @skipIfRemote
def test_by_pid(self):
"""
Tests attaching to a process by process ID.
@@ -59,7 +58,6 @@ class TestDAP_attach(lldbdap_testcase.DAPTestCaseBase):
@skipIfWindows
@skipIfNetBSD # Hangs on NetBSD as well
- @skipIfRemote
def test_by_name(self):
"""
Tests attaching to a process by process name.
diff --git a/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_logpoints.py b/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_logpoints.py
index cbf190f..78ceb79 100644
--- a/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_logpoints.py
+++ b/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_logpoints.py
@@ -20,7 +20,6 @@ class TestDAP_logpoints(lldbdap_testcase.DAPTestCaseBase):
self.main_path = os.path.realpath(self.getBuildArtifact(self.main_basename))
@skipIfWindows
- @skipIfRemote
def test_logmessage_basic(self):
"""Tests breakpoint logmessage basic functionality."""
before_loop_line = line_number("main.cpp", "// before loop")
@@ -83,7 +82,6 @@ class TestDAP_logpoints(lldbdap_testcase.DAPTestCaseBase):
self.assertRegex(logMessage_line, reg_str)
@skipIfWindows
- @skipIfRemote
def test_logmessage_advanced(self):
"""Tests breakpoint logmessage functionality for complex expression."""
before_loop_line = line_number("main.cpp", "// before loop")
@@ -144,7 +142,6 @@ class TestDAP_logpoints(lldbdap_testcase.DAPTestCaseBase):
self.assertEqual(logMessage_line, logMessage_prefix + str(result))
@skipIfWindows
- @skipIfRemote
def test_logmessage_format(self):
"""
Tests breakpoint logmessage functionality with format.
@@ -209,7 +206,6 @@ class TestDAP_logpoints(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
def test_logmessage_format_failure(self):
"""
Tests breakpoint logmessage format with parsing failure.
diff --git a/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py b/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py
index 6f57c05..123fea7 100644
--- a/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py
+++ b/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py
@@ -20,7 +20,6 @@ class TestDAP_setBreakpoints(lldbdap_testcase.DAPTestCaseBase):
self.main_path = os.path.realpath(self.getBuildArtifact(self.main_basename))
@skipIfWindows
- @skipIfRemote
def test_source_map(self):
"""
This test simulates building two files in a folder, and then moving
@@ -99,7 +98,6 @@ class TestDAP_setBreakpoints(lldbdap_testcase.DAPTestCaseBase):
self.assertEqual(frames[1]["source"]["path"], new_main_path)
@skipIfWindows
- @skipIfRemote
def test_set_and_clear(self):
"""Tests setting and clearing source file and line breakpoints.
This packet is a bit tricky on the debug adaptor side since there
@@ -261,7 +259,6 @@ class TestDAP_setBreakpoints(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
def test_clear_breakpoints_unset_breakpoints(self):
"""Test clearing breakpoints like test_set_and_clear, but clear
breakpoints by omitting the breakpoints array instead of sending an
@@ -305,7 +302,6 @@ class TestDAP_setBreakpoints(lldbdap_testcase.DAPTestCaseBase):
self.assertEqual(len(breakpoints), 0, "expect no source breakpoints")
@skipIfWindows
- @skipIfRemote
def test_functionality(self):
"""Tests hitting breakpoints and the functionality of a single
breakpoint, like 'conditions' and 'hitCondition' settings."""
diff --git a/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setExceptionBreakpoints.py b/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setExceptionBreakpoints.py
index 84d3f12..b2ab12e 100644
--- a/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setExceptionBreakpoints.py
+++ b/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setExceptionBreakpoints.py
@@ -12,7 +12,6 @@ import lldbdap_testcase
class TestDAP_setExceptionBreakpoints(lldbdap_testcase.DAPTestCaseBase):
@skipIfWindows
- @skipIfRemote
def test_functionality(self):
"""Tests setting and clearing exception breakpoints.
This packet is a bit tricky on the debug adaptor side since there
diff --git a/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setFunctionBreakpoints.py b/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setFunctionBreakpoints.py
index 9708eff..8f00f42 100644
--- a/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setFunctionBreakpoints.py
+++ b/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setFunctionBreakpoints.py
@@ -12,7 +12,6 @@ import lldbdap_testcase
class TestDAP_setFunctionBreakpoints(lldbdap_testcase.DAPTestCaseBase):
@skipIfWindows
- @skipIfRemote
def test_set_and_clear(self):
"""Tests setting and clearing function breakpoints.
This packet is a bit tricky on the debug adaptor side since there
@@ -123,7 +122,6 @@ class TestDAP_setFunctionBreakpoints(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
def test_functionality(self):
"""Tests hitting breakpoints and the functionality of a single
breakpoint, like 'conditions' and 'hitCondition' settings."""
diff --git a/lldb/test/API/tools/lldb-dap/commands/TestDAP_commands.py b/lldb/test/API/tools/lldb-dap/commands/TestDAP_commands.py
index bfdf9ef2..226b938 100644
--- a/lldb/test/API/tools/lldb-dap/commands/TestDAP_commands.py
+++ b/lldb/test/API/tools/lldb-dap/commands/TestDAP_commands.py
@@ -7,7 +7,6 @@ from lldbsuite.test.decorators import *
class TestDAP_commands(lldbdap_testcase.DAPTestCaseBase):
- @skipIfRemote
def test_command_directive_quiet_on_success(self):
program = self.getBuildArtifact("a.out")
command_quiet = (
@@ -61,7 +60,6 @@ class TestDAP_commands(lldbdap_testcase.DAPTestCaseBase):
def test_command_directive_abort_on_error_pre_run_commands(self):
self.do_test_abort_on_error(use_pre_run_commands=True)
- @skipIfRemote
def test_command_directive_abort_on_error_post_run_commands(self):
self.do_test_abort_on_error(use_post_run_commands=True)
diff --git a/lldb/test/API/tools/lldb-dap/completions/TestDAP_completions.py b/lldb/test/API/tools/lldb-dap/completions/TestDAP_completions.py
index 3250a50..2b3ec65 100644
--- a/lldb/test/API/tools/lldb-dap/completions/TestDAP_completions.py
+++ b/lldb/test/API/tools/lldb-dap/completions/TestDAP_completions.py
@@ -19,7 +19,6 @@ class TestDAP_completions(lldbdap_testcase.DAPTestCaseBase):
self.assertNotIn(not_expected_item, actual_list)
@skipIfWindows
- @skipIfRemote
@skipIf(compiler="clang", compiler_version=["<", "17.0"])
def test_completions(self):
"""
diff --git a/lldb/test/API/tools/lldb-dap/console/TestDAP_console.py b/lldb/test/API/tools/lldb-dap/console/TestDAP_console.py
index 8769f39..e634581 100644
--- a/lldb/test/API/tools/lldb-dap/console/TestDAP_console.py
+++ b/lldb/test/API/tools/lldb-dap/console/TestDAP_console.py
@@ -38,7 +38,6 @@ class TestDAP_console(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
def test_scopes_variables_setVariable_evaluate(self):
"""
Tests that the "scopes" request causes the currently selected
@@ -82,7 +81,6 @@ class TestDAP_console(lldbdap_testcase.DAPTestCaseBase):
self.check_lldb_command("frame select", "frame #1", "frame 1 is selected")
@skipIfWindows
- @skipIfRemote
def test_custom_escape_prefix(self):
program = self.getBuildArtifact("a.out")
self.build_and_launch(program, commandEscapePrefix="::")
@@ -99,7 +97,6 @@ class TestDAP_console(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
def test_empty_escape_prefix(self):
program = self.getBuildArtifact("a.out")
self.build_and_launch(program, commandEscapePrefix="")
@@ -116,7 +113,6 @@ class TestDAP_console(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
def test_exit_status_message_sigterm(self):
source = "main.cpp"
program = self.getBuildArtifact("a.out")
@@ -154,7 +150,6 @@ class TestDAP_console(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
def test_exit_status_message_ok(self):
source = "main.cpp"
program = self.getBuildArtifact("a.out")
diff --git a/lldb/test/API/tools/lldb-dap/console/TestDAP_redirection_to_console.py b/lldb/test/API/tools/lldb-dap/console/TestDAP_redirection_to_console.py
index 85911a4..8b47d4b 100644
--- a/lldb/test/API/tools/lldb-dap/console/TestDAP_redirection_to_console.py
+++ b/lldb/test/API/tools/lldb-dap/console/TestDAP_redirection_to_console.py
@@ -8,7 +8,6 @@ import lldbdap_testcase
class TestDAP_redirection_to_console(lldbdap_testcase.DAPTestCaseBase):
@skipIfWindows
- @skipIfRemote
def test(self):
"""
Without proper stderr and stdout redirection, the following code would throw an
diff --git a/lldb/test/API/tools/lldb-dap/coreFile/TestDAP_coreFile.py b/lldb/test/API/tools/lldb-dap/coreFile/TestDAP_coreFile.py
index cabaeaf..3c847dc 100644
--- a/lldb/test/API/tools/lldb-dap/coreFile/TestDAP_coreFile.py
+++ b/lldb/test/API/tools/lldb-dap/coreFile/TestDAP_coreFile.py
@@ -13,7 +13,6 @@ import os
class TestDAP_coreFile(lldbdap_testcase.DAPTestCaseBase):
@skipIfWindows
- @skipIfRemote
@skipIfLLVMTargetMissing("X86")
def test_core_file(self):
current_dir = os.path.dirname(__file__)
@@ -60,7 +59,6 @@ class TestDAP_coreFile(lldbdap_testcase.DAPTestCaseBase):
self.assertEqual(self.get_stackFrames(), expected_frames)
@skipIfWindows
- @skipIfRemote
@skipIfLLVMTargetMissing("X86")
def test_core_file_source_mapping(self):
"""Test that sourceMap property is correctly applied when loading a core"""
diff --git a/lldb/test/API/tools/lldb-dap/databreakpoint/TestDAP_setDataBreakpoints.py b/lldb/test/API/tools/lldb-dap/databreakpoint/TestDAP_setDataBreakpoints.py
index 1e0e40d..a542a31 100644
--- a/lldb/test/API/tools/lldb-dap/databreakpoint/TestDAP_setDataBreakpoints.py
+++ b/lldb/test/API/tools/lldb-dap/databreakpoint/TestDAP_setDataBreakpoints.py
@@ -13,7 +13,6 @@ class TestDAP_setDataBreakpoints(lldbdap_testcase.DAPTestCaseBase):
self.accessTypes = ["read", "write", "readWrite"]
@skipIfWindows
- @skipIfRemote
def test_duplicate_start_addresses(self):
"""Test setDataBreakpoints with multiple watchpoints starting at the same addresses."""
program = self.getBuildArtifact("a.out")
@@ -58,7 +57,6 @@ class TestDAP_setDataBreakpoints(lldbdap_testcase.DAPTestCaseBase):
self.assertEqual(i_val, "2")
@skipIfWindows
- @skipIfRemote
def test_expression(self):
"""Tests setting data breakpoints on expression."""
program = self.getBuildArtifact("a.out")
@@ -99,7 +97,6 @@ class TestDAP_setDataBreakpoints(lldbdap_testcase.DAPTestCaseBase):
self.assertEqual(i_val, "2")
@skipIfWindows
- @skipIfRemote
def test_functionality(self):
"""Tests setting data breakpoints on variable."""
program = self.getBuildArtifact("a.out")
diff --git a/lldb/test/API/tools/lldb-dap/disassemble/TestDAP_disassemble.py b/lldb/test/API/tools/lldb-dap/disassemble/TestDAP_disassemble.py
index 1b96ea7..9e8ef5b 100644
--- a/lldb/test/API/tools/lldb-dap/disassemble/TestDAP_disassemble.py
+++ b/lldb/test/API/tools/lldb-dap/disassemble/TestDAP_disassemble.py
@@ -13,7 +13,6 @@ import os
class TestDAP_disassemble(lldbdap_testcase.DAPTestCaseBase):
@skipIfWindows
- @skipIfRemote
def test_disassemble(self):
"""
Tests the 'disassemble' request.
diff --git a/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py b/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py
index e5aab88..f9e461a 100644
--- a/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py
+++ b/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py
@@ -24,7 +24,6 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
self.assertTrue(output is None or len(output) == 0)
@skipIfWindows
- @skipIfRemote
def test_launch(self):
"""
This test launches a process that would creates a file, but we disconnect
@@ -46,7 +45,6 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
self.assertFalse(os.path.exists(program + ".side_effect"))
@skipIfWindows
- @skipIfRemote
@expectedFailureNetBSD
def test_attach(self):
"""
diff --git a/lldb/test/API/tools/lldb-dap/evaluate/TestDAP_evaluate.py b/lldb/test/API/tools/lldb-dap/evaluate/TestDAP_evaluate.py
index 68c57ad..29548a8 100644
--- a/lldb/test/API/tools/lldb-dap/evaluate/TestDAP_evaluate.py
+++ b/lldb/test/API/tools/lldb-dap/evaluate/TestDAP_evaluate.py
@@ -192,31 +192,26 @@ class TestDAP_evaluate(lldbdap_testcase.DAPTestCaseBase):
self.assertEvaluate("my_bool_vec", "size=2")
@skipIfWindows
- @skipIfRemote
def test_generic_evaluate_expressions(self):
# Tests context-less expression evaluations
self.run_test_evaluate_expressions(enableAutoVariableSummaries=False)
@skipIfWindows
- @skipIfRemote
def test_repl_evaluate_expressions(self):
# Tests expression evaluations that are triggered from the Debug Console
self.run_test_evaluate_expressions("repl", enableAutoVariableSummaries=False)
@skipIfWindows
- @skipIfRemote
def test_watch_evaluate_expressions(self):
# Tests expression evaluations that are triggered from a watch expression
self.run_test_evaluate_expressions("watch", enableAutoVariableSummaries=True)
@skipIfWindows
- @skipIfRemote
def test_hover_evaluate_expressions(self):
# Tests expression evaluations that are triggered when hovering on the editor
self.run_test_evaluate_expressions("hover", enableAutoVariableSummaries=False)
@skipIfWindows
- @skipIfRemote
def test_variable_evaluate_expressions(self):
# Tests expression evaluations that are triggered in the variable explorer
self.run_test_evaluate_expressions("variable", enableAutoVariableSummaries=True)
diff --git a/lldb/test/API/tools/lldb-dap/exception/TestDAP_exception.py b/lldb/test/API/tools/lldb-dap/exception/TestDAP_exception.py
index 58a67d8..8c2c015 100644
--- a/lldb/test/API/tools/lldb-dap/exception/TestDAP_exception.py
+++ b/lldb/test/API/tools/lldb-dap/exception/TestDAP_exception.py
@@ -9,7 +9,6 @@ import lldbdap_testcase
class TestDAP_exception(lldbdap_testcase.DAPTestCaseBase):
- @skipIfRemote
@skipIfWindows
def test_stopped_description(self):
"""
diff --git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py
index 0760d35..05873e9 100644
--- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py
+++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py
@@ -13,7 +13,6 @@ import os
class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
@skipIfWindows
- @skipIfRemote
def test_default(self):
"""
Tests the default launch of a simple program. No arguments,
@@ -29,7 +28,6 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
self.assertIn(program, lines[0], "make sure program path is in first argument")
@skipIfWindows
- @skipIfRemote
def test_termination(self):
"""
Tests the correct termination of lldb-dap upon a 'disconnect'
@@ -50,7 +48,6 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
self.assertEqual(self.dap_server.process.poll(), 0)
@skipIfWindows
- @skipIfRemote
def test_stopOnEntry(self):
"""
Tests the default launch of a simple program that stops at the
@@ -70,7 +67,6 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
def test_cwd(self):
"""
Tests the default launch of a simple program with a current working
@@ -97,7 +93,6 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
self.assertTrue(found, "verified program working directory")
@skipIfWindows
- @skipIfRemote
def test_debuggerRoot(self):
"""
Tests the "debuggerRoot" will change the working directory of
@@ -127,7 +122,6 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
self.continue_to_exit()
@skipIfWindows
- @skipIfRemote
def test_sourcePath(self):
"""
Tests the "sourcePath" will set the target.source-map.
@@ -153,7 +147,6 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
self.continue_to_exit()
@skipIfWindows
- @skipIfRemote
def test_disableSTDIO(self):
"""
Tests the default launch of a simple program with STDIO disabled.
@@ -168,7 +161,6 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
@skipIfWindows
@skipIfLinux # shell argument expansion doesn't seem to work on Linux
@expectedFailureAll(oslist=["freebsd", "netbsd"], bugnumber="llvm.org/pr48349")
- @skipIfRemote
def test_shellExpandArguments_enabled(self):
"""
Tests the default launch of a simple program with shell expansion
@@ -191,7 +183,6 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
def test_shellExpandArguments_disabled(self):
"""
Tests the default launch of a simple program with shell expansion
@@ -214,7 +205,6 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
def test_args(self):
"""
Tests launch of a simple program with arguments
@@ -240,7 +230,6 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
def test_environment(self):
"""
Tests launch of a simple program with environment variables
@@ -270,7 +259,6 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
@skipIf(
archs=["arm", "aarch64"]
) # failed run https://lab.llvm.org/buildbot/#/builders/96/builds/6933
@@ -354,7 +342,6 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
self.verify_commands("terminateCommands", output, terminateCommands)
@skipIfWindows
- @skipIfRemote
def test_extra_launch_commands(self):
"""
Tests the "launchCommands" with extra launching settings
@@ -420,7 +407,6 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
self.verify_commands("exitCommands", output, exitCommands)
@skipIfWindows
- @skipIfRemote
def test_failing_launch_commands(self):
"""
Tests "launchCommands" failures prevents a launch.
diff --git a/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py b/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py
index 3f3ead0..a4e0f04 100644
--- a/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py
+++ b/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py
@@ -58,7 +58,6 @@ class TestDAP_module(lldbdap_testcase.DAPTestCaseBase):
self.assertIn("addressRange", program_module)
@skipIfWindows
- @skipIfRemote
def test_modules(self):
"""
Mac or linux.
@@ -74,7 +73,6 @@ class TestDAP_module(lldbdap_testcase.DAPTestCaseBase):
)
@skipUnlessDarwin
- @skipIfRemote
def test_modules_dsym(self):
"""
Darwin only test with dSYM file.
@@ -85,7 +83,6 @@ class TestDAP_module(lldbdap_testcase.DAPTestCaseBase):
return self.run_test("a.out.dSYM", expect_debug_info_size=True)
@skipIfWindows
- @skipIfRemote
def test_compile_units(self):
program = self.getBuildArtifact("a.out")
self.build_and_launch(program)
diff --git a/lldb/test/API/tools/lldb-dap/optimized/TestDAP_optimized.py b/lldb/test/API/tools/lldb-dap/optimized/TestDAP_optimized.py
index 90b130d..dc7f4f9 100644
--- a/lldb/test/API/tools/lldb-dap/optimized/TestDAP_optimized.py
+++ b/lldb/test/API/tools/lldb-dap/optimized/TestDAP_optimized.py
@@ -11,7 +11,6 @@ from lldbsuite.test.lldbtest import *
class TestDAP_optimized(lldbdap_testcase.DAPTestCaseBase):
@skipIfWindows
- @skipIfRemote
def test_stack_frame_name(self):
"""Test optimized frame has special name suffix."""
program = self.getBuildArtifact("a.out")
@@ -30,7 +29,6 @@ class TestDAP_optimized(lldbdap_testcase.DAPTestCaseBase):
self.assertTrue(parent_frame["name"].endswith(" [opt]"))
@skipIfWindows
- @skipIfRemote
def test_optimized_variable(self):
"""Test optimized variable value contains error."""
program = self.getBuildArtifact("a.out")
diff --git a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart.py b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart.py
index 32dbc82..36fa0bd 100644
--- a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart.py
+++ b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart.py
@@ -9,7 +9,6 @@ import lldbdap_testcase
class TestDAP_restart(lldbdap_testcase.DAPTestCaseBase):
@skipIfWindows
- @skipIfRemote
def test_basic_functionality(self):
"""
Tests the basic restarting functionality: set two breakpoints in
@@ -45,7 +44,6 @@ class TestDAP_restart(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
def test_stopOnEntry(self):
"""
Check that the stopOnEntry setting is still honored after a restart.
@@ -87,7 +85,6 @@ class TestDAP_restart(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
def test_arguments(self):
"""
Tests that lldb-dap will use updated launch arguments included
diff --git a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_runInTerminal.py b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_runInTerminal.py
index c19a6d5..5a9938c 100644
--- a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_runInTerminal.py
+++ b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_runInTerminal.py
@@ -21,7 +21,6 @@ class TestDAP_restart_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
return False
@skipIfWindows
- @skipIfRemote
@skipIf(archs=["arm"]) # Always times out on buildbot
def test_basic_functionality(self):
"""
@@ -62,7 +61,6 @@ class TestDAP_restart_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
@skipIf(archs=["arm"]) # Always times out on buildbot
def test_stopOnEntry(self):
"""
diff --git a/lldb/test/API/tools/lldb-dap/runInTerminal/TestDAP_runInTerminal.py b/lldb/test/API/tools/lldb-dap/runInTerminal/TestDAP_runInTerminal.py
index f79a319..9fcd210 100644
--- a/lldb/test/API/tools/lldb-dap/runInTerminal/TestDAP_runInTerminal.py
+++ b/lldb/test/API/tools/lldb-dap/runInTerminal/TestDAP_runInTerminal.py
@@ -44,7 +44,6 @@ class TestDAP_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
return False
@skipIfWindows
- @skipIfRemote
@skipIf(archs=no_match(["x86_64"]))
def test_runInTerminal(self):
if not self.isTestSupported():
@@ -92,7 +91,6 @@ class TestDAP_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
self.assertIn("bar", env)
@skipIfWindows
- @skipIfRemote
@skipIf(archs=no_match(["x86_64"]))
def test_runInTerminalInvalidTarget(self):
if not self.isTestSupported():
@@ -112,7 +110,6 @@ class TestDAP_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
@skipIf(archs=no_match(["x86_64"]))
def test_missingArgInRunInTerminalLauncher(self):
if not self.isTestSupported():
@@ -128,7 +125,6 @@ class TestDAP_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
@skipIf(archs=no_match(["x86_64"]))
def test_FakeAttachedRunInTerminalLauncherWithInvalidProgram(self):
if not self.isTestSupported():
@@ -156,7 +152,6 @@ class TestDAP_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
self.assertIn("No such file or directory", stderr)
@skipIfWindows
- @skipIfRemote
@skipIf(archs=no_match(["x86_64"]))
def test_FakeAttachedRunInTerminalLauncherWithValidProgram(self):
if not self.isTestSupported():
@@ -184,7 +179,6 @@ class TestDAP_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
self.assertIn("foo", stdout)
@skipIfWindows
- @skipIfRemote
@skipIf(archs=no_match(["x86_64"]))
def test_FakeAttachedRunInTerminalLauncherAndCheckEnvironment(self):
if not self.isTestSupported():
@@ -206,7 +200,6 @@ class TestDAP_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
self.assertIn("FOO=BAR", stdout)
@skipIfWindows
- @skipIfRemote
@skipIf(archs=no_match(["x86_64"]))
def test_NonAttachedRunInTerminalLauncher(self):
if not self.isTestSupported():
diff --git a/lldb/test/API/tools/lldb-dap/stackTrace/TestDAP_stackTrace.py b/lldb/test/API/tools/lldb-dap/stackTrace/TestDAP_stackTrace.py
index 70526cc..0d7776f 100644
--- a/lldb/test/API/tools/lldb-dap/stackTrace/TestDAP_stackTrace.py
+++ b/lldb/test/API/tools/lldb-dap/stackTrace/TestDAP_stackTrace.py
@@ -57,7 +57,6 @@ class TestDAP_stackTrace(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
def test_stackTrace(self):
"""
Tests the 'stackTrace' packet and all its variants.
@@ -190,7 +189,6 @@ class TestDAP_stackTrace(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
def test_functionNameWithArgs(self):
"""
Test that the stack frame without a function name is given its pc in the response.
diff --git a/lldb/test/API/tools/lldb-dap/stackTraceMissingFunctionName/TestDAP_stackTraceMissingFunctionName.py b/lldb/test/API/tools/lldb-dap/stackTraceMissingFunctionName/TestDAP_stackTraceMissingFunctionName.py
index 0011c0f..a04c752 100644
--- a/lldb/test/API/tools/lldb-dap/stackTraceMissingFunctionName/TestDAP_stackTraceMissingFunctionName.py
+++ b/lldb/test/API/tools/lldb-dap/stackTraceMissingFunctionName/TestDAP_stackTraceMissingFunctionName.py
@@ -13,7 +13,6 @@ from lldbsuite.test import lldbtest, lldbutil
class TestDAP_stackTraceMissingFunctionName(lldbdap_testcase.DAPTestCaseBase):
@skipIfWindows
- @skipIfRemote
def test_missingFunctionName(self):
"""
Test that the stack frame without a function name is given its pc in the response.
diff --git a/lldb/test/API/tools/lldb-dap/startDebugging/TestDAP_startDebugging.py b/lldb/test/API/tools/lldb-dap/startDebugging/TestDAP_startDebugging.py
index 7700c65..fd48e69 100644
--- a/lldb/test/API/tools/lldb-dap/startDebugging/TestDAP_startDebugging.py
+++ b/lldb/test/API/tools/lldb-dap/startDebugging/TestDAP_startDebugging.py
@@ -11,7 +11,6 @@ import lldbdap_testcase
class TestDAP_startDebugging(lldbdap_testcase.DAPTestCaseBase):
- @skipIfRemote
def test_startDebugging(self):
"""
Tests the "startDebugging" reverse request. It makes sure that the IDE can
diff --git a/lldb/test/API/tools/lldb-dap/step/TestDAP_step.py b/lldb/test/API/tools/lldb-dap/step/TestDAP_step.py
index 578e64e..8a1bb76 100644
--- a/lldb/test/API/tools/lldb-dap/step/TestDAP_step.py
+++ b/lldb/test/API/tools/lldb-dap/step/TestDAP_step.py
@@ -12,7 +12,6 @@ import lldbdap_testcase
class TestDAP_step(lldbdap_testcase.DAPTestCaseBase):
@skipIfWindows
- @skipIfRemote
def test_step(self):
"""
Tests the stepping in/out/over in threads.
diff --git a/lldb/test/API/tools/lldb-dap/stop-hooks/TestDAP_stop_hooks.py b/lldb/test/API/tools/lldb-dap/stop-hooks/TestDAP_stop_hooks.py
index c538e80..70c11a6 100644
--- a/lldb/test/API/tools/lldb-dap/stop-hooks/TestDAP_stop_hooks.py
+++ b/lldb/test/API/tools/lldb-dap/stop-hooks/TestDAP_stop_hooks.py
@@ -9,7 +9,6 @@ import lldbdap_testcase
class TestDAP_stop_hooks(lldbdap_testcase.DAPTestCaseBase):
- @skipIfRemote
def test_stop_hooks_before_run(self):
"""
Test that there is no race condition between lldb-dap and
diff --git a/lldb/test/API/tools/lldb-dap/terminated-event/TestDAP_terminatedEvent.py b/lldb/test/API/tools/lldb-dap/terminated-event/TestDAP_terminatedEvent.py
index ff5081a..6d1c25e 100644
--- a/lldb/test/API/tools/lldb-dap/terminated-event/TestDAP_terminatedEvent.py
+++ b/lldb/test/API/tools/lldb-dap/terminated-event/TestDAP_terminatedEvent.py
@@ -13,7 +13,6 @@ import json
class TestDAP_terminatedEvent(lldbdap_testcase.DAPTestCaseBase):
@skipIfWindows
- @skipIfRemote
def test_terminated_event(self):
"""
Terminated Event
diff --git a/lldb/test/API/tools/lldb-dap/threads/TestDAP_threads.py b/lldb/test/API/tools/lldb-dap/threads/TestDAP_threads.py
index f7f1ad7..6edb4b8 100644
--- a/lldb/test/API/tools/lldb-dap/threads/TestDAP_threads.py
+++ b/lldb/test/API/tools/lldb-dap/threads/TestDAP_threads.py
@@ -10,7 +10,6 @@ import lldbdap_testcase
class TestDAP_threads(lldbdap_testcase.DAPTestCaseBase):
@skipIfWindows
- @skipIfRemote
def test_correct_thread(self):
"""
Tests that the correct thread is selected if we continue from
@@ -45,7 +44,6 @@ class TestDAP_threads(lldbdap_testcase.DAPTestCaseBase):
self.assertTrue(stopped_event[0]["body"]["threadCausedFocus"])
@skipIfWindows
- @skipIfRemote
def test_thread_format(self):
"""
Tests the support for custom thread formats.
diff --git a/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py b/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py
index 57c17e5..3c6901b 100644
--- a/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py
+++ b/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py
@@ -394,14 +394,12 @@ class TestDAP_variables(lldbdap_testcase.DAPTestCaseBase):
self.verify_variables(verify_locals, locals)
@skipIfWindows
- @skipIfRemote
def test_scopes_variables_setVariable_evaluate(self):
self.do_test_scopes_variables_setVariable_evaluate(
enableAutoVariableSummaries=False
)
@skipIfWindows
- @skipIfRemote
def test_scopes_variables_setVariable_evaluate_with_descriptive_summaries(self):
self.do_test_scopes_variables_setVariable_evaluate(
enableAutoVariableSummaries=True
@@ -603,12 +601,10 @@ class TestDAP_variables(lldbdap_testcase.DAPTestCaseBase):
self.assertEqual(scope.get("presentationHint"), "registers")
@skipIfWindows
- @skipIfRemote
def test_scopes_and_evaluate_expansion(self):
self.do_test_scopes_and_evaluate_expansion(enableAutoVariableSummaries=False)
@skipIfWindows
- @skipIfRemote
def test_scopes_and_evaluate_expansion_with_descriptive_summaries(self):
self.do_test_scopes_and_evaluate_expansion(enableAutoVariableSummaries=True)
@@ -664,17 +660,14 @@ class TestDAP_variables(lldbdap_testcase.DAPTestCaseBase):
self.verify_variables(verify_children, children)
@skipIfWindows
- @skipIfRemote
def test_indexedVariables(self):
self.do_test_indexedVariables(enableSyntheticChildDebugging=False)
@skipIfWindows
- @skipIfRemote
def test_indexedVariables_with_raw_child_for_synthetics(self):
self.do_test_indexedVariables(enableSyntheticChildDebugging=True)
@skipIfWindows
- @skipIfRemote
def test_registers(self):
"""
Test that registers whose byte size is the size of a pointer on
@@ -748,7 +741,6 @@ class TestDAP_variables(lldbdap_testcase.DAPTestCaseBase):
@no_debug_info_test
@skipIfWindows
- @skipIfRemote
def test_value_format(self):
"""
Test that toggle variables value format between decimal and hexical works.
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/invalid_abbreviation.s b/lldb/test/Shell/SymbolFile/DWARF/x86/invalid_abbreviation.s
new file mode 100644
index 0000000..3f32c03
--- /dev/null
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/invalid_abbreviation.s
@@ -0,0 +1,47 @@
+# REQUIRES: x86
+
+# RUN: llvm-mc -triple=x86_64-pc-linux -filetype=obj %s > %t
+# RUN: %lldb %t \
+# RUN: -o exit 2>&1 | FileCheck %s
+
+# CHECK-DAG: error: {{.*}} [0x0000000000000022]: abbreviation code 65536 too big, please file a bug and attach the file at the start of this error message
+# CHECK-DAG: error: {{.*}} [0x0000000000000048]: invalid abbreviation code 47, please file a bug and attach the file at the start of this error message
+
+
+ .section .debug_abbrev,"",@progbits
+ .uleb128 65535 # Largest representable Abbreviation Code
+ .byte 17 # DW_TAG_compile_unit
+ .byte 1 # DW_CHILDREN_yes
+ .byte 37 # DW_AT_producer
+ .byte 8 # DW_FORM_string
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+
+ .section .debug_info,"",@progbits
+.Lcu_begin0:
+ .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+ .short 5 # DWARF version number
+ .byte 1 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .uleb128 65535 # DW_TAG_compile_unit
+ .asciz "Hand-written DWARF" # DW_AT_producer
+ .uleb128 65536 # Unrepresentable abbreviation
+ .byte 0 # End Of Children Mark
+.Ldebug_info_end0:
+
+ .section .debug_info,"",@progbits
+.Lcu_begin1:
+ .long .Ldebug_info_end1-.Ldebug_info_start1 # Length of Unit
+.Ldebug_info_start1:
+ .short 5 # DWARF version number
+ .byte 1 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .uleb128 65535 # DW_TAG_compile_unit
+ .asciz "Hand-written DWARF" # DW_AT_producer
+ .byte 47 # Missing abbreviation
+ .byte 0 # End Of Children Mark
+.Ldebug_info_end1:
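
Editor's note: the .uleb128 directives in the new test emit LEB128-encoded abbreviation codes, which is why the assembly can express 65536 even though the CHECK lines expect LLDB to reject it as "too big" (the limit is LLDB's, not the encoding's). A small sketch, assuming llvm/Support/LEB128.h, showing the byte sequences these directives produce:

#include "llvm/Support/LEB128.h"
#include <cassert>
#include <cstdint>

int main() {
  uint8_t buf[8];
  // 65535, the largest code the test labels representable: 0xff 0xff 0x03.
  unsigned n = llvm::encodeULEB128(65535, buf);
  assert(n == 3 && buf[0] == 0xff && buf[1] == 0xff && buf[2] == 0x03);
  // 65536 still encodes fine as ULEB128 (0x80 0x80 0x04); the error in the
  // CHECK lines above comes from LLDB's own limit on abbreviation codes.
  n = llvm::encodeULEB128(65536, buf);
  assert(n == 3 && buf[0] == 0x80 && buf[1] == 0x80 && buf[2] == 0x04);
  return 0;
}
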
diff --git a/lldb/tools/lldb-dap/DAP.h b/lldb/tools/lldb-dap/DAP.h
index bbd9d46..a88ee3e 100644
--- a/lldb/tools/lldb-dap/DAP.h
+++ b/lldb/tools/lldb-dap/DAP.h
@@ -26,6 +26,7 @@
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/JSON.h"
+#include "llvm/Support/Threading.h"
#include "llvm/Support/raw_ostream.h"
#include "lldb/API/SBAttachInfo.h"
@@ -169,6 +170,7 @@ struct DAP {
std::optional<llvm::json::Object> last_launch_or_attach_request;
lldb::tid_t focus_tid;
bool disconnecting = false;
+ llvm::once_flag terminated_event_flag;
bool stop_at_entry;
bool is_attach;
bool enable_auto_variable_summaries;
diff --git a/lldb/tools/lldb-dap/lldb-dap.cpp b/lldb/tools/lldb-dap/lldb-dap.cpp
index 170fa88..7746afb 100644
--- a/lldb/tools/lldb-dap/lldb-dap.cpp
+++ b/lldb/tools/lldb-dap/lldb-dap.cpp
@@ -227,13 +227,12 @@ void SendContinuedEvent() {
// debugged.
void SendTerminatedEvent() {
// Prevent races if the process exits while we're being asked to disconnect.
- static std::mutex mutex;
- std::lock_guard<std::mutex> locker(mutex);
-
- g_dap.RunTerminateCommands();
- // Send a "terminated" event
- llvm::json::Object event(CreateTerminatedEventObject());
- g_dap.SendJSON(llvm::json::Value(std::move(event)));
+ llvm::call_once(g_dap.terminated_event_flag, [&] {
+ g_dap.RunTerminateCommands();
+ // Send a "terminated" event
+ llvm::json::Object event(CreateTerminatedEventObject());
+ g_dap.SendJSON(llvm::json::Value(std::move(event)));
+ });
}
// Send a thread stopped event for all threads as long as the process
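
Editor's note: the DAP.h/lldb-dap.cpp pair above replaces a function-local static std::mutex with an llvm::once_flag on the DAP object, so the terminate commands and the "terminated" event are emitted at most once rather than merely serialized. A minimal sketch of the idiom, with a hypothetical Emitter type standing in for the DAP struct:

#include "llvm/Support/Threading.h"
#include <cstdio>

struct Emitter {
  llvm::once_flag terminated_flag;

  void SendTerminated() {
    // llvm::call_once runs the lambda exactly once even if several threads
    // (e.g. a process-exit handler and a disconnect request) race here.
    llvm::call_once(terminated_flag, [&] { std::puts("terminated"); });
  }
};
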
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index f3aeafa..612e90a 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -539,8 +539,6 @@ set(FFI_INCLUDE_DIR "" CACHE PATH "Additional directory, where CMake should sear
set(LLVM_TARGET_ARCH "host"
CACHE STRING "Set target to use for LLVM JIT or use \"host\" for automatic detection.")
-option(LLVM_ENABLE_TERMINFO "Use terminfo database if available." ON)
-
set(LLVM_ENABLE_LIBXML2 "ON" CACHE STRING "Use libxml2 if available. Can be ON, OFF, or FORCE_ON")
option(LLVM_ENABLE_LIBEDIT "Use libedit if available." ON)
diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index bf1b110..8cfb36b0 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -240,21 +240,11 @@ if(NOT LLVM_USE_SANITIZER MATCHES "Memory.*")
else()
set(HAVE_LIBEDIT 0)
endif()
- if(LLVM_ENABLE_TERMINFO)
- if(LLVM_ENABLE_TERMINFO STREQUAL FORCE_ON)
- find_package(Terminfo REQUIRED)
- else()
- find_package(Terminfo)
- endif()
- set(LLVM_ENABLE_TERMINFO "${Terminfo_FOUND}")
- endif()
else()
set(HAVE_LIBEDIT 0)
- set(LLVM_ENABLE_TERMINFO 0)
endif()
else()
set(HAVE_LIBEDIT 0)
- set(LLVM_ENABLE_TERMINFO 0)
endif()
# function checks
@@ -415,15 +405,18 @@ if( LLVM_ENABLE_PIC )
set(ENABLE_PIC 1)
else()
set(ENABLE_PIC 0)
- check_cxx_compiler_flag("-fno-pie" SUPPORTS_NO_PIE_FLAG)
- if(SUPPORTS_NO_PIE_FLAG)
- set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fno-pie")
- endif()
+ set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fno-pie")
endif()
-check_cxx_compiler_flag("-Wvariadic-macros" SUPPORTS_VARIADIC_MACROS_FLAG)
-check_cxx_compiler_flag("-Wgnu-zero-variadic-macro-arguments"
- SUPPORTS_GNU_ZERO_VARIADIC_MACRO_ARGUMENTS_FLAG)
+set(SUPPORTS_VARIADIC_MACROS_FLAG 0)
+if (LLVM_COMPILER_IS_GCC_COMPATIBLE)
+ set(SUPPORTS_VARIADIC_MACROS_FLAG 1)
+endif()
+if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ set(SUPPORTS_GNU_ZERO_VARIADIC_MACRO_ARGUMENTS_FLAG 1)
+else()
+ set(SUPPORTS_GNU_ZERO_VARIADIC_MACRO_ARGUMENTS_FLAG 0)
+endif()
set(USE_NO_MAYBE_UNINITIALIZED 0)
set(USE_NO_UNINITIALIZED 0)
@@ -433,11 +426,9 @@ set(USE_NO_UNINITIALIZED 0)
if (CMAKE_COMPILER_IS_GNUCXX)
# Disable all -Wuninitialized warning for old GCC versions.
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 12.0)
- check_cxx_compiler_flag("-Wuninitialized" HAS_UNINITIALIZED)
- set(USE_NO_UNINITIALIZED ${HAS_UNINITIALIZED})
+ set(USE_NO_UNINITIALIZED 1)
else()
- check_cxx_compiler_flag("-Wmaybe-uninitialized" HAS_MAYBE_UNINITIALIZED)
- set(USE_NO_MAYBE_UNINITIALIZED ${HAS_MAYBE_UNINITIALIZED})
+ set(USE_NO_MAYBE_UNINITIALIZED 1)
endif()
endif()
diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake
index ecbae8a..03f4e1f 100644
--- a/llvm/cmake/modules/AddLLVM.cmake
+++ b/llvm/cmake/modules/AddLLVM.cmake
@@ -276,11 +276,11 @@ if (NOT DEFINED LLVM_LINKER_DETECTED AND NOT WIN32)
if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
include(CheckLinkerFlag)
- # Linkers that support Darwin allow a setting to internalize all symbol exports,
+ # Linkers that support Darwin allow a setting to internalize all symbol exports,
# aiding in reducing binary size and often is applicable for executables.
check_linker_flag(C "-Wl,-no_exported_symbols" LLVM_LINKER_SUPPORTS_NO_EXPORTED_SYMBOLS)
-
- if (NOT LLVM_USE_LINKER)
+
+ if (NOT LLVM_USE_LINKER)
# Apple's linker complains about duplicate libraries, which CMake likes to do
# to support ELF platforms. To silence that warning, we can use
# -no_warn_duplicate_libraries, but only in versions of the linker that
@@ -289,8 +289,8 @@ if (NOT DEFINED LLVM_LINKER_DETECTED AND NOT WIN32)
else()
set(LLVM_LINKER_SUPPORTS_NO_WARN_DUPLICATE_LIBRARIES OFF CACHE INTERNAL "")
endif()
-
- else()
+
+ else()
set(LLVM_LINKER_SUPPORTS_NO_EXPORTED_SYMBOLS OFF CACHE INTERNAL "")
endif()
endif()
@@ -1069,7 +1069,7 @@ macro(add_llvm_executable name)
add_llvm_symbol_exports( ${name} ${LLVM_EXPORTED_SYMBOL_FILE} )
endif(LLVM_EXPORTED_SYMBOL_FILE)
- if (DEFINED LLVM_ENABLE_EXPORTED_SYMBOLS_IN_EXECUTABLES AND
+ if (DEFINED LLVM_ENABLE_EXPORTED_SYMBOLS_IN_EXECUTABLES AND
NOT LLVM_ENABLE_EXPORTED_SYMBOLS_IN_EXECUTABLES)
if(LLVM_LINKER_SUPPORTS_NO_EXPORTED_SYMBOLS)
set_property(TARGET ${name} APPEND_STRING PROPERTY
@@ -1677,7 +1677,7 @@ function(add_unittest test_suite test_name)
if (SUPPORTS_VARIADIC_MACROS_FLAG)
list(APPEND LLVM_COMPILE_FLAGS "-Wno-variadic-macros")
- endif ()
+ endif()
# Some parts of gtest rely on this GNU extension, don't warn on it.
if(SUPPORTS_GNU_ZERO_VARIADIC_MACRO_ARGUMENTS_FLAG)
list(APPEND LLVM_COMPILE_FLAGS "-Wno-gnu-zero-variadic-macro-arguments")
diff --git a/llvm/cmake/modules/FindTerminfo.cmake b/llvm/cmake/modules/FindTerminfo.cmake
deleted file mode 100644
index 163af66..0000000
--- a/llvm/cmake/modules/FindTerminfo.cmake
+++ /dev/null
@@ -1,55 +0,0 @@
-# Attempts to discover terminfo library with a linkable setupterm function.
-#
-# Example usage:
-#
-# find_package(Terminfo)
-#
-# If successful, the following variables will be defined:
-# Terminfo_FOUND
-# Terminfo_LIBRARIES
-#
-# Additionally, the following import target will be defined:
-# Terminfo::terminfo
-
-find_library(Terminfo_LIBRARIES NAMES terminfo tinfo curses ncurses ncursesw)
-
-if(Terminfo_LIBRARIES)
- include(CMakePushCheckState)
- cmake_push_check_state()
- list(APPEND CMAKE_REQUIRED_LIBRARIES ${Terminfo_LIBRARIES})
- set(Terminfo_LINKABLE_SRC [=[
- #ifdef __cplusplus
- extern "C" {
- #endif
- int setupterm(char *term, int filedes, int *errret);
- #ifdef __cplusplus
- }
- #endif
- int main(void) { return setupterm(0, 0, 0); }
- ]=])
- if(DEFINED CMAKE_C_COMPILER)
- include(CheckCSourceCompiles)
- check_c_source_compiles("${Terminfo_LINKABLE_SRC}" Terminfo_LINKABLE)
- else()
- include(CheckCXXSourceCompiles)
- check_cxx_source_compiles("${Terminfo_LINKABLE_SRC}" Terminfo_LINKABLE)
- endif()
- cmake_pop_check_state()
-endif()
-
-include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(Terminfo
- FOUND_VAR
- Terminfo_FOUND
- REQUIRED_VARS
- Terminfo_LIBRARIES
- Terminfo_LINKABLE)
-mark_as_advanced(Terminfo_LIBRARIES
- Terminfo_LINKABLE)
-
-if(Terminfo_FOUND)
- if(NOT TARGET Terminfo::terminfo)
- add_library(Terminfo::terminfo UNKNOWN IMPORTED)
- set_target_properties(Terminfo::terminfo PROPERTIES IMPORTED_LOCATION "${Terminfo_LIBRARIES}")
- endif()
-endif()
diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake
index d16641d..99d848b 100644
--- a/llvm/cmake/modules/HandleLLVMOptions.cmake
+++ b/llvm/cmake/modules/HandleLLVMOptions.cmake
@@ -425,7 +425,7 @@ if( LLVM_ENABLE_PIC )
# GCC for MIPS can miscompile LLVM due to PR37701.
if(CMAKE_COMPILER_IS_GNUCXX AND LLVM_NATIVE_ARCH STREQUAL "Mips" AND
NOT Uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG")
- add_flag_or_print_warning("-fno-shrink-wrap" FNO_SHRINK_WRAP)
+ append("-fno-shrink-wrap" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
endif()
# gcc with -O3 -fPIC generates TLS sequences that violate the spec on
# Solaris/sparcv9, causing executables created with the system linker
@@ -635,18 +635,16 @@ if( MSVC )
# This checks CMAKE_CXX_COMPILER_ID in addition to check_cxx_compiler_flag()
# because cl.exe does not emit an error on flags it doesn't understand,
# letting check_cxx_compiler_flag() claim it understands all flags.
- check_cxx_compiler_flag("/Brepro" SUPPORTS_BREPRO)
- if (SUPPORTS_BREPRO)
- # Check if /INCREMENTAL is passed to the linker and complain that it
- # won't work with /Brepro.
- has_msvc_incremental_no_flag("${CMAKE_EXE_LINKER_FLAGS_${uppercase_CMAKE_BUILD_TYPE}} ${CMAKE_EXE_LINKER_FLAGS}" NO_INCR_EXE)
- has_msvc_incremental_no_flag("${CMAKE_MODULE_LINKER_FLAGS_${uppercase_CMAKE_BUILD_TYPE}} ${CMAKE_MODULE_LINKER_FLAGS}" NO_INCR_MODULE)
- has_msvc_incremental_no_flag("${CMAKE_SHARED_LINKER_FLAGS_${uppercase_CMAKE_BUILD_TYPE}} ${CMAKE_SHARED_LINKER_FLAGS}" NO_INCR_SHARED)
- if (NO_INCR_EXE AND NO_INCR_MODULE AND NO_INCR_SHARED)
- append("/Brepro" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
- else()
- message(WARNING "/Brepro not compatible with /INCREMENTAL linking - builds will be non-deterministic")
- endif()
+
+ # Check if /INCREMENTAL is passed to the linker and complain that it
+ # won't work with /Brepro.
+ has_msvc_incremental_no_flag("${CMAKE_EXE_LINKER_FLAGS_${uppercase_CMAKE_BUILD_TYPE}} ${CMAKE_EXE_LINKER_FLAGS}" NO_INCR_EXE)
+ has_msvc_incremental_no_flag("${CMAKE_MODULE_LINKER_FLAGS_${uppercase_CMAKE_BUILD_TYPE}} ${CMAKE_MODULE_LINKER_FLAGS}" NO_INCR_MODULE)
+ has_msvc_incremental_no_flag("${CMAKE_SHARED_LINKER_FLAGS_${uppercase_CMAKE_BUILD_TYPE}} ${CMAKE_SHARED_LINKER_FLAGS}" NO_INCR_SHARED)
+ if (NO_INCR_EXE AND NO_INCR_MODULE AND NO_INCR_SHARED)
+ append("/Brepro" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+ else()
+ message(WARNING "/Brepro not compatible with /INCREMENTAL linking - builds will be non-deterministic")
endif()
endif()
# By default MSVC has a 2^16 limit on the number of sections in an object file,
@@ -667,19 +665,22 @@ endif( LLVM_COMPILER_IS_GCC_COMPATIBLE )
# Specific default warnings-as-errors for compilers accepting GCC-compatible warning flags:
if ( LLVM_COMPILER_IS_GCC_COMPATIBLE OR CMAKE_CXX_COMPILER_ID MATCHES "XL" )
- add_flag_if_supported("-Werror=date-time" WERROR_DATE_TIME)
- add_flag_if_supported("-Werror=unguarded-availability-new" WERROR_UNGUARDED_AVAILABILITY_NEW)
+ append("-Werror=date-time" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
endif( LLVM_COMPILER_IS_GCC_COMPATIBLE OR CMAKE_CXX_COMPILER_ID MATCHES "XL" )
-if ( LLVM_COMPILER_IS_GCC_COMPATIBLE )
+if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ append("-Werror=unguarded-availability-new" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+endif()
+
+if (CMAKE_CXX_COMPILER_ID STREQUAL "GCC")
# LLVM data structures like llvm::User and llvm::MDNode rely on
# the value of object storage persisting beyond the lifetime of the
# object (#24952). This is not standard compliant and causes a runtime
# crash if LLVM is built with GCC and LTO enabled (#57740). Until
# these bugs are fixed, we need to disable dead store eliminations
# based on object lifetime.
- add_flag_if_supported("-fno-lifetime-dse" CMAKE_CXX_FLAGS)
-endif ( LLVM_COMPILER_IS_GCC_COMPATIBLE )
+ append("-fno-lifetime-dse" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+endif ()
# Modules enablement for GCC-compatible compilers:
if ( LLVM_COMPILER_IS_GCC_COMPATIBLE AND LLVM_ENABLE_MODULES )
@@ -697,22 +698,7 @@ if ( LLVM_COMPILER_IS_GCC_COMPATIBLE AND LLVM_ENABLE_MODULES )
(uppercase_CMAKE_BUILD_TYPE STREQUAL "RELWITHDEBINFO")))
set(module_flags "${module_flags} -gmodules")
endif()
- set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${module_flags}")
-
- # Check that we can build code with modules enabled, and that repeatedly
- # including <cassert> still manages to respect NDEBUG properly.
- CHECK_CXX_SOURCE_COMPILES("#undef NDEBUG
- #include <cassert>
- #define NDEBUG
- #include <cassert>
- int main() { assert(this code is not compiled); }"
- CXX_SUPPORTS_MODULES)
- set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS})
- if (CXX_SUPPORTS_MODULES)
- append("${module_flags}" CMAKE_CXX_FLAGS)
- else()
- message(FATAL_ERROR "LLVM_ENABLE_MODULES is not supported by this compiler")
- endif()
+ append("${module_flags}" CMAKE_CXX_FLAGS)
endif( LLVM_COMPILER_IS_GCC_COMPATIBLE AND LLVM_ENABLE_MODULES )
if (MSVC)
@@ -814,13 +800,10 @@ if (LLVM_ENABLE_WARNINGS AND (LLVM_COMPILER_IS_GCC_COMPATIBLE OR CLANG_CL))
# Turn off missing field initializer warnings for gcc to avoid noise from
# false positives with empty {}. Turn them on otherwise (they're off by
# default for clang).
- check_cxx_compiler_flag("-Wmissing-field-initializers" CXX_SUPPORTS_MISSING_FIELD_INITIALIZERS_FLAG)
- if (CXX_SUPPORTS_MISSING_FIELD_INITIALIZERS_FLAG)
- if (CMAKE_COMPILER_IS_GNUCXX)
- append("-Wno-missing-field-initializers" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
- else()
- append("-Wmissing-field-initializers" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
- endif()
+ if (CMAKE_COMPILER_IS_GNUCXX)
+ append("-Wno-missing-field-initializers" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+ else()
+ append("-Wmissing-field-initializers" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
endif()
if (LLVM_ENABLE_PEDANTIC AND LLVM_COMPILER_IS_GCC_COMPATIBLE)
@@ -833,8 +816,13 @@ if (LLVM_ENABLE_WARNINGS AND (LLVM_COMPILER_IS_GCC_COMPATIBLE OR CLANG_CL))
add_flag_if_supported("-Wc++98-compat-extra-semi" CXX98_COMPAT_EXTRA_SEMI_FLAG)
endif()
- add_flag_if_supported("-Wimplicit-fallthrough" IMPLICIT_FALLTHROUGH_FLAG)
- add_flag_if_supported("-Wcovered-switch-default" COVERED_SWITCH_DEFAULT_FLAG)
+ append("-Wimplicit-fallthrough" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+
+ set(CXX_SUPPORTS_COVERED_SWITCH_DEFAULT_FLAG 0)
+ if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ set(CXX_SUPPORTS_COVERED_SWITCH_DEFAULT_FLAG 1)
+ append("-Wcovered-switch-default" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+ endif()
append_if(USE_NO_UNINITIALIZED "-Wno-uninitialized" CMAKE_CXX_FLAGS)
append_if(USE_NO_MAYBE_UNINITIALIZED "-Wno-maybe-uninitialized" CMAKE_CXX_FLAGS)
@@ -845,38 +833,32 @@ if (LLVM_ENABLE_WARNINGS AND (LLVM_COMPILER_IS_GCC_COMPATIBLE OR CLANG_CL))
# Disable -Wclass-memaccess, a C++-only warning from GCC 8 that fires on
# LLVM's ADT classes.
- check_cxx_compiler_flag("-Wclass-memaccess" CXX_SUPPORTS_CLASS_MEMACCESS_FLAG)
- append_if(CXX_SUPPORTS_CLASS_MEMACCESS_FLAG "-Wno-class-memaccess" CMAKE_CXX_FLAGS)
+ if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+ if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1)
+ append("-Wno-class-memaccess" CMAKE_CXX_FLAGS)
+ endif()
+ endif()
# Disable -Wredundant-move and -Wpessimizing-move on GCC>=9. GCC wants to
- # remove std::move in code like "A foo(ConvertibleToA a) {
- # return std::move(a); }", but this code does not compile (or uses the copy
+ # remove std::move in code like
+ # "A foo(ConvertibleToA a) { return std::move(a); }",
+ # but this code does not compile (or uses the copy
# constructor instead) on clang<=3.8. Clang also has a -Wredundant-move and
# -Wpessimizing-move, but they only fire when the types match exactly, so we
# can keep them here.
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
- check_cxx_compiler_flag("-Wredundant-move" CXX_SUPPORTS_REDUNDANT_MOVE_FLAG)
- append_if(CXX_SUPPORTS_REDUNDANT_MOVE_FLAG "-Wno-redundant-move" CMAKE_CXX_FLAGS)
- check_cxx_compiler_flag("-Wpessimizing-move" CXX_SUPPORTS_PESSIMIZING_MOVE_FLAG)
- append_if(CXX_SUPPORTS_PESSIMIZING_MOVE_FLAG "-Wno-pessimizing-move" CMAKE_CXX_FLAGS)
+ if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 9.1)
+ append("-Wno-redundant-move" CMAKE_CXX_FLAGS)
+ append("-Wno-pessimizing-move" CMAKE_CXX_FLAGS)
+ endif()
endif()
# The LLVM libraries have no stable C++ API, so -Wnoexcept-type is not useful.
- check_cxx_compiler_flag("-Wnoexcept-type" CXX_SUPPORTS_NOEXCEPT_TYPE_FLAG)
- append_if(CXX_SUPPORTS_NOEXCEPT_TYPE_FLAG "-Wno-noexcept-type" CMAKE_CXX_FLAGS)
-
- # Check if -Wnon-virtual-dtor warns for a class marked final, when it has a
- # friend declaration. If it does, don't add -Wnon-virtual-dtor. The case is
- # considered unhelpful (https://gcc.gnu.org/PR102168).
- set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
- set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -Werror=non-virtual-dtor")
- CHECK_CXX_SOURCE_COMPILES("class f {};
- class base {friend f; public: virtual void anchor();protected: ~base();};
- int main() { return 0; }"
- CXX_WONT_WARN_ON_FINAL_NONVIRTUALDTOR)
- set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS})
- append_if(CXX_WONT_WARN_ON_FINAL_NONVIRTUALDTOR "-Wnon-virtual-dtor" CMAKE_CXX_FLAGS)
+ append("-Wno-noexcept-type" CMAKE_CXX_FLAGS)
+ if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ append("-Wnon-virtual-dtor" CMAKE_CXX_FLAGS)
+ endif()
append("-Wdelete-non-virtual-dtor" CMAKE_CXX_FLAGS)
# Enable -Wsuggest-override if it's available, and only if it doesn't
@@ -906,14 +888,15 @@ if (LLVM_ENABLE_WARNINGS AND (LLVM_COMPILER_IS_GCC_COMPATIBLE OR CLANG_CL))
endif()
# Enable -Wstring-conversion to catch misuse of string literals.
- add_flag_if_supported("-Wstring-conversion" STRING_CONVERSION_FLAG)
+ if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ append("-Wstring-conversion" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+ endif()
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
# Disable the misleading indentation warning with GCC; GCC can
# produce noisy notes about this getting disabled in large files.
# See e.g. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89549
- check_cxx_compiler_flag("-Wmisleading-indentation" CXX_SUPPORTS_MISLEADING_INDENTATION_FLAG)
- append_if(CXX_SUPPORTS_MISLEADING_INDENTATION_FLAG "-Wno-misleading-indentation" CMAKE_CXX_FLAGS)
+ append("-Wno-misleading-indentation" CMAKE_CXX_FLAGS)
else()
# Prevent bugs that can happen with llvm's brace style.
add_flag_if_supported("-Wmisleading-indentation" MISLEADING_INDENTATION_FLAG)
@@ -931,14 +914,15 @@ macro(append_common_sanitizer_flags)
if (NOT MSVC OR CLANG_CL)
# Append -fno-omit-frame-pointer and turn on debug info to get better
# stack traces.
- add_flag_if_supported("-fno-omit-frame-pointer" FNO_OMIT_FRAME_POINTER)
+ append("-fno-omit-frame-pointer" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
if (NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" AND
- NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "RELWITHDEBINFO")
- add_flag_if_supported("-gline-tables-only" GLINE_TABLES_ONLY)
+ NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "RELWITHDEBINFO" AND
+ CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ append("-gline-tables-only" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
endif()
# Use -O1 even in debug mode, otherwise sanitizers slowdown is too large.
if (uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" AND LLVM_OPTIMIZE_SANITIZED_BUILDS)
- add_flag_if_supported("-O1" O1)
+ append("-O1" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
endif()
else()
# Always ask the linker to produce symbols with asan.
@@ -1112,15 +1096,12 @@ endif()
if(NOT CYGWIN AND NOT MSVC)
if(NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin" AND
NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG")
- check_c_compiler_flag("-Werror -fno-function-sections" C_SUPPORTS_FNO_FUNCTION_SECTIONS)
- if (C_SUPPORTS_FNO_FUNCTION_SECTIONS)
- # Don't add -ffunction-sections if it can't be disabled with -fno-function-sections.
- # Doing so will break sanitizers.
- add_flag_if_supported("-ffunction-sections" FFUNCTION_SECTIONS)
- elseif (CMAKE_CXX_COMPILER_ID MATCHES "XL")
+ if (CMAKE_CXX_COMPILER_ID MATCHES "XL")
append("-qfuncsect" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+ else()
+ append("-ffunction-sections" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
endif()
- add_flag_if_supported("-fdata-sections" FDATA_SECTIONS)
+ append("-fdata-sections" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
endif()
elseif(MSVC)
if( NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" )
@@ -1385,7 +1366,9 @@ if(LLVM_USE_RELATIVE_PATHS_IN_DEBUG_INFO)
file(RELATIVE_PATH relative_root "${CMAKE_BINARY_DIR}" "${source_root}")
append_if(SUPPORTS_FDEBUG_PREFIX_MAP "-fdebug-prefix-map=${CMAKE_BINARY_DIR}=${relative_root}" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
append_if(SUPPORTS_FDEBUG_PREFIX_MAP "-fdebug-prefix-map=${source_root}/=${LLVM_SOURCE_PREFIX}" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
- add_flag_if_supported("-no-canonical-prefixes" NO_CANONICAL_PREFIXES)
+ if (LLVM_COMPILER_IS_GCC_COMPATIBLE)
+ append("-no-canonical-prefixes" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+ endif()
endif()
option(LLVM_USE_RELATIVE_PATHS_IN_FILES "Use relative paths in sources and debug info" OFF)
@@ -1400,7 +1383,9 @@ if(LLVM_USE_RELATIVE_PATHS_IN_FILES)
file(RELATIVE_PATH relative_root "${CMAKE_BINARY_DIR}" "${source_root}")
append_if(SUPPORTS_FFILE_PREFIX_MAP "-ffile-prefix-map=${CMAKE_BINARY_DIR}=${relative_root}" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
append_if(SUPPORTS_FFILE_PREFIX_MAP "-ffile-prefix-map=${source_root}/=${LLVM_SOURCE_PREFIX}" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
- add_flag_if_supported("-no-canonical-prefixes" NO_CANONICAL_PREFIXES)
+ if (LLVM_COMPILER_IS_GCC_COMPATIBLE)
+ append("-no-canonical-prefixes" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+ endif()
endif()
set(LLVM_THIRD_PARTY_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../third-party CACHE STRING
diff --git a/llvm/cmake/modules/LLVMConfig.cmake.in b/llvm/cmake/modules/LLVMConfig.cmake.in
index 397bd581..7e1501a 100644
--- a/llvm/cmake/modules/LLVMConfig.cmake.in
+++ b/llvm/cmake/modules/LLVMConfig.cmake.in
@@ -60,11 +60,6 @@ if(LLVM_ENABLE_LIBEDIT)
find_package(LibEdit)
endif()
-set(LLVM_ENABLE_TERMINFO @LLVM_ENABLE_TERMINFO@)
-if(LLVM_ENABLE_TERMINFO)
- find_package(Terminfo)
-endif()
-
set(LLVM_ENABLE_THREADS @LLVM_ENABLE_THREADS@)
set(LLVM_ENABLE_UNWIND_TABLES @LLVM_ENABLE_UNWIND_TABLES@)
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 75536bc..1004956 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1970,6 +1970,8 @@ The AMDGPU backend uses the following ELF header:
``EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC`` 0x053 ``gfx10-3-generic``
``EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC`` 0x054 ``gfx11-generic``
*reserved* 0x055 Reserved.
+ *reserved* 0x056 Reserved.
+ *reserved* 0x057 Reserved.
========================================== ========== =============================
Sections
diff --git a/llvm/docs/GettingInvolved.rst b/llvm/docs/GettingInvolved.rst
index 3588ef14..646f1d0 100644
--- a/llvm/docs/GettingInvolved.rst
+++ b/llvm/docs/GettingInvolved.rst
@@ -349,6 +349,11 @@ The :doc:`CodeOfConduct` applies to all office hours.
- Every two weeks, Wednesdays at 2:00pm US Pacific, for 90 minutes.
- Livestream chat or `Google meet <https://meet.google.com/wit-tvzc-dwc>`__
- English
+ * - Renato Golin
+ - General LLVM, MLIR & Linalg, distributed computing, research, socials.
+ - Every first Tuesday of the month, 11:00am UK time, for 60 minutes.
+ - `Google meet <https://meet.google.com/esg-fggc-hfe>`__
+ - English, Portuguese
* - Rotating hosts
- Getting Started, beginner questions, new contributors.
- Every Tuesday at 2 PM ET (11 AM PT), for 30 minutes.
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 358eb4b..d2d21c7 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -15761,8 +15761,8 @@ The arguments and return value are floating-point numbers of the same type.
Semantics:
""""""""""
-Return the same value as a corresponding libm '``fma``' function but without
-trapping or setting ``errno``.
+Return the same value as the IEEE-754 fusedMultiplyAdd operation. This
+is assumed to not trap or set ``errno``.
When specified with the fast-math-flag 'afn', the result may be approximated
using a less accurate calculation.
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index cba36c7..a495e6cb 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -62,6 +62,10 @@ Changes to LLVM infrastructure
Changes to building LLVM
------------------------
+- The ``LLVM_ENABLE_TERMINFO`` flag has been removed. LLVM no longer depends on
+ terminfo and now always uses the ``TERM`` environment variable for color
+ support autodetection.
+
Changes to TableGen
-------------------
@@ -130,6 +134,7 @@ Changes to the RISC-V Backend
match GNU objdump. The bytes within the groups are in big endian order.
* Added smstateen extension to -march. CSR names for smstateen were already supported.
* Zaamo and Zalrsc are no longer experimental.
+* Processors that enable the post-register-allocation scheduler (PostMachineScheduler)
+  by default should use the ``UsePostRAScheduler`` subtarget feature. Setting
+  ``PostRAScheduler = 1`` in the scheduler model no longer enables the
+  PostMachineScheduler.
Changes to the WebAssembly Backend
----------------------------------
@@ -140,6 +145,9 @@ Changes to the Windows Target
Changes to the X86 Backend
--------------------------
+- Removed KNL/KNM-specific ISA intrinsics AVX512PF, AVX512ER and PREFETCHWT1,
+  while assembly encoding/decoding support is kept.
+
Changes to the OCaml bindings
-----------------------------
@@ -239,6 +247,11 @@ Changes to the LLVM tools
documented in `--help` output and the command guide. (`#90474
<https://github.com/llvm/llvm-project/pull/90474>`)
+* llvm-readobj's LLVM output format for ELF core files has been changed.
+ Similarly, the JSON format has been fixed for this case. The NT_FILE note
+ now has a map for the mapped files. (`#92835
+ <https://github.com/llvm/llvm-project/pull/92835>`).
+
Changes to LLDB
---------------------------------
diff --git a/llvm/docs/SPIRVUsage.rst b/llvm/docs/SPIRVUsage.rst
index d27177a..657b0fb 100644
--- a/llvm/docs/SPIRVUsage.rst
+++ b/llvm/docs/SPIRVUsage.rst
@@ -143,6 +143,8 @@ list of supported SPIR-V extensions, sorted alphabetically by their extension na
- Adds instructions to convert between single-precision 32-bit floating-point values and 16-bit bfloat16 values.
* - ``SPV_INTEL_function_pointers``
- Allows translation of function pointers.
+ * - ``SPV_INTEL_inline_assembly``
+ - Allows using inline assembly.
* - ``SPV_INTEL_optnone``
- Adds OptNoneINTEL value for Function Control mask that indicates a request to not optimize the function.
* - ``SPV_INTEL_subgroups``
@@ -161,6 +163,8 @@ list of supported SPIR-V extensions, sorted alphabetically by their extension na
- Allows to use the LinkOnceODR linkage type that lets a function or global variable to be merged with other functions or global variables of the same name when linkage occurs.
* - ``SPV_KHR_no_integer_wrap_decoration``
- Adds decorations to indicate that a given instruction does not cause integer wrapping.
+ * - ``SPV_KHR_shader_clock``
+ - Adds the cl_khr_kernel_clock extension, which allows a kernel to sample the value from clocks provided by compute units.
* - ``SPV_KHR_subgroup_rotate``
- Adds a new instruction that enables rotating values across invocations within a subgroup.
* - ``SPV_KHR_uniform_group_instructions``
@@ -333,6 +337,10 @@ SPIR-V backend, along with their descriptions and argument details.
- 32-bit Integer
- `[]`
- Generates an undefined value. Useful for optimizations and indicating uninitialized variables.
+ * - `int_spv_inline_asm`
+ - None
+ - `[Metadata, Metadata, Vararg]`
+ - Associates inline assembly features with inline assembly call instances by creating metadata and preserving the original arguments. Not emitted directly but used to support SPIR-V representation in LLVM IR.
* - `int_spv_assume`
- None
- `[1-bit Integer]`
diff --git a/llvm/include/llvm/Analysis/CFG.h b/llvm/include/llvm/Analysis/CFG.h
index 86b01c1..23bc10a 100644
--- a/llvm/include/llvm/Analysis/CFG.h
+++ b/llvm/include/llvm/Analysis/CFG.h
@@ -96,6 +96,18 @@ bool isPotentiallyReachableFromMany(
const SmallPtrSetImpl<BasicBlock *> *ExclusionSet,
const DominatorTree *DT = nullptr, const LoopInfo *LI = nullptr);
+/// Determine whether there is potentially a path from at least one block in
+/// 'Worklist' to at least one block in 'StopSet' within a single function
+/// without passing through any of the blocks in 'ExclusionSet'. Returns false
+/// only if we can prove that once any block in 'Worklist' has been reached then
+/// no blocks in 'StopSet' can be executed without passing through any blocks in
+/// 'ExclusionSet'. Conservatively returns true.
+bool isManyPotentiallyReachableFromMany(
+ SmallVectorImpl<BasicBlock *> &Worklist,
+ const SmallPtrSetImpl<const BasicBlock *> &StopSet,
+ const SmallPtrSetImpl<BasicBlock *> *ExclusionSet,
+ const DominatorTree *DT = nullptr, const LoopInfo *LI = nullptr);
+
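A minimal usage sketch of the new query, assuming the caller already has its source and sink blocks and, optionally, a DominatorTree and LoopInfo; the wrapper function and its name below are illustrative only, not part of this change:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/CFG.h"
using namespace llvm;

// Returns true if any block in Sources may reach any block in Sinks.
static bool anySourceMayReachAnySink(ArrayRef<BasicBlock *> Sources,
                                     ArrayRef<BasicBlock *> Sinks,
                                     const DominatorTree *DT = nullptr,
                                     const LoopInfo *LI = nullptr) {
  SmallVector<BasicBlock *> Worklist(Sources.begin(), Sources.end());
  SmallPtrSet<const BasicBlock *, 8> StopSet(Sinks.begin(), Sinks.end());
  // No exclusion set here; pass a real set to forbid blocks on the path.
  return isManyPotentiallyReachableFromMany(Worklist, StopSet,
                                            /*ExclusionSet=*/nullptr, DT, LI);
}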
/// Return true if the control flow in \p RPOTraversal is irreducible.
///
/// This is a generic implementation to detect CFG irreducibility based on loop
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 0c3a6b3..cefce93 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1397,7 +1397,7 @@ public:
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
int VF,
const APInt &DemandedDstElts,
- TTI::TargetCostKind CostKind);
+ TTI::TargetCostKind CostKind) const;
/// \return The cost of Load and Store instructions.
InstructionCost
diff --git a/llvm/include/llvm/Analysis/VecFuncs.def b/llvm/include/llvm/Analysis/VecFuncs.def
index 10f1333..e12eb70 100644
--- a/llvm/include/llvm/Analysis/VecFuncs.def
+++ b/llvm/include/llvm/Analysis/VecFuncs.def
@@ -12,11 +12,6 @@
// This .def file also allows creating an array of vector functions supported in
// the specified framework or library.
-#if defined(TLI_DEFINE_MASSV_VECFUNCS_NAMES)
-#define TLI_DEFINE_MASSV_VECFUNCS
-#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, VABI_PREFIX) VEC,
-#endif
-
#define FIXED(NL) ElementCount::getFixed(NL)
#define SCALABLE(NL) ElementCount::getScalable(NL)
#define NOMASK false
@@ -1276,14 +1271,3 @@ TLI_DEFINE_VECFUNC("cbrtf", "amd_vrs4_cbrtf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
#undef FIXED
#undef TLI_DEFINE_VECFUNC
-#undef TLI_DEFINE_ACCELERATE_VECFUNCS
-#undef TLI_DEFINE_DARWIN_LIBSYSTEM_M_VECFUNCS
-#undef TLI_DEFINE_LIBMVEC_X86_VECFUNCS
-#undef TLI_DEFINE_MASSV_VECFUNCS
-#undef TLI_DEFINE_SVML_VECFUNCS
-#undef TLI_DEFINE_SLEEFGNUABI_VF2_VECFUNCS
-#undef TLI_DEFINE_SLEEFGNUABI_VF4_VECFUNCS
-#undef TLI_DEFINE_SLEEFGNUABI_SCALABLE_VECFUNCS
-#undef TLI_DEFINE_MASSV_VECFUNCS_NAMES
-#undef TLI_DEFINE_ARMPL_VECFUNCS
-#undef TLI_DEFINE_AMDLIBM_VECFUNCS
diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h
index f296acc..67cacae 100644
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -796,11 +796,13 @@ enum : unsigned {
EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC = 0x053,
EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC = 0x054,
EF_AMDGPU_MACH_AMDGCN_RESERVED_0X55 = 0x055,
+ EF_AMDGPU_MACH_AMDGCN_RESERVED_0X56 = 0x056,
+ EF_AMDGPU_MACH_AMDGCN_RESERVED_0X57 = 0x057,
// clang-format on
// First/last AMDGCN-based processors.
EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600,
- EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC,
+ EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_RESERVED_0X57,
// Indicates if the "xnack" target feature is enabled for all code contained
// in the object.
diff --git a/llvm/include/llvm/Bitcode/BitcodeWriter.h b/llvm/include/llvm/Bitcode/BitcodeWriter.h
index 248d33f..a343f0e 100644
--- a/llvm/include/llvm/Bitcode/BitcodeWriter.h
+++ b/llvm/include/llvm/Bitcode/BitcodeWriter.h
@@ -102,7 +102,8 @@ class raw_ostream;
void writeIndex(
const ModuleSummaryIndex *Index,
- const std::map<std::string, GVSummaryMapTy> *ModuleToSummariesForIndex);
+ const std::map<std::string, GVSummaryMapTy> *ModuleToSummariesForIndex,
+ const GVSummaryPtrSet *DecSummaries);
};
/// Write the specified module to the specified raw output stream.
@@ -147,10 +148,12 @@ class raw_ostream;
/// where it will be written in a new bitcode block. This is used when
/// writing the combined index file for ThinLTO. When writing a subset of the
/// index for a distributed backend, provide the \p ModuleToSummariesForIndex
- /// map.
+ /// map. \p DecSummaries specifies the set of summaries for which the
+ /// corresponding value should be imported as a declaration (prototype).
void writeIndexToFile(const ModuleSummaryIndex &Index, raw_ostream &Out,
const std::map<std::string, GVSummaryMapTy>
- *ModuleToSummariesForIndex = nullptr);
+ *ModuleToSummariesForIndex = nullptr,
+ const GVSummaryPtrSet *DecSummaries = nullptr);
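As a hedged illustration of the extended API, a caller in a distributed-ThinLTO style flow might look like the sketch below; the wrapper and its parameter names are invented, only the writeIndexToFile signature comes from this change:

#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Support/raw_ostream.h"
#include <map>
#include <string>
using namespace llvm;

// Emit a per-backend index; summaries listed in DeclSummaries are imported
// as declarations (prototypes) rather than definitions.
static void writeShardIndex(
    const ModuleSummaryIndex &Index, raw_ostream &OS,
    const std::map<std::string, GVSummaryMapTy> &ModuleToSummaries,
    const GVSummaryPtrSet &DeclSummaries) {
  writeIndexToFile(Index, OS, &ModuleToSummaries, &DeclSummaries);
}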
/// If EmbedBitcode is set, save a copy of the llvm IR as data in the
/// __LLVM,__bitcode section (.llvmbc on non-MacOS).
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index a9a33c7..2111e82 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -869,6 +869,9 @@ public:
/// Combine insert vector element OOB.
bool matchInsertVectorElementOOB(MachineInstr &MI, BuildFnTy &MatchInfo);
+ bool matchFreezeOfSingleMaybePoisonOperand(MachineInstr &MI,
+ BuildFnTy &MatchInfo);
+
private:
/// Checks for legality of an indexed variant of \p LdSt.
bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
index 2a3145b..2b3efc3 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
@@ -34,6 +34,17 @@ public:
static bool classof(const MachineInstr *MI) {
return isPreISelGenericOpcode(MI->getOpcode());
}
+
+ bool hasPoisonGeneratingFlags() const {
+ return getFlags() & (NoUWrap | NoSWrap | IsExact | Disjoint | NonNeg |
+ FmNoNans | FmNoInfs);
+ }
+
+ void dropPoisonGeneratingFlags() {
+ clearFlags(NoUWrap | NoSWrap | IsExact | Disjoint | NonNeg | FmNoNans |
+ FmNoInfs);
+ assert(!hasPoisonGeneratingFlags());
+ }
};
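A rough sketch of how a GlobalISel combine might use these helpers before hoisting a freeze past an instruction; the helper function is illustrative:

#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
using namespace llvm;

// Strip nuw/nsw/exact/disjoint/nneg/nnan/ninf so the instruction can no
// longer introduce poison, e.g. before moving a G_FREEZE past it.
static void makePoisonSafe(GenericMachineInstr &GMI) {
  if (GMI.hasPoisonGeneratingFlags())
    GMI.dropPoisonGeneratingFlags();
}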
/// Provides common memory operand functionality.
diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h
index 2b0c5d1..db48a0a 100644
--- a/llvm/include/llvm/CodeGen/MachineInstr.h
+++ b/llvm/include/llvm/CodeGen/MachineInstr.h
@@ -416,6 +416,12 @@ public:
Flags &= ~((uint32_t)Flag);
}
+ void clearFlags(unsigned flags) {
+ assert(isUInt<LLVM_MI_FLAGS_BITS>(flags) &&
+ "flags to be cleared are out of range for the Flags field");
+ Flags &= ~flags;
+ }
+
/// Return true if MI is in a bundle (but not the first MI in a bundle).
///
/// A bundle looks like this before it's finalized:
diff --git a/llvm/include/llvm/CodeGen/ValueTypes.h b/llvm/include/llvm/CodeGen/ValueTypes.h
index b66c66d..dab6c42 100644
--- a/llvm/include/llvm/CodeGen/ValueTypes.h
+++ b/llvm/include/llvm/CodeGen/ValueTypes.h
@@ -488,8 +488,10 @@ namespace llvm {
Type *getTypeForEVT(LLVMContext &Context) const;
/// Return the value type corresponding to the specified type.
- /// This returns all pointers as iPTR. If HandleUnknown is true, unknown
- /// types are returned as Other, otherwise they are invalid.
+ /// If HandleUnknown is true, unknown types are returned as Other,
+ /// otherwise they are invalid.
+ /// NB: This includes pointer types, which require a DataLayout to convert
+ /// to a concrete value type.
static EVT getEVT(Type *Ty, bool HandleUnknown = false);
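For instance (a hedged sketch; resolving a pointer type to a fixed-width integer type additionally needs a DataLayout, typically via the target lowering's getValueType):

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Type.h"
using namespace llvm;

static EVT lookupEVT(Type *Ty) {
  // Unknown types come back as MVT::Other because HandleUnknown is set;
  // pointer types still need a DataLayout to pick a concrete width.
  return EVT::getEVT(Ty, /*HandleUnknown=*/true);
}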
intptr_t getRawBits() const {
diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td
index 900b30d..c3e378e 100644
--- a/llvm/include/llvm/CodeGen/ValueTypes.td
+++ b/llvm/include/llvm/CodeGen/ValueTypes.td
@@ -296,18 +296,23 @@ def MetadataVT : ValueType<0, 249> { // Metadata
def iPTRAny : VTAny<250>;
// Pseudo valuetype to represent "vector of any size"
+// Should only be used in TableGen.
def vAny : VTAny<251>;
// Pseudo valuetype to represent "float of any format"
+// Should only be used in TableGen.
def fAny : VTAny<252>;
// Pseudo valuetype to represent "integer of any bit width"
+// Should only be used in TableGen.
def iAny : VTAny<253>;
// Pseudo valuetype mapped to the current pointer size.
+// Should only be used in TableGen.
def iPTR : ValueType<0, 254>;
// Pseudo valuetype to represent "any type of any size".
+// Should only be used in TableGen.
def Any : VTAny<255>;
} // end defset ValueTypes
diff --git a/llvm/include/llvm/CodeGenTypes/MachineValueType.h b/llvm/include/llvm/CodeGenTypes/MachineValueType.h
index 9aceb98..3b2a9b5 100644
--- a/llvm/include/llvm/CodeGenTypes/MachineValueType.h
+++ b/llvm/include/llvm/CodeGenTypes/MachineValueType.h
@@ -476,9 +476,11 @@ namespace llvm {
return getVectorVT(VT, EC.getKnownMinValue());
}
- /// Return the value type corresponding to the specified type. This returns
- /// all pointers as iPTR. If HandleUnknown is true, unknown types are
- /// returned as Other, otherwise they are invalid.
+ /// Return the value type corresponding to the specified type.
+ /// If HandleUnknown is true, unknown types are returned as Other,
+ /// otherwise they are invalid.
+ /// NB: This includes pointer types, which require a DataLayout to convert
+ /// to a concrete value type.
static MVT getVT(Type *Ty, bool HandleUnknown = false);
public:
diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake
index 977c182..ff30741 100644
--- a/llvm/include/llvm/Config/config.h.cmake
+++ b/llvm/include/llvm/Config/config.h.cmake
@@ -209,9 +209,6 @@
/* Define to 1 if you have the <sys/types.h> header file. */
#cmakedefine HAVE_SYS_TYPES_H ${HAVE_SYS_TYPES_H}
-/* Define if the setupterm() function is supported this platform. */
-#cmakedefine LLVM_ENABLE_TERMINFO ${LLVM_ENABLE_TERMINFO}
-
/* Define to 1 if you have the <termios.h> header file. */
#cmakedefine HAVE_TERMIOS_H ${HAVE_TERMIOS_H}
diff --git a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
index 13a3726..5a3f8c6 100644
--- a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
+++ b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
@@ -19,7 +19,7 @@
// - EmptyTrait: the class has no data members.
// - WrapperTrait: the class has a single member `v`
// - TupleTrait: the class has a tuple member `t`
-// - UnionTrait the class has a varuant member `u`
+// - UnionTrait the class has a variant member `u`
// - IncompleteTrait: the class is a placeholder class that is currently empty,
// but will be completed at a later time.
// Note: This structure follows the one used in flang parser.
diff --git a/llvm/include/llvm/IR/ConstantRange.h b/llvm/include/llvm/IR/ConstantRange.h
index e718e6e..a5e2f80 100644
--- a/llvm/include/llvm/IR/ConstantRange.h
+++ b/llvm/include/llvm/IR/ConstantRange.h
@@ -419,6 +419,15 @@ public:
/// treating both this and \p Other as unsigned ranges.
ConstantRange multiply(const ConstantRange &Other) const;
+ /// Return a new range representing the possible values resulting
+ /// from a multiplication with wrap type \p NoWrapKind of a value in this
+ /// range and a value in \p Other.
+ /// If the result range is disjoint, the preferred range is determined by the
+ /// \p PreferredRangeType.
+ ConstantRange
+ multiplyWithNoWrap(const ConstantRange &Other, unsigned NoWrapKind,
+ PreferredRangeType RangeType = Smallest) const;
+
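A hedged sketch of the new entry point, assuming NoWrapKind takes the usual OverflowingBinaryOperator flags as with addWithNoWrap; the wrapper below is illustrative:

#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/InstrTypes.h"
using namespace llvm;

// Range of a product that is known not to wrap in the signed sense (nsw).
static ConstantRange nswProductRange(const ConstantRange &A,
                                     const ConstantRange &B) {
  return A.multiplyWithNoWrap(B, OverflowingBinaryOperator::NoSignedWrap);
}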
/// Return range of possible values for a signed multiplication of this and
/// \p Other. However, if overflow is possible always return a full range
/// rather than trying to determine a more precise result.
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index 0d87463..40a9cf50 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -455,7 +455,7 @@ public:
/// block.
GlobalVariable *CreateGlobalString(StringRef Str, const Twine &Name = "",
unsigned AddressSpace = 0,
- Module *M = nullptr);
+ Module *M = nullptr, bool AddNull = true);
/// Get a constant value representing either true or false.
ConstantInt *getInt1(bool V) {
@@ -1992,8 +1992,9 @@ public:
/// block.
Constant *CreateGlobalStringPtr(StringRef Str, const Twine &Name = "",
unsigned AddressSpace = 0,
- Module *M = nullptr) {
- GlobalVariable *GV = CreateGlobalString(Str, Name, AddressSpace, M);
+ Module *M = nullptr, bool AddNull = true) {
+ GlobalVariable *GV =
+ CreateGlobalString(Str, Name, AddressSpace, M, AddNull);
Constant *Zero = ConstantInt::get(Type::getInt32Ty(Context), 0);
Constant *Indices[] = {Zero, Zero};
return ConstantExpr::getInBoundsGetElementPtr(GV->getValueType(), GV,
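For reference, a hedged sketch of the new AddNull parameter in use; the function and string contents below are made up:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
using namespace llvm;

static void emitStrings(Module &M) {
  IRBuilder<> B(M.getContext());
  // Default: a trailing NUL byte is appended, as before this change.
  B.CreateGlobalString("hello", "str.z", /*AddressSpace=*/0, &M);
  // New: AddNull=false emits exactly the given bytes, with no terminator.
  B.CreateGlobalString("hello", "str.raw", /*AddressSpace=*/0, &M,
                       /*AddNull=*/false);
}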
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index be8048c..d4a8954 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2466,25 +2466,27 @@ def int_amdgcn_perm :
// GFX9 Intrinsics
//===----------------------------------------------------------------------===//
-class AMDGPUGlobalLoadLDS : Intrinsic <
- [],
- [LLVMQualPointerType<1>, // Base global pointer to load from
- LLVMQualPointerType<3>, // LDS base pointer to store to
- llvm_i32_ty, // Data byte size: 1/2/4
- llvm_i32_ty, // imm offset (applied to both global and LDS address)
- llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc/sc0,
- // bit 1 = slc/sc1,
- // bit 2 = dlc on gfx10/gfx11))
- // bit 4 = scc/nt on gfx90a+))
- // gfx12+:
- // cachepolicy (bits [0-2] = th,
- // bits [3-4] = scope)
- // swizzled buffer (bit 6 = swz),
- [IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
- ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree],
- "", [SDNPMemOperand]>;
+class AMDGPUGlobalLoadLDS :
+ ClangBuiltin<"__builtin_amdgcn_global_load_lds">,
+ Intrinsic <
+ [],
+ [LLVMQualPointerType<1>, // Base global pointer to load from
+ LLVMQualPointerType<3>, // LDS base pointer to store to
+ llvm_i32_ty, // Data byte size: 1/2/4
+ llvm_i32_ty, // imm offset (applied to both global and LDS address)
+ llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = sc0,
+ // bit 1 = sc1,
+ // bit 4 = scc))
+ [IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
+ ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree],
+ "", [SDNPMemOperand]>;
def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS;
+// Use read/write of inaccessible memory to model the fact that this reads a
+// volatile value.
+def int_amdgcn_pops_exiting_wave_id :
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrInaccessibleMemOnly]>;
+
//===----------------------------------------------------------------------===//
// GFX10 Intrinsics
//===----------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
index cc84dec..90f1267 100644
--- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td
+++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
@@ -36,6 +36,7 @@ let TargetPrefix = "spv" in {
def int_spv_alloca : Intrinsic<[llvm_any_ty], []>;
def int_spv_alloca_array : Intrinsic<[llvm_any_ty], [llvm_anyint_ty]>;
def int_spv_undef : Intrinsic<[llvm_i32_ty], []>;
+ def int_spv_inline_asm : Intrinsic<[], [llvm_metadata_ty, llvm_metadata_ty, llvm_vararg_ty]>;
// Expect, Assume Intrinsics
def int_spv_assume : Intrinsic<[], [llvm_i1_ty]>;
diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
index 572d334..237f268 100644
--- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
+++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
@@ -337,6 +337,14 @@ def int_wasm_storef16_f32:
[llvm_float_ty, llvm_ptr_ty],
[IntrWriteMem, IntrArgMemOnly],
"", [SDNPMemOperand]>;
+def int_wasm_splat_f16x8:
+ DefaultAttrsIntrinsic<[llvm_v8f16_ty],
+ [llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable]>;
+def int_wasm_extract_lane_f16x8:
+ DefaultAttrsIntrinsic<[llvm_float_ty],
+ [llvm_v8f16_ty, llvm_i32_ty],
+ [IntrNoMem, IntrSpeculatable]>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index fdc2b0f..aee8040 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -3843,58 +3843,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
DefaultAttrsIntrinsic<[llvm_v16f32_ty],
[llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty],
[IntrNoMem]>;
-
- def int_x86_avx512_rcp28_ps : ClangBuiltin<"__builtin_ia32_rcp28ps_mask">,
- DefaultAttrsIntrinsic<[llvm_v16f32_ty],
- [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
- llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<3>>]>;
- def int_x86_avx512_rcp28_pd : ClangBuiltin<"__builtin_ia32_rcp28pd_mask">,
- DefaultAttrsIntrinsic<[llvm_v8f64_ty],
- [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
- llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<3>>]>;
- def int_x86_avx512_exp2_ps : ClangBuiltin<"__builtin_ia32_exp2ps_mask">,
- DefaultAttrsIntrinsic<[llvm_v16f32_ty],
- [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
- llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<3>>]>;
- def int_x86_avx512_exp2_pd : ClangBuiltin<"__builtin_ia32_exp2pd_mask">,
- DefaultAttrsIntrinsic<[llvm_v8f64_ty],
- [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
- llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<3>>]>;
-
- def int_x86_avx512_rcp28_ss : ClangBuiltin<"__builtin_ia32_rcp28ss_round_mask">,
- DefaultAttrsIntrinsic<[llvm_v4f32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,
- llvm_i8_ty, llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<4>>]>;
- def int_x86_avx512_rcp28_sd : ClangBuiltin<"__builtin_ia32_rcp28sd_round_mask">,
- DefaultAttrsIntrinsic<[llvm_v2f64_ty],
- [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty,
- llvm_i8_ty, llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<4>>]>;
- def int_x86_avx512_rsqrt28_ps : ClangBuiltin<"__builtin_ia32_rsqrt28ps_mask">,
- DefaultAttrsIntrinsic<[llvm_v16f32_ty],
- [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
- llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<3>>]>;
- def int_x86_avx512_rsqrt28_pd : ClangBuiltin<"__builtin_ia32_rsqrt28pd_mask">,
- DefaultAttrsIntrinsic<[llvm_v8f64_ty],
- [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
- llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<3>>]>;
- def int_x86_avx512_rsqrt28_ss : ClangBuiltin<"__builtin_ia32_rsqrt28ss_round_mask">,
- DefaultAttrsIntrinsic<[llvm_v4f32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,
- llvm_i8_ty, llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<4>>]>;
- def int_x86_avx512_rsqrt28_sd : ClangBuiltin<"__builtin_ia32_rsqrt28sd_round_mask">,
- DefaultAttrsIntrinsic<[llvm_v2f64_ty],
- [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty,
- llvm_i8_ty, llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<4>>]>;
def int_x86_avx512_psad_bw_512 : ClangBuiltin<"__builtin_ia32_psadbw512">,
DefaultAttrsIntrinsic<[llvm_v8i64_ty], [llvm_v64i8_ty, llvm_v64i8_ty],
[IntrNoMem, Commutative]>;
@@ -4177,38 +4125,6 @@ let TargetPrefix = "x86" in {
Intrinsic<[],
[llvm_ptr_ty, llvm_i8_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty],
[ImmArg<ArgIndex<4>>]>;
-
- // gather prefetch
- // NOTE: These can't be ArgMemOnly because you can put the address completely
- // in the index register.
- def int_x86_avx512_gatherpf_dpd_512 : ClangBuiltin<"__builtin_ia32_gatherpfdpd">,
- Intrinsic<[], [llvm_i8_ty, llvm_v8i32_ty, llvm_ptr_ty,
- llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
- def int_x86_avx512_gatherpf_dps_512 : ClangBuiltin<"__builtin_ia32_gatherpfdps">,
- Intrinsic<[], [llvm_i16_ty, llvm_v16i32_ty, llvm_ptr_ty,
- llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
- def int_x86_avx512_gatherpf_qpd_512 : ClangBuiltin<"__builtin_ia32_gatherpfqpd">,
- Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty,
- llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
- def int_x86_avx512_gatherpf_qps_512 : ClangBuiltin<"__builtin_ia32_gatherpfqps">,
- Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty,
- llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
-
- // scatter prefetch
- // NOTE: These can't be ArgMemOnly because you can put the address completely
- // in the index register.
- def int_x86_avx512_scatterpf_dpd_512 : ClangBuiltin<"__builtin_ia32_scatterpfdpd">,
- Intrinsic<[], [llvm_i8_ty, llvm_v8i32_ty, llvm_ptr_ty,
- llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
- def int_x86_avx512_scatterpf_dps_512 : ClangBuiltin<"__builtin_ia32_scatterpfdps">,
- Intrinsic<[], [llvm_i16_ty, llvm_v16i32_ty, llvm_ptr_ty,
- llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
- def int_x86_avx512_scatterpf_qpd_512 : ClangBuiltin<"__builtin_ia32_scatterpfqpd">,
- Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty,
- llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
- def int_x86_avx512_scatterpf_qps_512 : ClangBuiltin<"__builtin_ia32_scatterpfqps">,
- Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty,
- llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
}
// AVX512 gather/scatter intrinsics that use vXi1 masks.
diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def
index 20f5bb2..8eced07 100644
--- a/llvm/include/llvm/IR/VPIntrinsics.def
+++ b/llvm/include/llvm/IR/VPIntrinsics.def
@@ -174,10 +174,10 @@ HELPER_REGISTER_BINARY_INT_VP(vp_add, VP_ADD, Add, ADD)
HELPER_REGISTER_BINARY_INT_VP(vp_and, VP_AND, And, AND)
// llvm.vp.ashr(x,y,mask,vlen)
-HELPER_REGISTER_BINARY_INT_VP(vp_ashr, VP_ASHR, AShr, SRA)
+HELPER_REGISTER_BINARY_INT_VP(vp_ashr, VP_SRA, AShr, SRA)
// llvm.vp.lshr(x,y,mask,vlen)
-HELPER_REGISTER_BINARY_INT_VP(vp_lshr, VP_LSHR, LShr, SRL)
+HELPER_REGISTER_BINARY_INT_VP(vp_lshr, VP_SRL, LShr, SRL)
// llvm.vp.mul(x,y,mask,vlen)
HELPER_REGISTER_BINARY_INT_VP(vp_mul, VP_MUL, Mul, MUL)
diff --git a/llvm/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h b/llvm/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h
index c450acd..f1337e8 100644
--- a/llvm/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h
+++ b/llvm/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h
@@ -271,12 +271,13 @@ public:
const lto::InputFile &File);
/**
- * Compute the list of summaries needed for importing into module.
+ * Compute the list of summaries and the subset of declaration summaries
+ * needed for importing into module.
*/
void gatherImportedSummariesForModule(
Module &Module, ModuleSummaryIndex &Index,
std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex,
- const lto::InputFile &File);
+ GVSummaryPtrSet &DecSummaries, const lto::InputFile &File);
/**
* Perform internalization. Index is updated to reflect linkage changes.
diff --git a/llvm/include/llvm/MCA/InstrBuilder.h b/llvm/include/llvm/MCA/InstrBuilder.h
index 3594372..00c7942 100644
--- a/llvm/include/llvm/MCA/InstrBuilder.h
+++ b/llvm/include/llvm/MCA/InstrBuilder.h
@@ -78,6 +78,7 @@ class InstrBuilder {
bool FirstCallInst;
bool FirstReturnInst;
+ unsigned CallLatency;
using InstRecycleCallback = std::function<Instruction *(const InstrDesc &)>;
InstRecycleCallback InstRecycleCB;
@@ -98,7 +99,7 @@ class InstrBuilder {
public:
InstrBuilder(const MCSubtargetInfo &STI, const MCInstrInfo &MCII,
const MCRegisterInfo &RI, const MCInstrAnalysis *IA,
- const InstrumentManager &IM);
+ const InstrumentManager &IM, unsigned CallLatency);
void clear() {
Descriptors.clear();
diff --git a/llvm/include/llvm/Object/ObjectFile.h b/llvm/include/llvm/Object/ObjectFile.h
index 8c868c7..f49763e 100644
--- a/llvm/include/llvm/Object/ObjectFile.h
+++ b/llvm/include/llvm/Object/ObjectFile.h
@@ -302,6 +302,7 @@ protected:
public:
ObjectFile() = delete;
ObjectFile(const ObjectFile &other) = delete;
+ ObjectFile &operator=(const ObjectFile &other) = delete;
uint64_t getCommonSymbolSize(DataRefImpl Symb) const {
Expected<uint32_t> SymbolFlagsOrErr = getSymbolFlags(Symb);
diff --git a/llvm/include/llvm/Option/ArgList.h b/llvm/include/llvm/Option/ArgList.h
index fcde68e..09812f9 100644
--- a/llvm/include/llvm/Option/ArgList.h
+++ b/llvm/include/llvm/Option/ArgList.h
@@ -319,11 +319,15 @@ public:
}
/// Render only the last argument match \p Id0, if present.
- template<typename ...OptSpecifiers>
- void AddLastArg(ArgStringList &Output, OptSpecifiers ...Ids) const {
+ template <typename... OptSpecifiers>
+ void addLastArg(ArgStringList &Output, OptSpecifiers... Ids) const {
if (Arg *A = getLastArg(Ids...)) // Calls claim() on all Ids's Args.
A->render(*this, Output);
}
+ template <typename... OptSpecifiers>
+ void AddLastArg(ArgStringList &Output, OptSpecifiers... Ids) const {
+ addLastArg(Output, Ids...);
+ }
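A small hedged sketch of a driver-style caller; OPT_example stands in for a real option id from an Options.inc table and is not part of this change:

#include "llvm/Option/ArgList.h"
using namespace llvm;
using namespace llvm::opt;

// Forward the last occurrence of an option to the command line, if present.
static void forwardLastArg(const ArgList &Args, ArgStringList &CmdArgs,
                           OptSpecifier OPT_example) {
  // New lowerCamelCase spelling; AddLastArg remains as a forwarding alias.
  Args.addLastArg(CmdArgs, OPT_example);
}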
/// AddAllArgsExcept - Render all arguments matching any of the given ids
/// and not matching any of the excluded ids.
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index 88c7fe4..2cee928 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -385,8 +385,9 @@ struct TemporalProfTraceTy {
/// Use a set of temporal profile traces to create a list of balanced
/// partitioning function nodes used by BalancedPartitioning to generate a
/// function order that reduces page faults during startup
- static std::vector<BPFunctionNode>
- createBPFunctionNodes(ArrayRef<TemporalProfTraceTy> Traces);
+ static void createBPFunctionNodes(ArrayRef<TemporalProfTraceTy> Traces,
+ std::vector<BPFunctionNode> &Nodes,
+ bool RemoveOutlierUNs = true);
};
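Roughly how a caller adapts to the new out-parameter form (a sketch; the wrapper name is invented):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ProfileData/InstrProf.h"
#include "llvm/Support/BalancedPartitioning.h"
#include <vector>
using namespace llvm;

static std::vector<BPFunctionNode>
orderNodes(ArrayRef<TemporalProfTraceTy> Traces) {
  std::vector<BPFunctionNode> Nodes;
  // Nodes are now filled in place; outlier utility nodes are dropped by
  // default via RemoveOutlierUNs.
  TemporalProfTraceTy::createBPFunctionNodes(Traces, Nodes,
                                             /*RemoveOutlierUNs=*/true);
  return Nodes;
}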
inline std::error_code make_error_code(instrprof_error E) {
@@ -1184,35 +1185,32 @@ inline uint64_t ComputeHash(StringRef K) { return ComputeHash(HashType, K); }
// data file in indexed-format. Please update llvm/docs/InstrProfileFormat.rst
// as appropriate when updating the indexed profile format.
struct Header {
- uint64_t Magic;
+ uint64_t Magic = IndexedInstrProf::Magic;
// The lower 32 bits specify the version of the indexed profile.
// The most significant 32 bits are reserved to specify the variant types of
// the profile.
- uint64_t Version;
- uint64_t Unused; // Becomes unused since version 4
- uint64_t HashType;
+ uint64_t Version = 0;
+ uint64_t Unused = 0; // Becomes unused since version 4
+ uint64_t HashType = static_cast<uint64_t>(IndexedInstrProf::HashType);
// This field records the offset of this hash table's metadata (i.e., the
// number of buckets and entries), which follows right after the payload of
// the entire hash table.
- uint64_t HashOffset;
- uint64_t MemProfOffset;
- uint64_t BinaryIdOffset;
- uint64_t TemporalProfTracesOffset;
- uint64_t VTableNamesOffset;
+ uint64_t HashOffset = 0;
+ uint64_t MemProfOffset = 0;
+ uint64_t BinaryIdOffset = 0;
+ uint64_t TemporalProfTracesOffset = 0;
+ uint64_t VTableNamesOffset = 0;
// New fields should only be added at the end to ensure that the size
// computation is correct. The methods below need to be updated to ensure that
// the new field is read correctly.
- // Reads a header struct from the buffer.
+ // Reads a header struct from the buffer. Header fields are in machine native
+ // endianness.
static Expected<Header> readFromBuffer(const unsigned char *Buffer);
// Returns the size of the header in bytes for all valid fields based on the
// version. I.e a older version header will return a smaller size.
size_t size() const;
-
- // Returns the format version in little endian. The header retains the version
- // in native endian of the compiler runtime.
- uint64_t formatVersion() const;
};
// Profile summary data recorded in the profile data file in indexed
diff --git a/llvm/include/llvm/ProfileData/InstrProfReader.h b/llvm/include/llvm/ProfileData/InstrProfReader.h
index 9b35768..46aa1b6 100644
--- a/llvm/include/llvm/ProfileData/InstrProfReader.h
+++ b/llvm/include/llvm/ProfileData/InstrProfReader.h
@@ -649,6 +649,8 @@ public:
class IndexedMemProfReader {
private:
+ /// The MemProf version.
+ memprof::IndexedVersion Version = memprof::Version0;
/// MemProf profile schema (if available).
memprof::MemProfSchema Schema;
/// MemProf record profile data on-disk indexed via llvm::md5(FunctionName).
diff --git a/llvm/include/llvm/ProfileData/InstrProfWriter.h b/llvm/include/llvm/ProfileData/InstrProfWriter.h
index 97f6a95..b8b6c68 100644
--- a/llvm/include/llvm/ProfileData/InstrProfWriter.h
+++ b/llvm/include/llvm/ProfileData/InstrProfWriter.h
@@ -218,6 +218,9 @@ private:
// back patching.
uint64_t writeHeader(const IndexedInstrProf::Header &header,
const bool WritePrevVersion, ProfOStream &OS);
+
+ // Writes compressed vtable names to profiles.
+ Error writeVTableNames(ProfOStream &OS);
};
} // end namespace llvm
diff --git a/llvm/include/llvm/Support/Error.h b/llvm/include/llvm/Support/Error.h
index 217130c..662c3ea4 100644
--- a/llvm/include/llvm/Support/Error.h
+++ b/llvm/include/llvm/Support/Error.h
@@ -1236,10 +1236,10 @@ class StringError : public ErrorInfo<StringError> {
public:
static char ID;
- // Prints EC + S and converts to EC
+ StringError(std::string &&S, std::error_code EC, bool PrintMsgOnly);
+ /// Prints EC + S and converts to EC.
StringError(std::error_code EC, const Twine &S = Twine());
-
- // Prints S and converts to EC
+ /// Prints S and converts to EC.
StringError(const Twine &S, std::error_code EC);
void log(raw_ostream &OS) const override;
@@ -1258,15 +1258,23 @@ template <typename... Ts>
inline Error createStringError(std::error_code EC, char const *Fmt,
const Ts &... Vals) {
std::string Buffer;
- raw_string_ostream Stream(Buffer);
- Stream << format(Fmt, Vals...);
- return make_error<StringError>(Stream.str(), EC);
+ raw_string_ostream(Buffer) << format(Fmt, Vals...);
+ return make_error<StringError>(Buffer, EC);
}
-Error createStringError(std::error_code EC, char const *Msg);
+Error createStringError(std::string &&Msg, std::error_code EC);
+
+inline Error createStringError(std::error_code EC, const char *S) {
+ return createStringError(std::string(S), EC);
+}
inline Error createStringError(std::error_code EC, const Twine &S) {
- return createStringError(EC, S.str().c_str());
+ return createStringError(S.str(), EC);
+}
+
+/// Create a StringError with an inconvertible error code.
+inline Error createStringError(const Twine &S) {
+ return createStringError(llvm::inconvertibleErrorCode(), S);
}
template <typename... Ts>
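A hedged sketch of the updated overloads from a caller's perspective; the function and message text are invented:

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Error.h"
#include <system_error>
using namespace llvm;

static Error openConfig(StringRef Path) {
  if (Path.empty())
    // New convenience overload: uses an inconvertible error code.
    return createStringError("no config file given");
  // Existing printf-style overload, converted to the given error code.
  return createStringError(std::make_error_code(std::errc::io_error),
                           "cannot open '%s'", Path.str().c_str());
}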
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 5d4b5a2..8012f91 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -220,6 +220,13 @@ def idempotent_prop : GICombineRule<
(match (idempotent_prop_frags $dst, $src)),
(apply (GIReplaceReg $dst, $src))>;
+// Convert freeze(Op(Op0, NonPoisonOps...)) to Op(freeze(Op0), NonPoisonOps...)
+// when Op0 is not guaranteed non-poison
+def push_freeze_to_prevent_poison_from_propagating : GICombineRule<
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (G_FREEZE $dst, $src):$root,
+ [{ return !isGuaranteedNotToBePoison(${src}.getReg(), MRI) && Helper.matchFreezeOfSingleMaybePoisonOperand(*${root}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
def extending_loads : GICombineRule<
(defs root:$root, extending_load_matchdata:$matchinfo),
@@ -1634,6 +1641,78 @@ extract_vector_element_shuffle_vector,
insert_vector_element_extract_vector_element
]>;
+
+// fold ((0-A) + B) -> B-A
+def ZeroMinusAPlusB : GICombineRule<
+ (defs root:$root),
+ (match (G_SUB $sub, 0, $A),
+ (G_ADD $root, $sub, $B)),
+ (apply (G_SUB $root, $B, $A))>;
+
+// fold (A + (0-B)) -> A-B
+def APlusZeroMinusB : GICombineRule<
+ (defs root:$root),
+ (match (G_SUB $sub, 0, $B),
+ (G_ADD $root, $A, $sub)),
+ (apply (G_SUB $root, $A, $B))>;
+
+ // fold (A+(B-A)) -> B
+ def APlusBMinusB : GICombineRule<
+ (defs root:$root),
+ (match (G_SUB $sub, $B, $A),
+ (G_ADD $root, $A, $sub)),
+ (apply (GIReplaceReg $root, $B))>;
+
+// fold ((B-A)+A) -> B
+ def BMinusAPlusA : GICombineRule<
+ (defs root:$root),
+ (match (G_SUB $sub, $B, $A),
+ (G_ADD $root, $sub, $A)),
+ (apply (GIReplaceReg $root, $B))>;
+
+// fold ((A-B)+(C-A)) -> (C-B)
+def AMinusBPlusCMinusA : GICombineRule<
+ (defs root:$root),
+ (match (G_SUB $sub1, $A, $B),
+ (G_SUB $sub2, $C, $A),
+ (G_ADD $root, $sub1, $sub2)),
+ (apply (G_SUB $root, $C, $B))>;
+
+// fold ((A-B)+(B-C)) -> (A-C)
+def AMinusBPlusBMinusC : GICombineRule<
+ (defs root:$root),
+ (match (G_SUB $sub1, $A, $B),
+ (G_SUB $sub2, $B, $C),
+ (G_ADD $root, $sub1, $sub2)),
+ (apply (G_SUB $root, $A, $C))>;
+
+// fold (A+(B-(A+C))) to (B-C)
+def APlusBMinusAplusC : GICombineRule<
+ (defs root:$root),
+ (match (G_ADD $add1, $A, $C),
+ (G_SUB $sub1, $B, $add1),
+ (G_ADD $root, $A, $sub1)),
+ (apply (G_SUB $root, $B, $C))>;
+
+// fold (A+(B-(C+A))) to (B-C)
+def APlusBMinusCPlusA : GICombineRule<
+ (defs root:$root),
+ (match (G_ADD $add1, $C, $A),
+ (G_SUB $sub1, $B, $add1),
+ (G_ADD $root, $A, $sub1)),
+ (apply (G_SUB $root, $B, $C))>;
+
+def integer_reassoc_combines: GICombineGroup<[
+ ZeroMinusAPlusB,
+ APlusZeroMinusB,
+ APlusBMinusB,
+ BMinusAPlusA,
+ AMinusBPlusCMinusA,
+ AMinusBPlusBMinusC,
+ APlusBMinusAplusC,
+ APlusBMinusCPlusA
+]>;
+
// FIXME: These should use the custom predicate feature once it lands.
def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero,
undef_to_negative_one,
@@ -1691,7 +1770,8 @@ def fma_combines : GICombineGroup<[combine_fadd_fmul_to_fmad_or_fma,
def constant_fold_binops : GICombineGroup<[constant_fold_binop,
constant_fold_fp_binop]>;
-def all_combines : GICombineGroup<[trivial_combines, vector_ops_combines,
+def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines,
+ vector_ops_combines,
insert_vec_elt_combines, extract_vec_elt_combines, combines_for_extload,
combine_extracted_vector_load,
undef_combines, identity_combines, phi_combines,
@@ -1713,7 +1793,8 @@ def all_combines : GICombineGroup<[trivial_combines, vector_ops_combines,
sub_add_reg, select_to_minmax, redundant_binop_in_equality,
fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors,
combine_concat_vector, double_icmp_zero_and_or_combine, match_addos,
- sext_trunc, zext_trunc, combine_shuffle_concat]>;
+ sext_trunc, zext_trunc, combine_shuffle_concat,
+ push_freeze_to_prevent_poison_from_propagating]>;
// A combine group used to for prelegalizer combiners at -O0. The combines in
// this group have been selected based on experiments to balance code size and
diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def
index 5670767..8daa8a6 100644
--- a/llvm/include/llvm/TargetParser/X86TargetParser.def
+++ b/llvm/include/llvm/TargetParser/X86TargetParser.def
@@ -159,20 +159,20 @@ X86_FEATURE_COMPAT(AVX512VL, "avx512vl", 20)
X86_FEATURE_COMPAT(AVX512BW, "avx512bw", 21)
X86_FEATURE_COMPAT(AVX512DQ, "avx512dq", 22)
X86_FEATURE_COMPAT(AVX512CD, "avx512cd", 23)
-X86_FEATURE_COMPAT(AVX512ER, "avx512er", 24)
-X86_FEATURE_COMPAT(AVX512PF, "avx512pf", 25)
-X86_FEATURE_COMPAT(AVX512VBMI, "avx512vbmi", 26)
-X86_FEATURE_COMPAT(AVX512IFMA, "avx512ifma", 27)
-X86_FEATURE_COMPAT(AVX5124VNNIW, "avx5124vnniw", 28)
-X86_FEATURE_COMPAT(AVX5124FMAPS, "avx5124fmaps", 29)
-X86_FEATURE_COMPAT(AVX512VPOPCNTDQ, "avx512vpopcntdq", 30)
-X86_FEATURE_COMPAT(AVX512VBMI2, "avx512vbmi2", 31)
-X86_FEATURE_COMPAT(GFNI, "gfni", 32)
-X86_FEATURE_COMPAT(VPCLMULQDQ, "vpclmulqdq", 33)
-X86_FEATURE_COMPAT(AVX512VNNI, "avx512vnni", 34)
-X86_FEATURE_COMPAT(AVX512BITALG, "avx512bitalg", 35)
-X86_FEATURE_COMPAT(AVX512BF16, "avx512bf16", 36)
-X86_FEATURE_COMPAT(AVX512VP2INTERSECT, "avx512vp2intersect", 37)
+X86_FEATURE (NF, "nf")
+X86_FEATURE (CF, "cf")
+X86_FEATURE_COMPAT(AVX512VBMI, "avx512vbmi", 24)
+X86_FEATURE_COMPAT(AVX512IFMA, "avx512ifma", 25)
+X86_FEATURE_COMPAT(AVX5124VNNIW, "avx5124vnniw", 26)
+X86_FEATURE_COMPAT(AVX5124FMAPS, "avx5124fmaps", 27)
+X86_FEATURE_COMPAT(AVX512VPOPCNTDQ, "avx512vpopcntdq", 28)
+X86_FEATURE_COMPAT(AVX512VBMI2, "avx512vbmi2", 29)
+X86_FEATURE_COMPAT(GFNI, "gfni", 30)
+X86_FEATURE_COMPAT(VPCLMULQDQ, "vpclmulqdq", 31)
+X86_FEATURE_COMPAT(AVX512VNNI, "avx512vnni", 32)
+X86_FEATURE_COMPAT(AVX512BITALG, "avx512bitalg", 33)
+X86_FEATURE_COMPAT(AVX512BF16, "avx512bf16", 34)
+X86_FEATURE_COMPAT(AVX512VP2INTERSECT, "avx512vp2intersect", 35)
// Below Features has some missings comparing to gcc, it's because gcc has some
// not one-to-one mapped in llvm.
X86_FEATURE_COMPAT(3DNOW, "3dnow", 0)
@@ -202,7 +202,7 @@ X86_FEATURE_COMPAT(MWAITX, "mwaitx", 0)
X86_FEATURE (X87, "x87")
X86_FEATURE_COMPAT(PCONFIG, "pconfig", 0)
X86_FEATURE_COMPAT(PKU, "pku", 0)
-X86_FEATURE_COMPAT(PREFETCHWT1, "prefetchwt1", 0)
+X86_FEATURE (EVEX512, "evex512")
X86_FEATURE_COMPAT(PRFCHW, "prfchw", 0)
X86_FEATURE_COMPAT(PTWRITE, "ptwrite", 0)
X86_FEATURE_COMPAT(RDPID, "rdpid", 0)
@@ -252,9 +252,6 @@ X86_FEATURE (EGPR, "egpr")
X86_FEATURE_COMPAT(USERMSR, "usermsr", 0)
X86_FEATURE_COMPAT(AVX10_1, "avx10.1-256", 0)
X86_FEATURE_COMPAT(AVX10_1_512, "avx10.1-512", 0)
-X86_FEATURE (EVEX512, "evex512")
-X86_FEATURE (NF, "nf")
-X86_FEATURE (CF, "cf")
// These features aren't really CPU features, but the frontend can set them.
X86_FEATURE (RETPOLINE_EXTERNAL_THUNK, "retpoline-external-thunk")
X86_FEATURE (RETPOLINE_INDIRECT_BRANCHES, "retpoline-indirect-branches")
diff --git a/llvm/include/llvm/Transforms/IPO/FunctionImport.h b/llvm/include/llvm/Transforms/IPO/FunctionImport.h
index 024bba8..72a0823 100644
--- a/llvm/include/llvm/Transforms/IPO/FunctionImport.h
+++ b/llvm/include/llvm/Transforms/IPO/FunctionImport.h
@@ -212,11 +212,15 @@ bool convertToDeclaration(GlobalValue &GV);
/// \p ModuleToSummariesForIndex will be populated with the needed summaries
/// from each required module path. Use a std::map instead of StringMap to get
/// stable order for bitcode emission.
+///
+/// \p DecSummaries will be populated with the subset of summary pointers
+/// that have 'declaration' import type among all summaries the module needs.
void gatherImportedSummariesForModule(
StringRef ModulePath,
const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
const FunctionImporter::ImportMapTy &ImportList,
- std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex);
+ std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex,
+ GVSummaryPtrSet &DecSummaries);
/// Emit into \p OutputFilename the files module \p ModulePath will import from.
std::error_code EmitImportsFiles(
diff --git a/llvm/lib/Analysis/CFG.cpp b/llvm/lib/Analysis/CFG.cpp
index 8528aa9..841b835 100644
--- a/llvm/lib/Analysis/CFG.cpp
+++ b/llvm/lib/Analysis/CFG.cpp
@@ -130,14 +130,21 @@ static const Loop *getOutermostLoop(const LoopInfo *LI, const BasicBlock *BB) {
return L ? L->getOutermostLoop() : nullptr;
}
-bool llvm::isPotentiallyReachableFromMany(
- SmallVectorImpl<BasicBlock *> &Worklist, const BasicBlock *StopBB,
- const SmallPtrSetImpl<BasicBlock *> *ExclusionSet, const DominatorTree *DT,
- const LoopInfo *LI) {
- // When the stop block is unreachable, it's dominated from everywhere,
+template <class StopSetT>
+static bool isReachableImpl(SmallVectorImpl<BasicBlock *> &Worklist,
+ const StopSetT &StopSet,
+ const SmallPtrSetImpl<BasicBlock *> *ExclusionSet,
+ const DominatorTree *DT, const LoopInfo *LI) {
+ // When a stop block is unreachable, it's dominated from everywhere,
// regardless of whether there's a path between the two blocks.
- if (DT && !DT->isReachableFromEntry(StopBB))
- DT = nullptr;
+ if (DT) {
+ for (auto *BB : StopSet) {
+ if (!DT->isReachableFromEntry(BB)) {
+ DT = nullptr;
+ break;
+ }
+ }
+ }
// We can't skip directly from a block that dominates the stop block if the
// exclusion block is potentially in between.
@@ -155,7 +162,13 @@ bool llvm::isPotentiallyReachableFromMany(
}
}
- const Loop *StopLoop = LI ? getOutermostLoop(LI, StopBB) : nullptr;
+ SmallPtrSet<const Loop *, 2> StopLoops;
+ if (LI) {
+ for (auto *StopSetBB : StopSet) {
+ if (const Loop *L = getOutermostLoop(LI, StopSetBB))
+ StopLoops.insert(L);
+ }
+ }
unsigned Limit = DefaultMaxBBsToExplore;
SmallPtrSet<const BasicBlock*, 32> Visited;
@@ -163,12 +176,16 @@ bool llvm::isPotentiallyReachableFromMany(
BasicBlock *BB = Worklist.pop_back_val();
if (!Visited.insert(BB).second)
continue;
- if (BB == StopBB)
+ if (StopSet.contains(BB))
return true;
if (ExclusionSet && ExclusionSet->count(BB))
continue;
- if (DT && DT->dominates(BB, StopBB))
- return true;
+ if (DT) {
+ if (llvm::any_of(StopSet, [&](const BasicBlock *StopBB) {
+ return DT->dominates(BB, StopBB);
+ }))
+ return true;
+ }
const Loop *Outer = nullptr;
if (LI) {
@@ -179,7 +196,7 @@ bool llvm::isPotentiallyReachableFromMany(
// excluded block. Clear Outer so we process BB's successors.
if (LoopsWithHoles.count(Outer))
Outer = nullptr;
- if (StopLoop && Outer == StopLoop)
+ if (StopLoops.contains(Outer))
return true;
}
@@ -204,6 +221,39 @@ bool llvm::isPotentiallyReachableFromMany(
return false;
}
+template <class T> class SingleEntrySet {
+public:
+ using const_iterator = const T *;
+
+ SingleEntrySet(T Elem) : Elem(Elem) {}
+
+ bool contains(T Other) const { return Elem == Other; }
+
+ const_iterator begin() const { return &Elem; }
+ const_iterator end() const { return &Elem + 1; }
+
+private:
+ T Elem;
+};
+
+bool llvm::isPotentiallyReachableFromMany(
+ SmallVectorImpl<BasicBlock *> &Worklist, const BasicBlock *StopBB,
+ const SmallPtrSetImpl<BasicBlock *> *ExclusionSet, const DominatorTree *DT,
+ const LoopInfo *LI) {
+ return isReachableImpl<SingleEntrySet<const BasicBlock *>>(
+ Worklist, SingleEntrySet<const BasicBlock *>(StopBB), ExclusionSet, DT,
+ LI);
+}
+
+bool llvm::isManyPotentiallyReachableFromMany(
+ SmallVectorImpl<BasicBlock *> &Worklist,
+ const SmallPtrSetImpl<const BasicBlock *> &StopSet,
+ const SmallPtrSetImpl<BasicBlock *> *ExclusionSet, const DominatorTree *DT,
+ const LoopInfo *LI) {
+ return isReachableImpl<SmallPtrSetImpl<const BasicBlock *>>(
+ Worklist, StopSet, ExclusionSet, DT, LI);
+}
+
bool llvm::isPotentiallyReachable(
const BasicBlock *A, const BasicBlock *B,
const SmallPtrSetImpl<BasicBlock *> *ExclusionSet, const DominatorTree *DT,
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 2a967f5..bc8b9b8 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -392,9 +392,9 @@ void RuntimePointerChecking::generateChecks(
bool RuntimePointerChecking::needsChecking(
const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const {
- for (unsigned I = 0, EI = M.Members.size(); EI != I; ++I)
- for (unsigned J = 0, EJ = N.Members.size(); EJ != J; ++J)
- if (needsChecking(M.Members[I], N.Members[J]))
+ for (const auto &I : M.Members)
+ for (const auto &J : N.Members)
+ if (needsChecking(I, J))
return true;
return false;
}
@@ -408,9 +408,7 @@ static const SCEV *getMinFromExprs(const SCEV *I, const SCEV *J,
if (!C)
return nullptr;
- if (C->getValue()->isNegative())
- return J;
- return I;
+ return C->getValue()->isNegative() ? J : I;
}
bool RuntimeCheckingPtrGroup::addPointer(unsigned Index,
@@ -508,8 +506,8 @@ void RuntimePointerChecking::groupChecks(
DenseMap<Value *, SmallVector<unsigned>> PositionMap;
for (unsigned Index = 0; Index < Pointers.size(); ++Index) {
- auto Iter = PositionMap.insert({Pointers[Index].PointerValue, {}});
- Iter.first->second.push_back(Index);
+ auto [It, _] = PositionMap.insert({Pointers[Index].PointerValue, {}});
+ It->second.push_back(Index);
}
// We need to keep track of what pointers we've already seen so we
@@ -608,16 +606,16 @@ void RuntimePointerChecking::printChecks(
raw_ostream &OS, const SmallVectorImpl<RuntimePointerCheck> &Checks,
unsigned Depth) const {
unsigned N = 0;
- for (const auto &Check : Checks) {
- const auto &First = Check.first->Members, &Second = Check.second->Members;
+ for (const auto &[Check1, Check2] : Checks) {
+ const auto &First = Check1->Members, &Second = Check2->Members;
OS.indent(Depth) << "Check " << N++ << ":\n";
- OS.indent(Depth + 2) << "Comparing group (" << Check.first << "):\n";
+ OS.indent(Depth + 2) << "Comparing group (" << Check1 << "):\n";
for (unsigned K = 0; K < First.size(); ++K)
OS.indent(Depth + 2) << *Pointers[First[K]].PointerValue << "\n";
- OS.indent(Depth + 2) << "Against group (" << Check.second << "):\n";
+ OS.indent(Depth + 2) << "Against group (" << Check2 << "):\n";
for (unsigned K = 0; K < Second.size(); ++K)
OS.indent(Depth + 2) << *Pointers[Second[K]].PointerValue << "\n";
}
@@ -1158,8 +1156,8 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
// First, count how many write and read accesses are in the alias set. Also
// collect MemAccessInfos for later.
SmallVector<MemAccessInfo, 4> AccessInfos;
- for (const Value *Ptr_ : ASPointers) {
- Value *Ptr = const_cast<Value *>(Ptr_);
+ for (const Value *ConstPtr : ASPointers) {
+ Value *Ptr = const_cast<Value *>(ConstPtr);
bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true));
if (IsWrite)
++NumWritePtrChecks;
@@ -1215,9 +1213,7 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
// We know that we need these checks, so we can now be more aggressive
// and add further checks if required (overflow checks).
CanDoAliasSetRT = true;
- for (auto Retry : Retries) {
- MemAccessInfo Access = Retry.first;
- Type *AccessTy = Retry.second;
+ for (const auto &[Access, AccessTy] : Retries) {
if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
DepSetId, TheLoop, RunningDepId, ASId,
ShouldCheckWrap, /*Assume=*/true)) {
@@ -1289,12 +1285,11 @@ void AccessAnalysis::processMemAccesses() {
LLVM_DEBUG(dbgs() << " AST: "; AST.dump());
LLVM_DEBUG(dbgs() << "LAA: Accesses(" << Accesses.size() << "):\n");
LLVM_DEBUG({
- for (auto A : Accesses)
- dbgs() << "\t" << *A.first.getPointer() << " ("
- << (A.first.getInt()
- ? "write"
- : (ReadOnlyPtr.count(A.first.getPointer()) ? "read-only"
- : "read"))
+ for (const auto &[A, _] : Accesses)
+ dbgs() << "\t" << *A.getPointer() << " ("
+ << (A.getInt() ? "write"
+ : (ReadOnlyPtr.count(A.getPointer()) ? "read-only"
+ : "read"))
<< ")\n";
});
@@ -1323,16 +1318,16 @@ void AccessAnalysis::processMemAccesses() {
bool UseDeferred = SetIteration > 0;
PtrAccessMap &S = UseDeferred ? DeferredAccesses : Accesses;
- for (const Value *Ptr_ : ASPointers) {
- Value *Ptr = const_cast<Value *>(Ptr_);
+ for (const Value *ConstPtr : ASPointers) {
+ Value *Ptr = const_cast<Value *>(ConstPtr);
// For a single memory access in AliasSetTracker, Accesses may contain
// both read and write, and they both need to be handled for CheckDeps.
- for (const auto &AC : S) {
- if (AC.first.getPointer() != Ptr)
+ for (const auto &[AC, _] : S) {
+ if (AC.getPointer() != Ptr)
continue;
- bool IsWrite = AC.first.getInt();
+ bool IsWrite = AC.getInt();
// If we're using the deferred access set, then it contains only
// reads.
@@ -1859,10 +1854,7 @@ static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
// (If so, then we have proven (**) because |Dist| >= -1*Dist)
const SCEV *NegDist = SE.getNegativeSCEV(CastedDist);
Minus = SE.getMinusSCEV(NegDist, CastedProduct);
- if (SE.isKnownPositive(Minus))
- return true;
-
- return false;
+ return SE.isKnownPositive(Minus);
}
/// Check the dependence for two accesses with the same stride \p Stride.
@@ -2050,7 +2042,7 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
if (isa<SCEVCouldNotCompute>(Dist)) {
// TODO: Relax requirement that there is a common stride to retry with
// non-constant distance dependencies.
- FoundNonConstantDistanceDependence |= !!CommonStride;
+ FoundNonConstantDistanceDependence |= CommonStride.has_value();
LLVM_DEBUG(dbgs() << "LAA: Dependence because of uncomputable distance.\n");
return Dependence::Unknown;
}
@@ -2093,11 +2085,10 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
if (HasSameSize) {
// Write to the same location with the same size.
return Dependence::Forward;
- } else {
- LLVM_DEBUG(dbgs() << "LAA: possibly zero dependence difference but "
- "different type sizes\n");
- return Dependence::Unknown;
}
+ LLVM_DEBUG(dbgs() << "LAA: possibly zero dependence difference but "
+ "different type sizes\n");
+ return Dependence::Unknown;
}
bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
@@ -2343,7 +2334,7 @@ bool MemoryDepChecker::areDepsSafe(
}
++OI;
}
- AI++;
+ ++AI;
}
}
@@ -2352,8 +2343,8 @@ bool MemoryDepChecker::areDepsSafe(
}
SmallVector<Instruction *, 4>
-MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool isWrite) const {
- MemAccessInfo Access(Ptr, isWrite);
+MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool IsWrite) const {
+ MemAccessInfo Access(Ptr, IsWrite);
auto &IndexVector = Accesses.find(Access)->second;
SmallVector<Instruction *, 4> Insts;
@@ -2729,13 +2720,14 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
}
void LoopAccessInfo::emitUnsafeDependenceRemark() {
- auto Deps = getDepChecker().getDependences();
+ const auto *Deps = getDepChecker().getDependences();
if (!Deps)
return;
- auto Found = llvm::find_if(*Deps, [](const MemoryDepChecker::Dependence &D) {
- return MemoryDepChecker::Dependence::isSafeForVectorization(D.Type) !=
- MemoryDepChecker::VectorizationSafetyStatus::Safe;
- });
+ const auto *Found =
+ llvm::find_if(*Deps, [](const MemoryDepChecker::Dependence &D) {
+ return MemoryDepChecker::Dependence::isSafeForVectorization(D.Type) !=
+ MemoryDepChecker::VectorizationSafetyStatus::Safe;
+ });
if (Found == Deps->end())
return;
MemoryDepChecker::Dependence Dep = *Found;
@@ -2874,9 +2866,9 @@ static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
// Check that all of the gep indices are uniform except for our induction
// operand.
- for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i)
- if (i != InductionOperand &&
- !SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(i)), Lp))
+ for (unsigned I = 0, E = GEP->getNumOperands(); I != E; ++I)
+ if (I != InductionOperand &&
+ !SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(I)), Lp))
return Ptr;
return GEP->getOperand(InductionOperand);
}
@@ -3072,9 +3064,8 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
DepChecker =
std::make_unique<MemoryDepChecker>(*PSE, L, MaxTargetVectorWidthInBits);
PtrRtChecking = std::make_unique<RuntimePointerChecking>(*DepChecker, SE);
- if (canAnalyzeLoop()) {
+ if (canAnalyzeLoop())
analyzeLoop(AA, LI, TLI, DT);
- }
}
void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
@@ -3126,13 +3117,13 @@ void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
}
const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L) {
- auto I = LoopAccessInfoMap.insert({&L, nullptr});
+ auto [It, Inserted] = LoopAccessInfoMap.insert({&L, nullptr});
- if (I.second)
- I.first->second =
+ if (Inserted)
+ It->second =
std::make_unique<LoopAccessInfo>(&L, &SE, TTI, TLI, &AA, &DT, &LI);
- return *I.first->second;
+ return *It->second;
}
bool LoopAccessInfoManager::invalidate(
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 704f926..b83e2b4 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -9198,8 +9198,25 @@ ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromICmp(
// Since the loop is finite, an invariant RHS cannot include the boundary
// value, otherwise it would loop forever.
if (!EnableFiniteLoopControl || !ControllingFiniteLoop ||
- !isLoopInvariant(RHS, L))
- break;
+ !isLoopInvariant(RHS, L)) {
+ // Otherwise, perform the addition in a wider type, to avoid overflow.
+ // If the LHS is an addrec with the appropriate nowrap flag, the
+ // extension will be sunk into it and the exit count can be analyzed.
+ auto *OldType = dyn_cast<IntegerType>(LHS->getType());
+ if (!OldType)
+ break;
+ // Prefer doubling the bitwidth over adding a single bit to make it more
+ // likely that we use a legal type.
+ auto *NewType =
+ Type::getIntNTy(OldType->getContext(), OldType->getBitWidth() * 2);
+ if (ICmpInst::isSigned(Pred)) {
+ LHS = getSignExtendExpr(LHS, NewType);
+ RHS = getSignExtendExpr(RHS, NewType);
+ } else {
+ LHS = getZeroExtendExpr(LHS, NewType);
+ RHS = getZeroExtendExpr(RHS, NewType);
+ }
+ }
RHS = getAddExpr(getOne(RHS->getType()), RHS);
[[fallthrough]];
case ICmpInst::ICMP_SLT:
diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp
index 592caf2..6b760fb 100644
--- a/llvm/lib/Analysis/TargetLibraryInfo.cpp
+++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp
@@ -1204,26 +1204,31 @@ void TargetLibraryInfoImpl::addVectorizableFunctions(ArrayRef<VecDesc> Fns) {
static const VecDesc VecFuncs_Accelerate[] = {
#define TLI_DEFINE_ACCELERATE_VECFUNCS
#include "llvm/Analysis/VecFuncs.def"
+#undef TLI_DEFINE_ACCELERATE_VECFUNCS
};
static const VecDesc VecFuncs_DarwinLibSystemM[] = {
#define TLI_DEFINE_DARWIN_LIBSYSTEM_M_VECFUNCS
#include "llvm/Analysis/VecFuncs.def"
+#undef TLI_DEFINE_DARWIN_LIBSYSTEM_M_VECFUNCS
};
static const VecDesc VecFuncs_LIBMVEC_X86[] = {
#define TLI_DEFINE_LIBMVEC_X86_VECFUNCS
#include "llvm/Analysis/VecFuncs.def"
+#undef TLI_DEFINE_LIBMVEC_X86_VECFUNCS
};
static const VecDesc VecFuncs_MASSV[] = {
#define TLI_DEFINE_MASSV_VECFUNCS
#include "llvm/Analysis/VecFuncs.def"
+#undef TLI_DEFINE_MASSV_VECFUNCS
};
static const VecDesc VecFuncs_SVML[] = {
#define TLI_DEFINE_SVML_VECFUNCS
#include "llvm/Analysis/VecFuncs.def"
+#undef TLI_DEFINE_SVML_VECFUNCS
};
static const VecDesc VecFuncs_SLEEFGNUABI_VF2[] = {
@@ -1231,18 +1236,21 @@ static const VecDesc VecFuncs_SLEEFGNUABI_VF2[] = {
#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, VABI_PREFIX) \
{SCAL, VEC, VF, /* MASK = */ false, VABI_PREFIX},
#include "llvm/Analysis/VecFuncs.def"
+#undef TLI_DEFINE_SLEEFGNUABI_VF2_VECFUNCS
};
static const VecDesc VecFuncs_SLEEFGNUABI_VF4[] = {
#define TLI_DEFINE_SLEEFGNUABI_VF4_VECFUNCS
#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, VABI_PREFIX) \
{SCAL, VEC, VF, /* MASK = */ false, VABI_PREFIX},
#include "llvm/Analysis/VecFuncs.def"
+#undef TLI_DEFINE_SLEEFGNUABI_VF4_VECFUNCS
};
static const VecDesc VecFuncs_SLEEFGNUABI_VFScalable[] = {
#define TLI_DEFINE_SLEEFGNUABI_SCALABLE_VECFUNCS
#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, MASK, VABI_PREFIX) \
{SCAL, VEC, VF, MASK, VABI_PREFIX},
#include "llvm/Analysis/VecFuncs.def"
+#undef TLI_DEFINE_SLEEFGNUABI_SCALABLE_VECFUNCS
};
static const VecDesc VecFuncs_ArmPL[] = {
@@ -1250,6 +1258,7 @@ static const VecDesc VecFuncs_ArmPL[] = {
#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, MASK, VABI_PREFIX) \
{SCAL, VEC, VF, MASK, VABI_PREFIX},
#include "llvm/Analysis/VecFuncs.def"
+#undef TLI_DEFINE_ARMPL_VECFUNCS
};
const VecDesc VecFuncs_AMDLIBM[] = {
@@ -1257,6 +1266,7 @@ const VecDesc VecFuncs_AMDLIBM[] = {
#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, MASK, VABI_PREFIX) \
{SCAL, VEC, VF, MASK, VABI_PREFIX},
#include "llvm/Analysis/VecFuncs.def"
+#undef TLI_DEFINE_AMDLIBM_VECFUNCS
};
void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib(
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index f6a458f..82b6d7e 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1037,7 +1037,7 @@ TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val,
InstructionCost TargetTransformInfo::getReplicationShuffleCost(
Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts,
- TTI::TargetCostKind CostKind) {
+ TTI::TargetCostKind CostKind) const {
InstructionCost Cost = TTIImpl->getReplicationShuffleCost(
EltTy, ReplicationFactor, VF, DemandedDstElts, CostKind);
assert(Cost >= 0 && "TTI should not produce negative costs!");
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index c4cea3d..c5fdd11 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -428,6 +428,11 @@ class IndexBitcodeWriter : public BitcodeWriterBase {
/// The combined index to write to bitcode.
const ModuleSummaryIndex &Index;
+ /// When writing combined summaries, provides the set of global value
+ /// summaries for which the value (function, function alias, etc) should be
+ /// imported as a declaration.
+ const GVSummaryPtrSet *DecSummaries = nullptr;
+
/// When writing a subset of the index for distributed backends, client
/// provides a map of modules to the corresponding GUIDs/summaries to write.
const std::map<std::string, GVSummaryMapTy> *ModuleToSummariesForIndex;
@@ -452,11 +457,16 @@ public:
/// Constructs a IndexBitcodeWriter object for the given combined index,
/// writing to the provided \p Buffer. When writing a subset of the index
/// for a distributed backend, provide a \p ModuleToSummariesForIndex map.
+ /// If provided, \p DecSummaries specifies the set of summaries for
+ /// which the corresponding functions or aliased functions should be imported
+ /// as a declaration (but not definition) for each module.
IndexBitcodeWriter(BitstreamWriter &Stream, StringTableBuilder &StrtabBuilder,
const ModuleSummaryIndex &Index,
+ const GVSummaryPtrSet *DecSummaries = nullptr,
const std::map<std::string, GVSummaryMapTy>
*ModuleToSummariesForIndex = nullptr)
: BitcodeWriterBase(Stream, StrtabBuilder), Index(Index),
+ DecSummaries(DecSummaries),
ModuleToSummariesForIndex(ModuleToSummariesForIndex) {
// Assign unique value ids to all summaries to be written, for use
// in writing out the call graph edges. Save the mapping from GUID
@@ -1202,7 +1212,8 @@ static uint64_t getEncodedFFlags(FunctionSummary::FFlags Flags) {
// Decode the flags for GlobalValue in the summary. See getDecodedGVSummaryFlags
// in BitcodeReader.cpp.
-static uint64_t getEncodedGVSummaryFlags(GlobalValueSummary::GVFlags Flags) {
+static uint64_t getEncodedGVSummaryFlags(GlobalValueSummary::GVFlags Flags,
+ bool ImportAsDecl = false) {
uint64_t RawFlags = 0;
RawFlags |= Flags.NotEligibleToImport; // bool
@@ -1217,7 +1228,8 @@ static uint64_t getEncodedGVSummaryFlags(GlobalValueSummary::GVFlags Flags) {
RawFlags |= (Flags.Visibility << 8); // 2 bits
- RawFlags |= (Flags.ImportType << 10); // 1 bit
+ unsigned ImportType = Flags.ImportType | ImportAsDecl;
+ RawFlags |= (ImportType << 10); // 1 bit
return RawFlags;
}
@@ -4543,6 +4555,12 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
unsigned AllocAbbrev = Stream.EmitAbbrev(std::move(Abbv));
+ auto shouldImportValueAsDecl = [&](GlobalValueSummary *GVS) -> bool {
+ if (DecSummaries == nullptr)
+ return false;
+ return DecSummaries->contains(GVS);
+ };
+
// The aliases are emitted as a post-pass, and will point to the value
// id of the aliasee. Save them in a vector for post-processing.
SmallVector<AliasSummary *, 64> Aliases;
@@ -4653,7 +4671,8 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
NameVals.push_back(*ValueId);
assert(ModuleIdMap.count(FS->modulePath()));
NameVals.push_back(ModuleIdMap[FS->modulePath()]);
- NameVals.push_back(getEncodedGVSummaryFlags(FS->flags()));
+ NameVals.push_back(
+ getEncodedGVSummaryFlags(FS->flags(), shouldImportValueAsDecl(FS)));
NameVals.push_back(FS->instCount());
NameVals.push_back(getEncodedFFlags(FS->fflags()));
NameVals.push_back(FS->entryCount());
@@ -4702,7 +4721,8 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
NameVals.push_back(AliasValueId);
assert(ModuleIdMap.count(AS->modulePath()));
NameVals.push_back(ModuleIdMap[AS->modulePath()]);
- NameVals.push_back(getEncodedGVSummaryFlags(AS->flags()));
+ NameVals.push_back(
+ getEncodedGVSummaryFlags(AS->flags(), shouldImportValueAsDecl(AS)));
auto AliaseeValueId = SummaryToValueIdMap[&AS->getAliasee()];
assert(AliaseeValueId);
NameVals.push_back(AliaseeValueId);
@@ -5036,8 +5056,9 @@ void BitcodeWriter::writeModule(const Module &M,
void BitcodeWriter::writeIndex(
const ModuleSummaryIndex *Index,
- const std::map<std::string, GVSummaryMapTy> *ModuleToSummariesForIndex) {
- IndexBitcodeWriter IndexWriter(*Stream, StrtabBuilder, *Index,
+ const std::map<std::string, GVSummaryMapTy> *ModuleToSummariesForIndex,
+ const GVSummaryPtrSet *DecSummaries) {
+ IndexBitcodeWriter IndexWriter(*Stream, StrtabBuilder, *Index, DecSummaries,
ModuleToSummariesForIndex);
IndexWriter.write();
}
@@ -5090,12 +5111,13 @@ void IndexBitcodeWriter::write() {
// index for a distributed backend, provide a \p ModuleToSummariesForIndex map.
void llvm::writeIndexToFile(
const ModuleSummaryIndex &Index, raw_ostream &Out,
- const std::map<std::string, GVSummaryMapTy> *ModuleToSummariesForIndex) {
+ const std::map<std::string, GVSummaryMapTy> *ModuleToSummariesForIndex,
+ const GVSummaryPtrSet *DecSummaries) {
SmallVector<char, 0> Buffer;
Buffer.reserve(256 * 1024);
BitcodeWriter Writer(Buffer);
- Writer.writeIndex(&Index, ModuleToSummariesForIndex);
+ Writer.writeIndex(&Index, ModuleToSummariesForIndex, DecSummaries);
Writer.writeStrtab();
Out.write((char *)&Buffer.front(), Buffer.size());
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 6022afb..c1e7f01 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -1539,8 +1539,8 @@ void DwarfCompileUnit::addGlobalNameForTypeUnit(StringRef Name,
}
/// Add a new global type to the unit.
-void DwarfCompileUnit::addGlobalType(const DIType *Ty, const DIE &Die,
- const DIScope *Context) {
+void DwarfCompileUnit::addGlobalTypeImpl(const DIType *Ty, const DIE &Die,
+ const DIScope *Context) {
if (!hasDwarfPubSections())
return;
std::string FullName = getParentContextString(Context) + Ty->getName().str();
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
index dc772bb..76584b3 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -335,8 +335,8 @@ public:
void addGlobalNameForTypeUnit(StringRef Name, const DIScope *Context);
/// Add a new global type to the compile unit.
- void addGlobalType(const DIType *Ty, const DIE &Die,
- const DIScope *Context) override;
+ void addGlobalTypeImpl(const DIType *Ty, const DIE &Die,
+ const DIScope *Context) override;
/// Add a new global type present in a type unit to this compile unit.
void addGlobalTypeUnitType(const DIType *Ty, const DIScope *Context);
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index 1e33c27..6c04fa1 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -578,28 +578,33 @@ DIE *DwarfUnit::createTypeDIE(const DIScope *Context, DIE &ContextDIE,
// Create new type.
DIE &TyDIE = createAndAddDIE(Ty->getTag(), ContextDIE, Ty);
- updateAcceleratorTables(Context, Ty, TyDIE);
+ auto construct = [&](const auto *Ty) {
+ updateAcceleratorTables(Context, Ty, TyDIE);
+ constructTypeDIE(TyDIE, Ty);
+ };
- if (auto *BT = dyn_cast<DIBasicType>(Ty))
- constructTypeDIE(TyDIE, BT);
- else if (auto *ST = dyn_cast<DIStringType>(Ty))
- constructTypeDIE(TyDIE, ST);
- else if (auto *STy = dyn_cast<DISubroutineType>(Ty))
- constructTypeDIE(TyDIE, STy);
- else if (auto *CTy = dyn_cast<DICompositeType>(Ty)) {
+ if (auto *CTy = dyn_cast<DICompositeType>(Ty)) {
if (DD->generateTypeUnits() && !Ty->isForwardDecl() &&
(Ty->getRawName() || CTy->getRawIdentifier())) {
// Skip updating the accelerator tables since this is not the full type.
- if (MDString *TypeId = CTy->getRawIdentifier())
+ if (MDString *TypeId = CTy->getRawIdentifier()) {
+ addGlobalType(Ty, TyDIE, Context);
DD->addDwarfTypeUnitType(getCU(), TypeId->getString(), TyDIE, CTy);
- else
+ } else {
+ updateAcceleratorTables(Context, Ty, TyDIE);
finishNonUnitTypeDIE(TyDIE, CTy);
+ }
return &TyDIE;
}
- constructTypeDIE(TyDIE, CTy);
- } else {
- constructTypeDIE(TyDIE, cast<DIDerivedType>(Ty));
- }
+ construct(CTy);
+ } else if (auto *BT = dyn_cast<DIBasicType>(Ty))
+ construct(BT);
+ else if (auto *ST = dyn_cast<DIStringType>(Ty))
+ construct(ST);
+ else if (auto *STy = dyn_cast<DISubroutineType>(Ty))
+ construct(STy);
+ else
+ construct(cast<DIDerivedType>(Ty));
return &TyDIE;
}
@@ -633,21 +638,31 @@ DIE *DwarfUnit::getOrCreateTypeDIE(const MDNode *TyNode) {
void DwarfUnit::updateAcceleratorTables(const DIScope *Context,
const DIType *Ty, const DIE &TyDIE) {
- if (!Ty->getName().empty() && !Ty->isForwardDecl()) {
- bool IsImplementation = false;
- if (auto *CT = dyn_cast<DICompositeType>(Ty)) {
- // A runtime language of 0 actually means C/C++ and that any
- // non-negative value is some version of Objective-C/C++.
- IsImplementation = CT->getRuntimeLang() == 0 || CT->isObjcClassComplete();
- }
- unsigned Flags = IsImplementation ? dwarf::DW_FLAG_type_implementation : 0;
- DD->addAccelType(*this, CUNode->getNameTableKind(), Ty->getName(), TyDIE,
- Flags);
+ if (Ty->getName().empty())
+ return;
+ if (Ty->isForwardDecl())
+ return;
- if (!Context || isa<DICompileUnit>(Context) || isa<DIFile>(Context) ||
- isa<DINamespace>(Context) || isa<DICommonBlock>(Context))
- addGlobalType(Ty, TyDIE, Context);
+ // Add a temporary record for this type, to be added later.
+
+ bool IsImplementation = false;
+ if (auto *CT = dyn_cast<DICompositeType>(Ty)) {
+ // A runtime language of 0 actually means C/C++ and that any
+ // non-negative value is some version of Objective-C/C++.
+ IsImplementation = CT->getRuntimeLang() == 0 || CT->isObjcClassComplete();
}
+ unsigned Flags = IsImplementation ? dwarf::DW_FLAG_type_implementation : 0;
+ DD->addAccelType(*this, CUNode->getNameTableKind(), Ty->getName(), TyDIE,
+ Flags);
+
+ addGlobalType(Ty, TyDIE, Context);
+}
+
+void DwarfUnit::addGlobalType(const DIType *Ty, const DIE &TyDIE,
+ const DIScope *Context) {
+ if (!Context || isa<DICompileUnit>(Context) || isa<DIFile>(Context) ||
+ isa<DINamespace>(Context) || isa<DICommonBlock>(Context))
+ addGlobalTypeImpl(Ty, TyDIE, Context);
}
void DwarfUnit::addType(DIE &Entity, const DIType *Ty,
@@ -1844,8 +1859,8 @@ void DwarfTypeUnit::addGlobalName(StringRef Name, const DIE &Die,
getCU().addGlobalNameForTypeUnit(Name, Context);
}
-void DwarfTypeUnit::addGlobalType(const DIType *Ty, const DIE &Die,
- const DIScope *Context) {
+void DwarfTypeUnit::addGlobalTypeImpl(const DIType *Ty, const DIE &Die,
+ const DIScope *Context) {
getCU().addGlobalTypeUnitType(Ty, Context);
}
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
index 18f50f8..0225654 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -128,8 +128,10 @@ public:
const DIScope *Context) = 0;
/// Add a new global type to the compile unit.
- virtual void addGlobalType(const DIType *Ty, const DIE &Die,
- const DIScope *Context) = 0;
+ virtual void addGlobalTypeImpl(const DIType *Ty, const DIE &Die,
+ const DIScope *Context) = 0;
+
+ void addGlobalType(const DIType *Ty, const DIE &Die, const DIScope *Context);
/// Returns the DIE map slot for the specified debug variable.
///
@@ -397,8 +399,8 @@ public:
}
void addGlobalName(StringRef Name, const DIE &Die,
const DIScope *Context) override;
- void addGlobalType(const DIType *Ty, const DIE &Die,
- const DIScope *Context) override;
+ void addGlobalTypeImpl(const DIType *Ty, const DIE &Die,
+ const DIScope *Context) override;
DwarfCompileUnit &getCU() override { return CU; }
};
} // end llvm namespace
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index ee44e93..d2b756e 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -37,6 +37,7 @@
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
@@ -937,6 +938,36 @@ void AtomicExpandImpl::expandPartwordAtomicRMW(
AI->eraseFromParent();
}
+/// Copy metadata that's safe to preserve when widening atomics.
+static void copyMetadataForAtomic(Instruction &Dest,
+ const Instruction &Source) {
+ SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
+ Source.getAllMetadata(MD);
+ LLVMContext &Ctx = Dest.getContext();
+ MDBuilder MDB(Ctx);
+
+ for (auto [ID, N] : MD) {
+ switch (ID) {
+ case LLVMContext::MD_dbg:
+ case LLVMContext::MD_tbaa:
+ case LLVMContext::MD_tbaa_struct:
+ case LLVMContext::MD_alias_scope:
+ case LLVMContext::MD_noalias:
+ case LLVMContext::MD_access_group:
+ case LLVMContext::MD_mmra:
+ Dest.setMetadata(ID, N);
+ break;
+ default:
+ if (ID == Ctx.getMDKindID("amdgpu.no.remote.memory"))
+ Dest.setMetadata(ID, N);
+ else if (ID == Ctx.getMDKindID("amdgpu.no.fine.grained.memory"))
+ Dest.setMetadata(ID, N);
+
+ break;
+ }
+ }
+}
+
// Widen the bitwise atomicrmw (or/xor/and) to the minimum supported width.
AtomicRMWInst *AtomicExpandImpl::widenPartwordAtomicRMW(AtomicRMWInst *AI) {
ReplacementIRBuilder Builder(AI, *DL);
@@ -965,7 +996,8 @@ AtomicRMWInst *AtomicExpandImpl::widenPartwordAtomicRMW(AtomicRMWInst *AI) {
AtomicRMWInst *NewAI = Builder.CreateAtomicRMW(
Op, PMV.AlignedAddr, NewOperand, PMV.AlignedAddrAlignment,
AI->getOrdering(), AI->getSyncScopeID());
- // TODO: Preserve metadata
+
+ copyMetadataForAtomic(*NewAI, *AI);
Value *FinalOldResult = extractMaskedValue(Builder, NewAI, PMV);
AI->replaceAllUsesWith(FinalOldResult);
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 22eb4a3..4cc602b 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -223,6 +223,70 @@ void CombinerHelper::applyCombineCopy(MachineInstr &MI) {
replaceRegWith(MRI, DstReg, SrcReg);
}
+bool CombinerHelper::matchFreezeOfSingleMaybePoisonOperand(
+ MachineInstr &MI, BuildFnTy &MatchInfo) {
+ // Ported from InstCombinerImpl::pushFreezeToPreventPoisonFromPropagating.
+ Register DstOp = MI.getOperand(0).getReg();
+ Register OrigOp = MI.getOperand(1).getReg();
+
+ if (!MRI.hasOneNonDBGUse(OrigOp))
+ return false;
+
+ MachineInstr *OrigDef = MRI.getUniqueVRegDef(OrigOp);
+ // Even if only a single operand of the PHI is not guaranteed non-poison,
+ // moving freeze() backwards across a PHI can cause optimization issues for
+ // other users of that operand.
+ //
+ // Moving freeze() from one of the output registers of a G_UNMERGE_VALUES to
+ // the source register is unprofitable because it makes the freeze() more
+ // strict than is necessary (it would affect the whole register instead of
+ // just the subreg being frozen).
+ if (OrigDef->isPHI() || isa<GUnmerge>(OrigDef))
+ return false;
+
+ if (canCreateUndefOrPoison(OrigOp, MRI,
+ /*ConsiderFlagsAndMetadata=*/false))
+ return false;
+
+ std::optional<MachineOperand> MaybePoisonOperand;
+ for (MachineOperand &Operand : OrigDef->uses()) {
+ if (!Operand.isReg())
+ return false;
+
+ if (isGuaranteedNotToBeUndefOrPoison(Operand.getReg(), MRI))
+ continue;
+
+ if (!MaybePoisonOperand)
+ MaybePoisonOperand = Operand;
+ else {
+ // We have more than one maybe-poison operand. Moving the freeze is
+ // unsafe.
+ return false;
+ }
+ }
+
+ cast<GenericMachineInstr>(OrigDef)->dropPoisonGeneratingFlags();
+
+ // Eliminate freeze if all operands are guaranteed non-poison.
+ if (!MaybePoisonOperand) {
+ MatchInfo = [=](MachineIRBuilder &B) { MRI.replaceRegWith(DstOp, OrigOp); };
+ return true;
+ }
+
+ Register MaybePoisonOperandReg = MaybePoisonOperand->getReg();
+ LLT MaybePoisonOperandRegTy = MRI.getType(MaybePoisonOperandReg);
+
+ MatchInfo = [=](MachineIRBuilder &B) mutable {
+ B.setInsertPt(*OrigDef->getParent(), OrigDef->getIterator());
+ auto Freeze = B.buildFreeze(MaybePoisonOperandRegTy, MaybePoisonOperandReg);
+ replaceRegOpWith(
+ MRI, *OrigDef->findRegisterUseOperand(MaybePoisonOperandReg, TRI),
+ Freeze.getReg(0));
+ replaceRegWith(MRI, DstOp, OrigOp);
+ };
+ return true;
+}
+
bool CombinerHelper::matchCombineConcatVectors(MachineInstr &MI,
SmallVector<Register> &Ops) {
assert(MI.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
diff --git a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
index 14e1e1f..5acf35b3 100644
--- a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
@@ -538,6 +538,13 @@ bool InlineAsmLowering::lowerInlineAsm(
}
}
+ // Add rounding control registers as implicit def for inline asm.
+ if (MF.getFunction().hasFnAttribute(Attribute::StrictFP)) {
+ ArrayRef<MCPhysReg> RCRegs = TLI->getRoundingControlRegisters();
+ for (MCPhysReg Reg : RCRegs)
+ Inst.addReg(Reg, RegState::ImplicitDefine);
+ }
+
if (auto Bundle = Call.getOperandBundle(LLVMContext::OB_convergencectrl)) {
auto *Token = Bundle->Inputs[0].get();
ArrayRef<Register> SourceRegs = GetOrCreateVRegs(*Token);
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 4050784..d8b0f52 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -1296,7 +1296,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
MI.eraseFromParent();
return Legalized;
}
-
+ case TargetOpcode::G_CONSTANT_FOLD_BARRIER:
case TargetOpcode::G_FREEZE: {
if (TypeIdx != 0)
return UnableToLegalize;
@@ -1310,7 +1310,8 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
SmallVector<Register, 8> Parts;
for (unsigned i = 0; i < Unmerge->getNumDefs(); ++i) {
Parts.push_back(
- MIRBuilder.buildFreeze(NarrowTy, Unmerge.getReg(i)).getReg(0));
+ MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy}, {Unmerge.getReg(i)})
+ .getReg(0));
}
MIRBuilder.buildMergeLikeInstr(MI.getOperand(0).getReg(), Parts);
@@ -2515,6 +2516,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
return Legalized;
}
case TargetOpcode::G_FREEZE:
+ case TargetOpcode::G_CONSTANT_FOLD_BARRIER:
Observer.changingInstr(MI);
widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
widenScalarDst(MI, WideTy);
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index cd5dc0e..f455482 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -1745,11 +1745,20 @@ static bool canCreateUndefOrPoison(Register Reg, const MachineRegisterInfo &MRI,
UndefPoisonKind Kind) {
MachineInstr *RegDef = MRI.getVRegDef(Reg);
+ if (auto *GMI = dyn_cast<GenericMachineInstr>(RegDef)) {
+ if (ConsiderFlagsAndMetadata && includesPoison(Kind) &&
+ GMI->hasPoisonGeneratingFlags())
+ return true;
+ } else {
+ // Conservatively return true.
+ return true;
+ }
+
switch (RegDef->getOpcode()) {
case TargetOpcode::G_FREEZE:
return false;
default:
- return true;
+ return !isa<GCastOp>(RegDef) && !isa<GBinOp>(RegDef);
}
}
@@ -1767,8 +1776,17 @@ static bool isGuaranteedNotToBeUndefOrPoison(Register Reg,
return true;
case TargetOpcode::G_IMPLICIT_DEF:
return !includesUndef(Kind);
- default:
- return false;
+ default: {
+ auto MOCheck = [&](const MachineOperand &MO) {
+ if (!MO.isReg())
+ return true;
+ return ::isGuaranteedNotToBeUndefOrPoison(MO.getReg(), MRI, Depth + 1,
+ Kind);
+ };
+ return !::canCreateUndefOrPoison(Reg, MRI,
+ /*ConsiderFlagsAndMetadata=*/true, Kind) &&
+ all_of(RegDef->uses(), MOCheck);
+ }
}
}
diff --git a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
index a9b59e7..fc4be84 100644
--- a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
+++ b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
@@ -64,10 +64,10 @@ struct VectorInfo;
struct InterleavedLoadCombineImpl {
public:
InterleavedLoadCombineImpl(Function &F, DominatorTree &DT, MemorySSA &MSSA,
+ const TargetTransformInfo &TTI,
const TargetMachine &TM)
: F(F), DT(DT), MSSA(MSSA),
- TLI(*TM.getSubtargetImpl(F)->getTargetLowering()),
- TTI(TM.getTargetTransformInfo(F)) {}
+ TLI(*TM.getSubtargetImpl(F)->getTargetLowering()), TTI(TTI) {}
/// Scan the function for interleaved load candidates and execute the
/// replacement if applicable.
@@ -87,7 +87,7 @@ private:
const TargetLowering &TLI;
/// Target Transform Information
- const TargetTransformInfo TTI;
+ const TargetTransformInfo &TTI;
/// Find the instruction in sets LIs that dominates all others, return nullptr
/// if there is none.
@@ -1329,6 +1329,7 @@ struct InterleavedLoadCombine : public FunctionPass {
return InterleavedLoadCombineImpl(
F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
getAnalysis<MemorySSAWrapperPass>().getMSSA(),
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F),
TPC->getTM<TargetMachine>())
.run();
}
@@ -1336,6 +1337,7 @@ struct InterleavedLoadCombine : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MemorySSAWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
FunctionPass::getAnalysisUsage(AU);
}
@@ -1348,7 +1350,8 @@ InterleavedLoadCombinePass::run(Function &F, FunctionAnalysisManager &FAM) {
auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
auto &MemSSA = FAM.getResult<MemorySSAAnalysis>(F).getMSSA();
- bool Changed = InterleavedLoadCombineImpl(F, DT, MemSSA, *TM).run();
+ auto &TTI = FAM.getResult<TargetIRAnalysis>(F);
+ bool Changed = InterleavedLoadCombineImpl(F, DT, MemSSA, TTI, *TM).run();
return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
@@ -1360,6 +1363,7 @@ INITIALIZE_PASS_BEGIN(
false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(
InterleavedLoadCombine, DEBUG_TYPE,
"Combine interleaved loads into wide loads and shufflevector instructions",
diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp
index 643370f..7b7b545 100644
--- a/llvm/lib/CodeGen/LiveRangeEdit.cpp
+++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp
@@ -414,7 +414,7 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) {
DeadRemats->insert(MI);
const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
MI->substituteRegister(Dest, NewLI.reg(), 0, TRI);
- MI->getOperand(0).setIsDead(true);
+ assert(MI->registerDefIsDead(NewLI.reg(), &TRI));
} else {
if (TheDelegate)
TheDelegate->LRE_WillEraseInstruction(MI);
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 78d581c..03e892a5 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1664,7 +1664,8 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) {
if (ShouldTrackPressure) {
// Update top scheduled pressure.
RegisterOperands RegOpers;
- RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
+ RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks,
+ /*IgnoreDead=*/false);
if (ShouldTrackLaneMasks) {
// Adjust liveness and add missing dead+read-undef flags.
SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
@@ -1698,7 +1699,8 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) {
}
if (ShouldTrackPressure) {
RegisterOperands RegOpers;
- RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
+ RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks,
+ /*IgnoreDead=*/false);
if (ShouldTrackLaneMasks) {
// Adjust liveness and add missing dead+read-undef flags.
SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
@@ -3775,6 +3777,21 @@ SUnit *GenericScheduler::pickNode(bool &IsTopNode) {
}
} while (SU->isScheduled);
+ // If IsTopNode, then SU is in Top.Available and must be removed. Otherwise,
+ // if isTopReady(), then SU is in either Top.Available or Top.Pending.
+ // If !IsTopNode, then SU is in Bot.Available and must be removed. Otherwise,
+ // if isBottomReady(), then SU is in either Bot.Available or Bot.Pending.
+ //
+ // It is coincidental when !IsTopNode && isTopReady or when IsTopNode &&
+ // isBottomReady. That is, it didn't factor into the decision to choose SU
+ // because it isTopReady or isBottomReady, respectively. In fact, if the
+ // RegionPolicy is OnlyTopDown or OnlyBottomUp, then the Bot queues and Top
+ // queues respectively contain the original roots and don't get updated when
+ // picking a node. So if SU isTopReady on an OnlyBottomUp pick, then it was
+ // because we scheduled everything but the top roots. Conversely, if SU
+ // isBottomReady on OnlyTopDown, then it was because we scheduled everything
+ // but the bottom roots. If it's in a queue even coincidentally, it should be
+ // removed so it does not get re-picked in a subsequent pickNode call.
if (SU->isTopReady())
Top.removeReady(SU);
if (SU->isBottomReady())
diff --git a/llvm/lib/CodeGen/RegisterPressure.cpp b/llvm/lib/CodeGen/RegisterPressure.cpp
index 3fa2244..9a7eb49 100644
--- a/llvm/lib/CodeGen/RegisterPressure.cpp
+++ b/llvm/lib/CodeGen/RegisterPressure.cpp
@@ -873,7 +873,7 @@ void RegPressureTracker::recede(SmallVectorImpl<RegisterMaskPair> *LiveUses) {
const MachineInstr &MI = *CurrPos;
RegisterOperands RegOpers;
- RegOpers.collect(MI, *TRI, *MRI, TrackLaneMasks, false);
+ RegOpers.collect(MI, *TRI, *MRI, TrackLaneMasks, /*IgnoreDead=*/false);
if (TrackLaneMasks) {
SlotIndex SlotIdx = LIS->getInstructionIndex(*CurrPos).getRegSlot();
RegOpers.adjustLaneLiveness(*LIS, *MRI, SlotIdx);
@@ -1041,7 +1041,7 @@ void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) {
// Account for register pressure similar to RegPressureTracker::recede().
RegisterOperands RegOpers;
RegOpers.collect(*MI, *TRI, *MRI, TrackLaneMasks, /*IgnoreDead=*/true);
- assert(RegOpers.DeadDefs.size() == 0);
+ assert(RegOpers.DeadDefs.empty());
if (TrackLaneMasks)
RegOpers.adjustLaneLiveness(*LIS, *MRI, SlotIdx);
else if (RequireIntervals)
@@ -1290,7 +1290,7 @@ void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) {
// Account for register pressure similar to RegPressureTracker::recede().
RegisterOperands RegOpers;
- RegOpers.collect(*MI, *TRI, *MRI, TrackLaneMasks, false);
+ RegOpers.collect(*MI, *TRI, *MRI, TrackLaneMasks, /*IgnoreDead=*/false);
if (TrackLaneMasks)
RegOpers.adjustLaneLiveness(*LIS, *MRI, SlotIdx);
diff --git a/llvm/lib/CodeGen/ScheduleDAG.cpp b/llvm/lib/CodeGen/ScheduleDAG.cpp
index de8e6f6..8d9a504 100644
--- a/llvm/lib/CodeGen/ScheduleDAG.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAG.cpp
@@ -331,8 +331,10 @@ void SUnit::biasCriticalPath() {
unsigned MaxDepth = BestI->getSUnit()->getDepth();
for (SUnit::pred_iterator I = std::next(BestI), E = Preds.end(); I != E;
++I) {
- if (I->getKind() == SDep::Data && I->getSUnit()->getDepth() > MaxDepth)
+ if (I->getKind() == SDep::Data && I->getSUnit()->getDepth() > MaxDepth) {
+ MaxDepth = I->getSUnit()->getDepth();
BestI = I;
+ }
}
if (BestI != Preds.begin())
std::swap(*Preds.begin(), *BestI);
diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp
index 2e03ae6..0a5f0a8 100644
--- a/llvm/lib/CodeGen/SelectOptimize.cpp
+++ b/llvm/lib/CodeGen/SelectOptimize.cpp
@@ -130,7 +130,11 @@ public:
class SelectLike {
SelectLike(Instruction *I) : I(I) {}
+ /// The select (/or) instruction.
Instruction *I;
+ /// Whether this select is inverted, "not(cond), FalseVal, TrueVal", as
+ /// opposed to the original condition.
+ bool Inverted = false;
public:
/// Match a select or select-like instruction, returning a SelectLike.
@@ -153,14 +157,22 @@ public:
bool isValid() { return I; }
operator bool() { return isValid(); }
+ /// Invert the select by inverting the condition and switching the operands.
+ void setInverted() {
+ assert(!Inverted && "Trying to invert an inverted SelectLike");
+ assert(isa<Instruction>(getCondition()) &&
+ cast<Instruction>(getCondition())->getOpcode() ==
+ Instruction::Xor);
+ Inverted = true;
+ }
+ bool isInverted() const { return Inverted; }
+
Instruction *getI() { return I; }
const Instruction *getI() const { return I; }
Type *getType() const { return I->getType(); }
- /// Return the condition for the SelectLike instruction. For example the
- /// condition of a select or c in `or(zext(c), x)`
- Value *getCondition() const {
+ Value *getNonInvertedCondition() const {
if (auto *Sel = dyn_cast<SelectInst>(I))
return Sel->getCondition();
// Or(zext) case
@@ -177,11 +189,24 @@ public:
llvm_unreachable("Unhandled case in getCondition");
}
+ /// Return the condition for the SelectLike instruction. For example the
+ /// condition of a select or c in `or(zext(c), x)`
+ Value *getCondition() const {
+ Value *CC = getNonInvertedCondition();
+ // For inverted conditions the CC is checked when created to be a not
+ // (xor) instruction.
+ if (Inverted)
+ return cast<Instruction>(CC)->getOperand(0);
+ return CC;
+ }
+
/// Return the true value for the SelectLike instruction. Note this may not
/// exist for all SelectLike instructions. For example, for `or(zext(c), x)`
/// the true value would be `or(x,1)`. As this value does not exist, nullptr
/// is returned.
- Value *getTrueValue() const {
+ Value *getTrueValue(bool HonorInverts = true) const {
+ if (Inverted && HonorInverts)
+ return getFalseValue(/*HonorInverts=*/false);
if (auto *Sel = dyn_cast<SelectInst>(I))
return Sel->getTrueValue();
// Or(zext) case - The true value is Or(X), so return nullptr as the value
@@ -195,7 +220,9 @@ public:
/// Return the false value for the SelectLike instruction. For example the
/// getFalseValue of a select or `x` in `or(zext(c), x)` (which is
/// `select(c, x|1, x)`)
- Value *getFalseValue() const {
+ Value *getFalseValue(bool HonorInverts = true) const {
+ if (Inverted && HonorInverts)
+ return getTrueValue(/*HonorInverts=*/false);
if (auto *Sel = dyn_cast<SelectInst>(I))
return Sel->getFalseValue();
// Or(zext) case - return the operand which is not the zext.
@@ -216,8 +243,8 @@ public:
/// InstCostMap. This may need to be generated for select-like instructions.
Scaled64 getTrueOpCost(DenseMap<const Instruction *, CostInfo> &InstCostMap,
const TargetTransformInfo *TTI) {
- if (auto *Sel = dyn_cast<SelectInst>(I))
- if (auto *I = dyn_cast<Instruction>(Sel->getTrueValue()))
+ if (isa<SelectInst>(I))
+ if (auto *I = dyn_cast<Instruction>(getTrueValue()))
return InstCostMap.contains(I) ? InstCostMap[I].NonPredCost
: Scaled64::getZero();
@@ -242,8 +269,8 @@ public:
Scaled64
getFalseOpCost(DenseMap<const Instruction *, CostInfo> &InstCostMap,
const TargetTransformInfo *TTI) {
- if (auto *Sel = dyn_cast<SelectInst>(I))
- if (auto *I = dyn_cast<Instruction>(Sel->getFalseValue()))
+ if (isa<SelectInst>(I))
+ if (auto *I = dyn_cast<Instruction>(getFalseValue()))
return InstCostMap.contains(I) ? InstCostMap[I].NonPredCost
: Scaled64::getZero();
@@ -510,9 +537,10 @@ getTrueOrFalseValue(SelectOptimizeImpl::SelectLike SI, bool isTrue,
for (SelectInst *DefSI = dyn_cast<SelectInst>(SI.getI());
DefSI != nullptr && Selects.count(DefSI);
DefSI = dyn_cast<SelectInst>(V)) {
- assert(DefSI->getCondition() == SI.getCondition() &&
- "The condition of DefSI does not match with SI");
- V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue());
+ if (DefSI->getCondition() == SI.getCondition())
+ V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue());
+ else // Handle inverted SI
+ V = (!isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue());
}
if (isa<BinaryOperator>(SI.getI())) {
@@ -632,18 +660,19 @@ void SelectOptimizeImpl::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
// Delete the unconditional branch that was just created by the split.
StartBlock->getTerminator()->eraseFromParent();
- // Move any debug/pseudo instructions that were in-between the select
- // group to the newly-created end block.
- SmallVector<Instruction *, 2> DebugPseudoINS;
+ // Move any debug/pseudo instructions and "not" instructions that were
+ // in-between the select group to the newly-created end block.
+ SmallVector<Instruction *, 2> SinkInstrs;
auto DIt = SI.getI()->getIterator();
while (&*DIt != LastSI.getI()) {
if (DIt->isDebugOrPseudoInst())
- DebugPseudoINS.push_back(&*DIt);
+ SinkInstrs.push_back(&*DIt);
+ if (match(&*DIt, m_Not(m_Specific(SI.getCondition()))))
+ SinkInstrs.push_back(&*DIt);
DIt++;
}
- for (auto *DI : DebugPseudoINS) {
+ for (auto *DI : SinkInstrs)
DI->moveBeforePreserving(&*EndBlock->getFirstInsertionPt());
- }
// Duplicate implementation for DbgRecords, the non-instruction debug-info
// format. Helper lambda for moving DbgRecords to the end block.
@@ -765,6 +794,13 @@ void SelectOptimizeImpl::collectSelectGroups(BasicBlock &BB,
++BBIt;
continue;
}
+
+ // Skip not(select(..)), if the not is part of the same select group
+ if (match(NI, m_Not(m_Specific(SI.getCondition())))) {
+ ++BBIt;
+ continue;
+ }
+
// We only allow selects in the same group, not other select-like
// instructions.
if (!isa<SelectInst>(NI))
@@ -773,6 +809,10 @@ void SelectOptimizeImpl::collectSelectGroups(BasicBlock &BB,
SelectLike NSI = SelectLike::match(NI);
if (NSI && SI.getCondition() == NSI.getCondition()) {
SIGroup.push_back(NSI);
+ } else if (NSI && match(NSI.getCondition(),
+ m_Not(m_Specific(SI.getCondition())))) {
+ NSI.setInverted();
+ SIGroup.push_back(NSI);
} else
break;
++BBIt;
@@ -783,6 +823,12 @@ void SelectOptimizeImpl::collectSelectGroups(BasicBlock &BB,
if (!isSelectKindSupported(SI))
continue;
+ LLVM_DEBUG({
+ dbgs() << "New Select group with\n";
+ for (auto SI : SIGroup)
+ dbgs() << " " << *SI.getI() << "\n";
+ });
+
SIGroups.push_back(SIGroup);
}
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 8607b50..93d8663 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -10745,6 +10745,7 @@ SDValue DAGCombiner::visitFunnelShift(SDNode *N) {
SDValue N2 = N->getOperand(2);
bool IsFSHL = N->getOpcode() == ISD::FSHL;
unsigned BitWidth = VT.getScalarSizeInBits();
+ SDLoc DL(N);
// fold (fshl N0, N1, 0) -> N0
// fold (fshr N0, N1, 0) -> N1
@@ -10764,8 +10765,8 @@ SDValue DAGCombiner::visitFunnelShift(SDNode *N) {
// fold (fsh* N0, N1, c) -> (fsh* N0, N1, c % BitWidth)
if (Cst->getAPIntValue().uge(BitWidth)) {
uint64_t RotAmt = Cst->getAPIntValue().urem(BitWidth);
- return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N0, N1,
- DAG.getConstant(RotAmt, SDLoc(N), ShAmtTy));
+ return DAG.getNode(N->getOpcode(), DL, VT, N0, N1,
+ DAG.getConstant(RotAmt, DL, ShAmtTy));
}
unsigned ShAmt = Cst->getZExtValue();
@@ -10777,13 +10778,13 @@ SDValue DAGCombiner::visitFunnelShift(SDNode *N) {
// fold fshl(N0, undef_or_zero, C) -> shl(N0, C)
// fold fshr(N0, undef_or_zero, C) -> shl(N0, BW-C)
if (IsUndefOrZero(N0))
- return DAG.getNode(ISD::SRL, SDLoc(N), VT, N1,
- DAG.getConstant(IsFSHL ? BitWidth - ShAmt : ShAmt,
- SDLoc(N), ShAmtTy));
+ return DAG.getNode(
+ ISD::SRL, DL, VT, N1,
+ DAG.getConstant(IsFSHL ? BitWidth - ShAmt : ShAmt, DL, ShAmtTy));
if (IsUndefOrZero(N1))
- return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0,
- DAG.getConstant(IsFSHL ? ShAmt : BitWidth - ShAmt,
- SDLoc(N), ShAmtTy));
+ return DAG.getNode(
+ ISD::SHL, DL, VT, N0,
+ DAG.getConstant(IsFSHL ? ShAmt : BitWidth - ShAmt, DL, ShAmtTy));
// fold (fshl ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are consecutive.
// fold (fshr ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are consecutive.
@@ -10832,18 +10833,19 @@ SDValue DAGCombiner::visitFunnelShift(SDNode *N) {
if (isPowerOf2_32(BitWidth)) {
APInt ModuloBits(N2.getScalarValueSizeInBits(), BitWidth - 1);
if (IsUndefOrZero(N0) && !IsFSHL && DAG.MaskedValueIsZero(N2, ~ModuloBits))
- return DAG.getNode(ISD::SRL, SDLoc(N), VT, N1, N2);
+ return DAG.getNode(ISD::SRL, DL, VT, N1, N2);
if (IsUndefOrZero(N1) && IsFSHL && DAG.MaskedValueIsZero(N2, ~ModuloBits))
- return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, N2);
+ return DAG.getNode(ISD::SHL, DL, VT, N0, N2);
}
// fold (fshl N0, N0, N2) -> (rotl N0, N2)
// fold (fshr N0, N0, N2) -> (rotr N0, N2)
- // TODO: Investigate flipping this rotate if only one is legal, if funnel shift
- // is legal as well we might be better off avoiding non-constant (BW - N2).
+ // TODO: Investigate flipping this rotate if only one is legal.
+ // If funnel shift is legal as well we might be better off avoiding
+ // non-constant (BW - N2).
unsigned RotOpc = IsFSHL ? ISD::ROTL : ISD::ROTR;
if (N0 == N1 && hasOperation(RotOpc, VT))
- return DAG.getNode(RotOpc, SDLoc(N), VT, N0, N2);
+ return DAG.getNode(RotOpc, DL, VT, N0, N2);
// Simplify, based on bits shifted out of N0/N1.
if (SimplifyDemandedBits(SDValue(N, 0)))
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 759368a..3673896 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -1412,6 +1412,13 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
}
}
+ // Add rounding control registers as implicit def for inline asm.
+ if (MF->getFunction().hasFnAttribute(Attribute::StrictFP)) {
+ ArrayRef<MCPhysReg> RCRegs = TLI->getRoundingControlRegisters();
+ for (MCPhysReg Reg : RCRegs)
+ MIB.addReg(Reg, RegState::ImplicitDefine);
+ }
+
// GCC inline assembly allows input operands to also be early-clobber
// output operands (so long as the operand is written only after it's
// used), but this does not match the semantics of our early-clobber flag.
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index c64e27f..8fda35f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -107,9 +107,9 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::SIGN_EXTEND_INREG:
Res = PromoteIntRes_SIGN_EXTEND_INREG(N); break;
case ISD::SRA:
- case ISD::VP_ASHR: Res = PromoteIntRes_SRA(N); break;
+ case ISD::VP_SRA: Res = PromoteIntRes_SRA(N); break;
case ISD::SRL:
- case ISD::VP_LSHR: Res = PromoteIntRes_SRL(N); break;
+ case ISD::VP_SRL: Res = PromoteIntRes_SRL(N); break;
case ISD::VP_TRUNCATE:
case ISD::TRUNCATE: Res = PromoteIntRes_TRUNCATE(N); break;
case ISD::UNDEF: Res = PromoteIntRes_UNDEF(N); break;
@@ -573,7 +573,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BSWAP(SDNode *N) {
ShAmt);
SDValue Mask = N->getOperand(1);
SDValue EVL = N->getOperand(2);
- return DAG.getNode(ISD::VP_LSHR, dl, NVT,
+ return DAG.getNode(ISD::VP_SRL, dl, NVT,
DAG.getNode(ISD::VP_BSWAP, dl, NVT, Op, Mask, EVL), ShAmt,
Mask, EVL);
}
@@ -601,7 +601,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BITREVERSE(SDNode *N) {
DAG.getNode(ISD::BITREVERSE, dl, NVT, Op), ShAmt);
SDValue Mask = N->getOperand(1);
SDValue EVL = N->getOperand(2);
- return DAG.getNode(ISD::VP_LSHR, dl, NVT,
+ return DAG.getNode(ISD::VP_SRL, dl, NVT,
DAG.getNode(ISD::VP_BITREVERSE, dl, NVT, Op, Mask, EVL),
ShAmt, Mask, EVL);
}
@@ -1405,7 +1405,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SRA(SDNode *N) {
SDValue RHS = N->getOperand(1);
if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
RHS = ZExtPromotedInteger(RHS);
- if (N->getOpcode() != ISD::VP_ASHR)
+ if (N->getOpcode() != ISD::VP_SRA)
return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS);
return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS,
N->getOperand(2), N->getOperand(3));
@@ -1417,7 +1417,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SRL(SDNode *N) {
SDValue RHS = N->getOperand(1);
if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
RHS = ZExtPromotedInteger(RHS);
- if (N->getOpcode() != ISD::VP_LSHR)
+ if (N->getOpcode() != ISD::VP_SRL)
return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS);
return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS,
N->getOperand(2), N->getOperand(3));
@@ -1513,10 +1513,10 @@ SDValue DAGTypeLegalizer::PromoteIntRes_VPFunnelShift(SDNode *N) {
Hi = DAG.getNode(ISD::VP_SHL, DL, VT, Hi, HiShift, Mask, EVL);
Lo = DAG.getVPZeroExtendInReg(Lo, Mask, EVL, DL, OldVT);
SDValue Res = DAG.getNode(ISD::VP_OR, DL, VT, Hi, Lo, Mask, EVL);
- Res = DAG.getNode(IsFSHR ? ISD::VP_LSHR : ISD::VP_SHL, DL, VT, Res, Amt,
+ Res = DAG.getNode(IsFSHR ? ISD::VP_SRL : ISD::VP_SHL, DL, VT, Res, Amt,
Mask, EVL);
if (!IsFSHR)
- Res = DAG.getNode(ISD::VP_LSHR, DL, VT, Res, HiShift, Mask, EVL);
+ Res = DAG.getNode(ISD::VP_SRL, DL, VT, Res, HiShift, Mask, EVL);
return Res;
}
@@ -2212,7 +2212,7 @@ SDValue DAGTypeLegalizer::PromoteIntOp_VP_SIGN_EXTEND(SDNode *N) {
// FIXME: There is no VP_SIGN_EXTEND_INREG so use a pair of shifts.
SDValue Shl = DAG.getNode(ISD::VP_SHL, dl, VT, Op, ShAmt, N->getOperand(1),
N->getOperand(2));
- return DAG.getNode(ISD::VP_ASHR, dl, VT, Shl, ShAmt, N->getOperand(1),
+ return DAG.getNode(ISD::VP_SRA, dl, VT, Shl, ShAmt, N->getOperand(1),
N->getOperand(2));
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index ec05135..40e621f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1188,8 +1188,8 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::OR: case ISD::VP_OR:
case ISD::XOR: case ISD::VP_XOR:
case ISD::SHL: case ISD::VP_SHL:
- case ISD::SRA: case ISD::VP_ASHR:
- case ISD::SRL: case ISD::VP_LSHR:
+ case ISD::SRA: case ISD::VP_SRA:
+ case ISD::SRL: case ISD::VP_SRL:
case ISD::UREM: case ISD::VP_UREM:
case ISD::SREM: case ISD::VP_SREM:
case ISD::FREM: case ISD::VP_FREM:
@@ -4235,8 +4235,8 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
case ISD::SUB: case ISD::VP_SUB:
case ISD::XOR: case ISD::VP_XOR:
case ISD::SHL: case ISD::VP_SHL:
- case ISD::SRA: case ISD::VP_ASHR:
- case ISD::SRL: case ISD::VP_LSHR:
+ case ISD::SRA: case ISD::VP_SRA:
+ case ISD::SRL: case ISD::VP_SRL:
case ISD::FMINNUM: case ISD::VP_FMINNUM:
case ISD::FMAXNUM: case ISD::VP_FMAXNUM:
case ISD::FMINIMUM:
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 777bbf0..b05649c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -4780,6 +4780,13 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
(VTBits - SignBitsOp0 + 1) + (VTBits - SignBitsOp1 + 1);
return OutValidBits > VTBits ? 1 : VTBits - OutValidBits + 1;
}
+ case ISD::AVGCEILS:
+ case ISD::AVGFLOORS:
+ Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ if (Tmp == 1)
+ return 1; // Early out.
+ Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ return std::min(Tmp, Tmp2);
case ISD::SREM:
// The sign bit is the LHS's sign bit, except when the result of the
// remainder is zero. The magnitude of the result should be less than or
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 3ec6b9b..be7bcc5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -587,6 +587,10 @@ bool TargetLowering::ShrinkDemandedOp(SDValue Op, unsigned BitWidth,
if (VT.isVector())
return false;
+ assert(Op.getOperand(0).getValueType().getScalarSizeInBits() == BitWidth &&
+ Op.getOperand(1).getValueType().getScalarSizeInBits() == BitWidth &&
+ "ShrinkDemandedOp only supports operands that have the same size!");
+
// Don't do this if the node has another user, which may require the
// full value.
if (!Op.getNode()->hasOneUse())
@@ -1832,11 +1836,33 @@ bool TargetLowering::SimplifyDemandedBits(
}
}
+ // TODO: Can we merge this fold with the one below?
// Try shrinking the operation as long as the shift amount will still be
// in range.
- if ((ShAmt < DemandedBits.getActiveBits()) &&
- ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO))
- return true;
+ if (ShAmt < DemandedBits.getActiveBits() && !VT.isVector() &&
+ Op.getNode()->hasOneUse()) {
+ // Search for the smallest integer type with free casts to and from
+ // Op's type. For expedience, just check power-of-2 integer types.
+ unsigned DemandedSize = DemandedBits.getActiveBits();
+ for (unsigned SmallVTBits = llvm::bit_ceil(DemandedSize);
+ SmallVTBits < BitWidth; SmallVTBits = NextPowerOf2(SmallVTBits)) {
+ EVT SmallVT = EVT::getIntegerVT(*TLO.DAG.getContext(), SmallVTBits);
+ if (isNarrowingProfitable(VT, SmallVT) &&
+ isTypeDesirableForOp(ISD::SHL, SmallVT) &&
+ isTruncateFree(VT, SmallVT) && isZExtFree(SmallVT, VT) &&
+ (!TLO.LegalOperations() || isOperationLegal(ISD::SHL, SmallVT))) {
+ assert(DemandedSize <= SmallVTBits &&
+ "Narrowed below demanded bits?");
+ // We found a type with free casts.
+ SDValue NarrowShl = TLO.DAG.getNode(
+ ISD::SHL, dl, SmallVT,
+ TLO.DAG.getNode(ISD::TRUNCATE, dl, SmallVT, Op.getOperand(0)),
+ TLO.DAG.getShiftAmountConstant(ShAmt, SmallVT, dl));
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(ISD::ANY_EXTEND, dl, VT, NarrowShl));
+ }
+ }
+ }
// Narrow shift to lower half - similar to ShrinkDemandedOp.
// (shl i64:x, K) -> (i64 zero_extend (shl (i32 (trunc i64:x)), K))
@@ -1908,11 +1934,6 @@ bool TargetLowering::SimplifyDemandedBits(
SDValue Op1 = Op.getOperand(1);
EVT ShiftVT = Op1.getValueType();
- // Try to match AVG patterns.
- if (SDValue AVG = combineShiftToAVG(Op, TLO.DAG, *this, DemandedBits,
- DemandedElts, Depth + 1))
- return TLO.CombineTo(Op, AVG);
-
KnownBits KnownSA = TLO.DAG.computeKnownBits(Op1, DemandedElts, Depth + 1);
if (KnownSA.isConstant() && KnownSA.getConstant().ult(BitWidth)) {
unsigned ShAmt = KnownSA.getConstant().getZExtValue();
@@ -1994,6 +2015,12 @@ bool TargetLowering::SimplifyDemandedBits(
// shift amounts.
Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
}
+
+ // Try to match AVG patterns (after shift simplification).
+ if (SDValue AVG = combineShiftToAVG(Op, TLO.DAG, *this, DemandedBits,
+ DemandedElts, Depth + 1))
+ return TLO.CombineTo(Op, AVG);
+
break;
}
case ISD::SRA: {
@@ -2015,11 +2042,6 @@ bool TargetLowering::SimplifyDemandedBits(
if (DemandedBits.isOne())
return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT, Op0, Op1));
- // Try to match AVG patterns.
- if (SDValue AVG = combineShiftToAVG(Op, TLO.DAG, *this, DemandedBits,
- DemandedElts, Depth + 1))
- return TLO.CombineTo(Op, AVG);
-
KnownBits KnownSA = TLO.DAG.computeKnownBits(Op1, DemandedElts, Depth + 1);
if (KnownSA.isConstant() && KnownSA.getConstant().ult(BitWidth)) {
unsigned ShAmt = KnownSA.getConstant().getZExtValue();
@@ -2106,6 +2128,12 @@ bool TargetLowering::SimplifyDemandedBits(
}
}
}
+
+ // Try to match AVG patterns (after shift simplification).
+ if (SDValue AVG = combineShiftToAVG(Op, TLO.DAG, *this, DemandedBits,
+ DemandedElts, Depth + 1))
+ return TLO.CombineTo(Op, AVG);
+
break;
}
case ISD::FSHL:
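
The AVG match now runs after the generic shift simplification. The idiom it recognizes is the usual widen-add-shift sequence, which never overflows the narrow type; a scalar check of that equivalence (standalone sketch, not from the patch):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t A = 0; A < 256; ++A)
    for (uint32_t B = 0; B < 256; ++B) {
      // Widen to i16, add, shift right by one, truncate back to i8 ...
      uint8_t Widened = (uint8_t)(((uint16_t)A + (uint16_t)B) >> 1);
      // ... is exactly an unsigned floor-average (ISD::AVGFLOORU semantics).
      uint8_t HalvingAdd = (uint8_t)((A & B) + ((A ^ B) >> 1));
      assert(Widened == HalvingAdd);
    }
  return 0;
}
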
@@ -2786,10 +2814,16 @@ bool TargetLowering::SimplifyDemandedBits(
unsigned DemandedBitsLZ = DemandedBits.countl_zero();
APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
KnownBits KnownOp0, KnownOp1;
- if (SimplifyDemandedBits(Op0, LoMask, DemandedElts, KnownOp0, TLO,
- Depth + 1) ||
- SimplifyDemandedBits(Op1, LoMask, DemandedElts, KnownOp1, TLO,
+ auto GetDemandedBitsLHSMask = [&](APInt Demanded,
+ const KnownBits &KnownRHS) {
+ if (Op.getOpcode() == ISD::MUL)
+ Demanded.clearHighBits(KnownRHS.countMinTrailingZeros());
+ return Demanded;
+ };
+ if (SimplifyDemandedBits(Op1, LoMask, DemandedElts, KnownOp1, TLO,
Depth + 1) ||
+ SimplifyDemandedBits(Op0, GetDemandedBitsLHSMask(LoMask, KnownOp1),
+ DemandedElts, KnownOp0, TLO, Depth + 1) ||
// See if the operation should be performed at a smaller bit width.
ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO)) {
if (Flags.hasNoSignedWrap() || Flags.hasNoUnsignedWrap()) {
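
GetDemandedBitsLHSMask relies on a simple multiplication fact: if the RHS is known to have k trailing zero bits, the top k bits of the LHS cannot reach any bit of a same-width product, so they need not be demanded from the LHS. A small standalone check (values are arbitrary):

#include <cassert>
#include <cstdint>

int main() {
  uint8_t Lhs = 0b10110101;
  uint8_t Rhs = 0b00001100;            // known to have k = 2 trailing zeros
  uint8_t LhsTopCleared = Lhs & 0x3f;  // clear the top k = 2 bits of the LHS

  // Both products agree in all 8 bits, so the cleared bits were never needed.
  assert((uint8_t)(Lhs * Rhs) == (uint8_t)(LhsTopCleared * Rhs));
  return 0;
}
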
@@ -7855,7 +7889,7 @@ static SDValue expandVPFunnelShift(SDNode *Node, SelectionDAG &DAG) {
InvShAmt = DAG.getNode(ISD::VP_SUB, DL, ShVT, BitWidthC, ShAmt, Mask, VL);
ShX = DAG.getNode(ISD::VP_SHL, DL, VT, X, IsFSHL ? ShAmt : InvShAmt, Mask,
VL);
- ShY = DAG.getNode(ISD::VP_LSHR, DL, VT, Y, IsFSHL ? InvShAmt : ShAmt, Mask,
+ ShY = DAG.getNode(ISD::VP_SRL, DL, VT, Y, IsFSHL ? InvShAmt : ShAmt, Mask,
VL);
} else {
// fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
@@ -7877,12 +7911,12 @@ static SDValue expandVPFunnelShift(SDNode *Node, SelectionDAG &DAG) {
SDValue One = DAG.getConstant(1, DL, ShVT);
if (IsFSHL) {
ShX = DAG.getNode(ISD::VP_SHL, DL, VT, X, ShAmt, Mask, VL);
- SDValue ShY1 = DAG.getNode(ISD::VP_LSHR, DL, VT, Y, One, Mask, VL);
- ShY = DAG.getNode(ISD::VP_LSHR, DL, VT, ShY1, InvShAmt, Mask, VL);
+ SDValue ShY1 = DAG.getNode(ISD::VP_SRL, DL, VT, Y, One, Mask, VL);
+ ShY = DAG.getNode(ISD::VP_SRL, DL, VT, ShY1, InvShAmt, Mask, VL);
} else {
SDValue ShX1 = DAG.getNode(ISD::VP_SHL, DL, VT, X, One, Mask, VL);
ShX = DAG.getNode(ISD::VP_SHL, DL, VT, ShX1, InvShAmt, Mask, VL);
- ShY = DAG.getNode(ISD::VP_LSHR, DL, VT, Y, ShAmt, Mask, VL);
+ ShY = DAG.getNode(ISD::VP_SRL, DL, VT, Y, ShAmt, Mask, VL);
}
}
return DAG.getNode(ISD::VP_OR, DL, VT, ShX, ShY, Mask, VL);
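
The VP_LSHR -> VP_SRL update does not change the expansion itself, which still follows the comment above: split Y's right shift into ">> 1" followed by ">> (BW - 1 - s)" so the amount stays in range even when s is zero. A scalar model of the fshl case (standalone sketch):

#include <cassert>
#include <cstdint>

// fshl(X, Y, Z): the high 32 bits of the 64-bit value X:Y rotated left by Z % 32.
static uint32_t fshl32(uint32_t X, uint32_t Y, uint32_t Z) {
  unsigned S = Z % 32;
  return (X << S) | ((Y >> 1) >> (32 - 1 - S)); // never shifts by the full width
}

int main() {
  assert(fshl32(0x12345678u, 0x9abcdef0u, 8) == 0x3456789au);
  assert(fshl32(0x12345678u, 0x9abcdef0u, 0) == 0x12345678u); // S == 0 stays defined
  return 0;
}
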
@@ -8849,7 +8883,7 @@ SDValue TargetLowering::expandVPCTPOP(SDNode *Node, SelectionDAG &DAG) const {
// v = v - ((v >> 1) & 0x55555555...)
Tmp1 = DAG.getNode(ISD::VP_AND, dl, VT,
- DAG.getNode(ISD::VP_LSHR, dl, VT, Op,
+ DAG.getNode(ISD::VP_SRL, dl, VT, Op,
DAG.getConstant(1, dl, ShVT), Mask, VL),
Mask55, Mask, VL);
Op = DAG.getNode(ISD::VP_SUB, dl, VT, Op, Tmp1, Mask, VL);
@@ -8857,13 +8891,13 @@ SDValue TargetLowering::expandVPCTPOP(SDNode *Node, SelectionDAG &DAG) const {
// v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
Tmp2 = DAG.getNode(ISD::VP_AND, dl, VT, Op, Mask33, Mask, VL);
Tmp3 = DAG.getNode(ISD::VP_AND, dl, VT,
- DAG.getNode(ISD::VP_LSHR, dl, VT, Op,
+ DAG.getNode(ISD::VP_SRL, dl, VT, Op,
DAG.getConstant(2, dl, ShVT), Mask, VL),
Mask33, Mask, VL);
Op = DAG.getNode(ISD::VP_ADD, dl, VT, Tmp2, Tmp3, Mask, VL);
// v = (v + (v >> 4)) & 0x0F0F0F0F...
- Tmp4 = DAG.getNode(ISD::VP_LSHR, dl, VT, Op, DAG.getConstant(4, dl, ShVT),
+ Tmp4 = DAG.getNode(ISD::VP_SRL, dl, VT, Op, DAG.getConstant(4, dl, ShVT),
Mask, VL),
Tmp5 = DAG.getNode(ISD::VP_ADD, dl, VT, Op, Tmp4, Mask, VL);
Op = DAG.getNode(ISD::VP_AND, dl, VT, Tmp5, Mask0F, Mask, VL);
@@ -8887,8 +8921,8 @@ SDValue TargetLowering::expandVPCTPOP(SDNode *Node, SelectionDAG &DAG) const {
Mask, VL);
}
}
- return DAG.getNode(ISD::VP_LSHR, dl, VT, V,
- DAG.getConstant(Len - 8, dl, ShVT), Mask, VL);
+ return DAG.getNode(ISD::VP_SRL, dl, VT, V, DAG.getConstant(Len - 8, dl, ShVT),
+ Mask, VL);
}
SDValue TargetLowering::expandCTLZ(SDNode *Node, SelectionDAG &DAG) const {
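
The opcode updates above leave the CTPOP expansion itself untouched; for reference, the same masked-shift steps on a scalar value (standalone sketch):

#include <cassert>
#include <cstdint>

static unsigned popcount32(uint32_t V) {
  V = V - ((V >> 1) & 0x55555555u);                 // v - ((v >> 1) & 0x5555...)
  V = (V & 0x33333333u) + ((V >> 2) & 0x33333333u); // 2-bit partial sums
  V = (V + (V >> 4)) & 0x0F0F0F0Fu;                 // 4-bit sums, masked per byte
  return (V * 0x01010101u) >> 24;                   // add byte sums; the Len - 8 shift
}

int main() {
  assert(popcount32(0) == 0);
  assert(popcount32(0xFFFFFFFFu) == 32);
  assert(popcount32(0x12345678u) == 13);
  return 0;
}
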
@@ -8960,7 +8994,7 @@ SDValue TargetLowering::expandVPCTLZ(SDNode *Node, SelectionDAG &DAG) const {
for (unsigned i = 0; (1U << i) < NumBitsPerElt; ++i) {
SDValue Tmp = DAG.getConstant(1ULL << i, dl, ShVT);
Op = DAG.getNode(ISD::VP_OR, dl, VT, Op,
- DAG.getNode(ISD::VP_LSHR, dl, VT, Op, Tmp, Mask, VL), Mask,
+ DAG.getNode(ISD::VP_SRL, dl, VT, Op, Tmp, Mask, VL), Mask,
VL);
}
Op = DAG.getNode(ISD::VP_XOR, dl, VT, Op, DAG.getConstant(-1, dl, VT), Mask,
@@ -9194,11 +9228,21 @@ SDValue TargetLowering::expandABD(SDNode *N, SelectionDAG &DAG) const {
DAG.getNode(ISD::USUBSAT, dl, VT, LHS, RHS),
DAG.getNode(ISD::USUBSAT, dl, VT, RHS, LHS));
- // abds(lhs, rhs) -> select(sgt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
- // abdu(lhs, rhs) -> select(ugt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
ISD::CondCode CC = IsSigned ? ISD::CondCode::SETGT : ISD::CondCode::SETUGT;
SDValue Cmp = DAG.getSetCC(dl, CCVT, LHS, RHS, CC);
+
+ // Branchless expansion iff cmp result is allbits:
+ // abds(lhs, rhs) -> sub(sgt(lhs, rhs), xor(sgt(lhs, rhs), sub(lhs, rhs)))
+ // abdu(lhs, rhs) -> sub(ugt(lhs, rhs), xor(ugt(lhs, rhs), sub(lhs, rhs)))
+ if (CCVT == VT && getBooleanContents(VT) == ZeroOrNegativeOneBooleanContent) {
+ SDValue Diff = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
+ SDValue Xor = DAG.getNode(ISD::XOR, dl, VT, Diff, Cmp);
+ return DAG.getNode(ISD::SUB, dl, VT, Cmp, Xor);
+ }
+
+ // abds(lhs, rhs) -> select(sgt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
+ // abdu(lhs, rhs) -> select(ugt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
return DAG.getSelect(dl, VT, Cmp, DAG.getNode(ISD::SUB, dl, VT, LHS, RHS),
DAG.getNode(ISD::SUB, dl, VT, RHS, LHS));
}
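
The new branchless form works because Cmp is either all-ones or zero: with Diff = lhs - rhs, Cmp - (Diff ^ Cmp) gives Diff when Cmp is all-ones and -Diff when Cmp is zero, i.e. the absolute difference. A scalar check (standalone sketch; inputs chosen so the subtraction does not overflow):

#include <cassert>
#include <cstdint>

static int32_t abds(int32_t Lhs, int32_t Rhs) {
  int32_t Cmp = Lhs > Rhs ? -1 : 0;  // setcc with ZeroOrNegativeOne booleans
  int32_t Diff = Lhs - Rhs;
  return Cmp - (Diff ^ Cmp);         // sub(Cmp, xor(Cmp, Diff))
}

int main() {
  assert(abds(7, 3) == 4);
  assert(abds(3, 7) == 4);
  assert(abds(-5, 10) == 15);
  assert(abds(10, -5) == 15);
  return 0;
}
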
@@ -9279,7 +9323,7 @@ SDValue TargetLowering::expandVPBSWAP(SDNode *N, SelectionDAG &DAG) const {
case MVT::i16:
Tmp1 = DAG.getNode(ISD::VP_SHL, dl, VT, Op, DAG.getConstant(8, dl, SHVT),
Mask, EVL);
- Tmp2 = DAG.getNode(ISD::VP_LSHR, dl, VT, Op, DAG.getConstant(8, dl, SHVT),
+ Tmp2 = DAG.getNode(ISD::VP_SRL, dl, VT, Op, DAG.getConstant(8, dl, SHVT),
Mask, EVL);
return DAG.getNode(ISD::VP_OR, dl, VT, Tmp1, Tmp2, Mask, EVL);
case MVT::i32:
@@ -9289,11 +9333,11 @@ SDValue TargetLowering::expandVPBSWAP(SDNode *N, SelectionDAG &DAG) const {
Mask, EVL);
Tmp3 = DAG.getNode(ISD::VP_SHL, dl, VT, Tmp3, DAG.getConstant(8, dl, SHVT),
Mask, EVL);
- Tmp2 = DAG.getNode(ISD::VP_LSHR, dl, VT, Op, DAG.getConstant(8, dl, SHVT),
+ Tmp2 = DAG.getNode(ISD::VP_SRL, dl, VT, Op, DAG.getConstant(8, dl, SHVT),
Mask, EVL);
Tmp2 = DAG.getNode(ISD::VP_AND, dl, VT, Tmp2,
DAG.getConstant(0xFF00, dl, VT), Mask, EVL);
- Tmp1 = DAG.getNode(ISD::VP_LSHR, dl, VT, Op, DAG.getConstant(24, dl, SHVT),
+ Tmp1 = DAG.getNode(ISD::VP_SRL, dl, VT, Op, DAG.getConstant(24, dl, SHVT),
Mask, EVL);
Tmp4 = DAG.getNode(ISD::VP_OR, dl, VT, Tmp4, Tmp3, Mask, EVL);
Tmp2 = DAG.getNode(ISD::VP_OR, dl, VT, Tmp2, Tmp1, Mask, EVL);
@@ -9313,19 +9357,19 @@ SDValue TargetLowering::expandVPBSWAP(SDNode *N, SelectionDAG &DAG) const {
DAG.getConstant(255ULL << 24, dl, VT), Mask, EVL);
Tmp5 = DAG.getNode(ISD::VP_SHL, dl, VT, Tmp5, DAG.getConstant(8, dl, SHVT),
Mask, EVL);
- Tmp4 = DAG.getNode(ISD::VP_LSHR, dl, VT, Op, DAG.getConstant(8, dl, SHVT),
+ Tmp4 = DAG.getNode(ISD::VP_SRL, dl, VT, Op, DAG.getConstant(8, dl, SHVT),
Mask, EVL);
Tmp4 = DAG.getNode(ISD::VP_AND, dl, VT, Tmp4,
DAG.getConstant(255ULL << 24, dl, VT), Mask, EVL);
- Tmp3 = DAG.getNode(ISD::VP_LSHR, dl, VT, Op, DAG.getConstant(24, dl, SHVT),
+ Tmp3 = DAG.getNode(ISD::VP_SRL, dl, VT, Op, DAG.getConstant(24, dl, SHVT),
Mask, EVL);
Tmp3 = DAG.getNode(ISD::VP_AND, dl, VT, Tmp3,
DAG.getConstant(255ULL << 16, dl, VT), Mask, EVL);
- Tmp2 = DAG.getNode(ISD::VP_LSHR, dl, VT, Op, DAG.getConstant(40, dl, SHVT),
+ Tmp2 = DAG.getNode(ISD::VP_SRL, dl, VT, Op, DAG.getConstant(40, dl, SHVT),
Mask, EVL);
Tmp2 = DAG.getNode(ISD::VP_AND, dl, VT, Tmp2,
DAG.getConstant(255ULL << 8, dl, VT), Mask, EVL);
- Tmp1 = DAG.getNode(ISD::VP_LSHR, dl, VT, Op, DAG.getConstant(56, dl, SHVT),
+ Tmp1 = DAG.getNode(ISD::VP_SRL, dl, VT, Op, DAG.getConstant(56, dl, SHVT),
Mask, EVL);
Tmp8 = DAG.getNode(ISD::VP_OR, dl, VT, Tmp8, Tmp7, Mask, EVL);
Tmp6 = DAG.getNode(ISD::VP_OR, dl, VT, Tmp6, Tmp5, Mask, EVL);
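
The i32 arm of this expansion isolates each byte with a shift/mask pair and ORs the pieces back together in reverse order; a scalar model of those steps (standalone sketch):

#include <cassert>
#include <cstdint>

static uint32_t bswap32(uint32_t Op) {
  uint32_t Tmp4 = Op << 24;                 // byte 0 -> byte 3
  uint32_t Tmp3 = (Op & 0x0000FF00u) << 8;  // byte 1 -> byte 2
  uint32_t Tmp2 = (Op >> 8) & 0x0000FF00u;  // byte 2 -> byte 1
  uint32_t Tmp1 = Op >> 24;                 // byte 3 -> byte 0
  return (Tmp4 | Tmp3) | (Tmp2 | Tmp1);
}

int main() {
  assert(bswap32(0x12345678u) == 0x78563412u);
  return 0;
}
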
@@ -9424,7 +9468,7 @@ SDValue TargetLowering::expandVPBITREVERSE(SDNode *N, SelectionDAG &DAG) const {
Tmp = (Sz > 8 ? DAG.getNode(ISD::VP_BSWAP, dl, VT, Op, Mask, EVL) : Op);
// swap i4: ((V >> 4) & 0x0F) | ((V & 0x0F) << 4)
- Tmp2 = DAG.getNode(ISD::VP_LSHR, dl, VT, Tmp, DAG.getConstant(4, dl, SHVT),
+ Tmp2 = DAG.getNode(ISD::VP_SRL, dl, VT, Tmp, DAG.getConstant(4, dl, SHVT),
Mask, EVL);
Tmp2 = DAG.getNode(ISD::VP_AND, dl, VT, Tmp2,
DAG.getConstant(Mask4, dl, VT), Mask, EVL);
@@ -9435,7 +9479,7 @@ SDValue TargetLowering::expandVPBITREVERSE(SDNode *N, SelectionDAG &DAG) const {
Tmp = DAG.getNode(ISD::VP_OR, dl, VT, Tmp2, Tmp3, Mask, EVL);
// swap i2: ((V >> 2) & 0x33) | ((V & 0x33) << 2)
- Tmp2 = DAG.getNode(ISD::VP_LSHR, dl, VT, Tmp, DAG.getConstant(2, dl, SHVT),
+ Tmp2 = DAG.getNode(ISD::VP_SRL, dl, VT, Tmp, DAG.getConstant(2, dl, SHVT),
Mask, EVL);
Tmp2 = DAG.getNode(ISD::VP_AND, dl, VT, Tmp2,
DAG.getConstant(Mask2, dl, VT), Mask, EVL);
@@ -9446,7 +9490,7 @@ SDValue TargetLowering::expandVPBITREVERSE(SDNode *N, SelectionDAG &DAG) const {
Tmp = DAG.getNode(ISD::VP_OR, dl, VT, Tmp2, Tmp3, Mask, EVL);
// swap i1: ((V >> 1) & 0x55) | ((V & 0x55) << 1)
- Tmp2 = DAG.getNode(ISD::VP_LSHR, dl, VT, Tmp, DAG.getConstant(1, dl, SHVT),
+ Tmp2 = DAG.getNode(ISD::VP_SRL, dl, VT, Tmp, DAG.getConstant(1, dl, SHVT),
Mask, EVL);
Tmp2 = DAG.getNode(ISD::VP_AND, dl, VT, Tmp2,
DAG.getConstant(Mask1, dl, VT), Mask, EVL);
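
After the optional BSWAP, the expansion reverses bits within each byte by swapping nibbles, then bit pairs, then single bits; the same three steps on a plain uint8_t (standalone sketch):

#include <cassert>
#include <cstdint>

static uint8_t bitreverse8(uint8_t V) {
  V = (uint8_t)(((V >> 4) & 0x0F) | ((V & 0x0F) << 4)); // swap i4
  V = (uint8_t)(((V >> 2) & 0x33) | ((V & 0x33) << 2)); // swap i2
  V = (uint8_t)(((V >> 1) & 0x55) | ((V & 0x55) << 1)); // swap i1
  return V;
}

int main() {
  assert(bitreverse8(0b10110001) == 0b10001101);
  return 0;
}
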
diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index 3e1897c..0fc915d 100644
--- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -523,6 +523,8 @@ static unsigned getELFSectionType(StringRef Name, SectionKind K) {
if (hasPrefix(Name, ".llvm.offloading"))
return ELF::SHT_LLVM_OFFLOADING;
+ if (Name == ".llvm.lto")
+ return ELF::SHT_LLVM_LTO;
if (K.isBSS() || K.isThreadBSS())
return ELF::SHT_NOBITS;
diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp
index 58db686..3d5c58d2 100644
--- a/llvm/lib/CodeGen/ValueTypes.cpp
+++ b/llvm/lib/CodeGen/ValueTypes.cpp
@@ -579,9 +579,11 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const {
// clang-format on
}
-/// Return the value type corresponding to the specified type. This returns all
-/// pointers as MVT::iPTR. If HandleUnknown is true, unknown types are returned
-/// as Other, otherwise they are invalid.
+/// Return the value type corresponding to the specified type.
+/// If HandleUnknown is true, unknown types are returned as Other, otherwise
+/// they are invalid.
+/// NB: This includes pointer types, which require a DataLayout to convert
+/// to a concrete value type.
MVT MVT::getVT(Type *Ty, bool HandleUnknown){
assert(Ty != nullptr && "Invalid type");
switch (Ty->getTypeID()) {
@@ -611,7 +613,6 @@ MVT MVT::getVT(Type *Ty, bool HandleUnknown){
case Type::X86_AMXTyID: return MVT(MVT::x86amx);
case Type::FP128TyID: return MVT(MVT::f128);
case Type::PPC_FP128TyID: return MVT(MVT::ppcf128);
- case Type::PointerTyID: return MVT(MVT::iPTR);
case Type::FixedVectorTyID:
case Type::ScalableVectorTyID: {
VectorType *VTy = cast<VectorType>(Ty);
@@ -622,9 +623,11 @@ MVT MVT::getVT(Type *Ty, bool HandleUnknown){
}
}
-/// getEVT - Return the value type corresponding to the specified type. This
-/// returns all pointers as MVT::iPTR. If HandleUnknown is true, unknown types
-/// are returned as Other, otherwise they are invalid.
+/// getEVT - Return the value type corresponding to the specified type.
+/// If HandleUnknown is true, unknown types are returned as Other, otherwise
+/// they are invalid.
+/// NB: This includes pointer types, which require a DataLayout to convert
+/// to a concrete value type.
EVT EVT::getEVT(Type *Ty, bool HandleUnknown){
switch (Ty->getTypeID()) {
default:
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp
index 4da0317..3cdffb8 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp
@@ -75,7 +75,7 @@ Error SimpleExecutorMemoryManager::finalize(tpctypes::FinalizeRequest &FR) {
auto BailOut = [&](Error Err) {
std::pair<void *, Allocation> AllocToDestroy;
- // Get allocation to destory.
+ // Get allocation to destroy.
{
std::lock_guard<std::mutex> Lock(M);
auto I = Allocations.find(Base.toPtr<void *>());
@@ -153,7 +153,7 @@ Error SimpleExecutorMemoryManager::deallocate(
std::vector<std::pair<void *, Allocation>> AllocPairs;
AllocPairs.reserve(Bases.size());
- // Get allocation to destory.
+ // Get allocation to destroy.
Error Err = Error::success();
{
std::lock_guard<std::mutex> Lock(M);
diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp
index 59e7a9f..c3bde48 100644
--- a/llvm/lib/IR/ConstantRange.cpp
+++ b/llvm/lib/IR/ConstantRange.cpp
@@ -930,6 +930,8 @@ ConstantRange ConstantRange::overflowingBinaryOp(Instruction::BinaryOps BinOp,
return addWithNoWrap(Other, NoWrapKind);
case Instruction::Sub:
return subWithNoWrap(Other, NoWrapKind);
+ case Instruction::Mul:
+ return multiplyWithNoWrap(Other, NoWrapKind);
default:
// Don't know about this Overflowing Binary Operation.
// Conservatively fallback to plain binop handling.
@@ -1167,6 +1169,26 @@ ConstantRange::multiply(const ConstantRange &Other) const {
return UR.isSizeStrictlySmallerThan(SR) ? UR : SR;
}
+ConstantRange
+ConstantRange::multiplyWithNoWrap(const ConstantRange &Other,
+ unsigned NoWrapKind,
+ PreferredRangeType RangeType) const {
+ if (isEmptySet() || Other.isEmptySet())
+ return getEmpty();
+ if (isFullSet() && Other.isFullSet())
+ return getFull();
+
+ ConstantRange Result = multiply(Other);
+
+ if (NoWrapKind & OverflowingBinaryOperator::NoSignedWrap)
+ Result = Result.intersectWith(smul_sat(Other), RangeType);
+
+ if (NoWrapKind & OverflowingBinaryOperator::NoUnsignedWrap)
+ Result = Result.intersectWith(umul_sat(Other), RangeType);
+
+ return Result;
+}
+
ConstantRange ConstantRange::smul_fast(const ConstantRange &Other) const {
if (isEmptySet() || Other.isEmptySet())
return getEmpty();
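
Hedged usage sketch for the new multiplyWithNoWrap (made-up ranges; assumes the header declaration mirrors addWithNoWrap with a defaulted PreferredRangeType): an nuw multiply intersects the plain product range with umul_sat, so the result can only shrink.

#include <cassert>
#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Operator.h"

int main() {
  using namespace llvm;
  ConstantRange Lhs(APInt(8, 4), APInt(8, 9));    // [4, 9)
  ConstantRange Rhs(APInt(8, 20), APInt(8, 41));  // [20, 41)

  ConstantRange Wrapping = Lhs.multiply(Rhs);
  ConstantRange NoWrap =
      Lhs.multiplyWithNoWrap(Rhs, OverflowingBinaryOperator::NoUnsignedWrap);

  // The nuw range is an intersection, so it is contained in the plain one.
  assert(Wrapping.contains(NoWrap));
  return 0;
}
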
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index 0d6760e..b327993 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -43,8 +43,8 @@ using namespace llvm;
GlobalVariable *IRBuilderBase::CreateGlobalString(StringRef Str,
const Twine &Name,
unsigned AddressSpace,
- Module *M) {
- Constant *StrConstant = ConstantDataArray::getString(Context, Str);
+ Module *M, bool AddNull) {
+ Constant *StrConstant = ConstantDataArray::getString(Context, Str, AddNull);
if (!M)
M = BB->getParent()->getParent();
auto *GV = new GlobalVariable(
diff --git a/llvm/lib/IR/MDBuilder.cpp b/llvm/lib/IR/MDBuilder.cpp
index 0bf41d7..bd68db3 100644
--- a/llvm/lib/IR/MDBuilder.cpp
+++ b/llvm/lib/IR/MDBuilder.cpp
@@ -86,9 +86,8 @@ MDNode *MDBuilder::createFunctionEntryCount(
}
MDNode *MDBuilder::createFunctionSectionPrefix(StringRef Prefix) {
- return MDNode::get(Context,
- {createString("function_section_prefix"),
- createString(Prefix)});
+ return MDNode::get(
+ Context, {createString("function_section_prefix"), createString(Prefix)});
}
MDNode *MDBuilder::createRange(const APInt &Lo, const APInt &Hi) {
@@ -148,9 +147,10 @@ MDNode *MDBuilder::mergeCallbackEncodings(MDNode *ExistingCallbacks,
for (unsigned u = 0; u < NumExistingOps; u++) {
Ops[u] = ExistingCallbacks->getOperand(u);
- auto *OldCBCalleeIdxAsCM = cast<ConstantAsMetadata>(Ops[u]);
+ auto *OldCBCalleeIdxAsCM =
+ cast<ConstantAsMetadata>(cast<MDNode>(Ops[u])->getOperand(0));
uint64_t OldCBCalleeIdx =
- cast<ConstantInt>(OldCBCalleeIdxAsCM->getValue())->getZExtValue();
+ cast<ConstantInt>(OldCBCalleeIdxAsCM->getValue())->getZExtValue();
(void)OldCBCalleeIdx;
assert(NewCBCalleeIdx != OldCBCalleeIdx &&
"Cannot map a callback callee index twice!");
@@ -339,8 +339,8 @@ MDNode *MDBuilder::createMutableTBAAAccessTag(MDNode *Tag) {
MDNode *MDBuilder::createIrrLoopHeaderWeight(uint64_t Weight) {
Metadata *Vals[] = {
- createString("loop_header_weight"),
- createConstant(ConstantInt::get(Type::getInt64Ty(Context), Weight)),
+ createString("loop_header_weight"),
+ createConstant(ConstantInt::get(Type::getInt64Ty(Context), Weight)),
};
return MDNode::get(Context, Vals);
}
diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp
index a8696ed..f97dd18 100644
--- a/llvm/lib/IR/Module.cpp
+++ b/llvm/lib/IR/Module.cpp
@@ -882,7 +882,7 @@ StringRef Module::getDarwinTargetVariantTriple() const {
}
void Module::setDarwinTargetVariantTriple(StringRef T) {
- addModuleFlag(ModFlagBehavior::Override, "darwin.target_variant.triple",
+ addModuleFlag(ModFlagBehavior::Warning, "darwin.target_variant.triple",
MDString::get(getContext(), T));
}
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index e2754d7..7304eab 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1400,18 +1400,20 @@ public:
llvm::StringRef ModulePath,
const std::string &NewModulePath) {
std::map<std::string, GVSummaryMapTy> ModuleToSummariesForIndex;
+ GVSummaryPtrSet DeclarationSummaries;
std::error_code EC;
gatherImportedSummariesForModule(ModulePath, ModuleToDefinedGVSummaries,
- ImportList, ModuleToSummariesForIndex);
+ ImportList, ModuleToSummariesForIndex,
+ DeclarationSummaries);
raw_fd_ostream OS(NewModulePath + ".thinlto.bc", EC,
sys::fs::OpenFlags::OF_None);
if (EC)
return errorCodeToError(EC);
- // TODO: Serialize declaration bits to bitcode.
- writeIndexToFile(CombinedIndex, OS, &ModuleToSummariesForIndex);
+ writeIndexToFile(CombinedIndex, OS, &ModuleToSummariesForIndex,
+ &DeclarationSummaries);
if (ShouldEmitImportsFiles) {
EC = EmitImportsFiles(ModulePath, NewModulePath + ".imports",
diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
index 8f517eb..b054b42 100644
--- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
+++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -766,7 +766,7 @@ void ThinLTOCodeGenerator::crossModuleImport(Module &TheModule,
void ThinLTOCodeGenerator::gatherImportedSummariesForModule(
Module &TheModule, ModuleSummaryIndex &Index,
std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex,
- const lto::InputFile &File) {
+ GVSummaryPtrSet &DecSummaries, const lto::InputFile &File) {
auto ModuleCount = Index.modulePaths().size();
auto ModuleIdentifier = TheModule.getModuleIdentifier();
@@ -796,7 +796,7 @@ void ThinLTOCodeGenerator::gatherImportedSummariesForModule(
llvm::gatherImportedSummariesForModule(
ModuleIdentifier, ModuleToDefinedGVSummaries,
- ImportLists[ModuleIdentifier], ModuleToSummariesForIndex);
+ ImportLists[ModuleIdentifier], ModuleToSummariesForIndex, DecSummaries);
}
/**
@@ -832,10 +832,14 @@ void ThinLTOCodeGenerator::emitImports(Module &TheModule, StringRef OutputName,
IsPrevailing(PrevailingCopy), ImportLists,
ExportLists);
+ // 'EmitImportsFiles' emits the list of modules to import from, and

+ // the set of keys in `ModuleToSummariesForIndex` should be a superset of keys
+ // in `DecSummaries`, so no need to use `DecSummaries` in `EmitImportFiles`.
+ GVSummaryPtrSet DecSummaries;
std::map<std::string, GVSummaryMapTy> ModuleToSummariesForIndex;
llvm::gatherImportedSummariesForModule(
ModuleIdentifier, ModuleToDefinedGVSummaries,
- ImportLists[ModuleIdentifier], ModuleToSummariesForIndex);
+ ImportLists[ModuleIdentifier], ModuleToSummariesForIndex, DecSummaries);
std::error_code EC;
if ((EC = EmitImportsFiles(ModuleIdentifier, OutputName,
diff --git a/llvm/lib/MCA/InstrBuilder.cpp b/llvm/lib/MCA/InstrBuilder.cpp
index bcf065c..d5cbdc5 100644
--- a/llvm/lib/MCA/InstrBuilder.cpp
+++ b/llvm/lib/MCA/InstrBuilder.cpp
@@ -31,9 +31,9 @@ InstrBuilder::InstrBuilder(const llvm::MCSubtargetInfo &sti,
const llvm::MCInstrInfo &mcii,
const llvm::MCRegisterInfo &mri,
const llvm::MCInstrAnalysis *mcia,
- const mca::InstrumentManager &im)
+ const mca::InstrumentManager &im, unsigned cl)
: STI(sti), MCII(mcii), MRI(mri), MCIA(mcia), IM(im), FirstCallInst(true),
- FirstReturnInst(true) {
+ FirstReturnInst(true), CallLatency(cl) {
const MCSchedModel &SM = STI.getSchedModel();
ProcResourceMasks.resize(SM.getNumProcResourceKinds());
computeProcResourceMasks(STI.getSchedModel(), ProcResourceMasks);
@@ -220,17 +220,19 @@ static void initializeUsedResources(InstrDesc &ID,
static void computeMaxLatency(InstrDesc &ID, const MCInstrDesc &MCDesc,
const MCSchedClassDesc &SCDesc,
- const MCSubtargetInfo &STI) {
+ const MCSubtargetInfo &STI,
+ unsigned CallLatency) {
if (MCDesc.isCall()) {
// We cannot estimate how long this call will take.
- // Artificially set an arbitrarily high latency (100cy).
- ID.MaxLatency = 100U;
+ // Artificially set an arbitrarily high latency.
+ ID.MaxLatency = CallLatency;
return;
}
int Latency = MCSchedModel::computeInstrLatency(STI, SCDesc);
- // If latency is unknown, then conservatively assume a MaxLatency of 100cy.
- ID.MaxLatency = Latency < 0 ? 100U : static_cast<unsigned>(Latency);
+ // If latency is unknown, then conservatively assume the MaxLatency set for
+ // calls.
+ ID.MaxLatency = Latency < 0 ? CallLatency : static_cast<unsigned>(Latency);
}
static Error verifyOperands(const MCInstrDesc &MCDesc, const MCInst &MCI) {
@@ -568,7 +570,7 @@ InstrBuilder::createInstrDescImpl(const MCInst &MCI,
// We don't correctly model calls.
WithColor::warning() << "found a call in the input assembly sequence.\n";
WithColor::note() << "call instructions are not correctly modeled. "
- << "Assume a latency of 100cy.\n";
+ << "Assume a latency of " << CallLatency << "cy.\n";
FirstCallInst = false;
}
@@ -580,7 +582,7 @@ InstrBuilder::createInstrDescImpl(const MCInst &MCI,
}
initializeUsedResources(*ID, SCDesc, STI, ProcResourceMasks);
- computeMaxLatency(*ID, MCDesc, SCDesc, STI);
+ computeMaxLatency(*ID, MCDesc, SCDesc, STI, CallLatency);
if (Error Err = verifyOperands(MCDesc, MCI))
return std::move(Err);
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index 806d01d..f9cd71b 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -1002,46 +1002,60 @@ void InstrProfRecord::addValueData(uint32_t ValueKind, uint32_t Site,
ValueSites.emplace_back(VData, VData + N);
}
-std::vector<BPFunctionNode> TemporalProfTraceTy::createBPFunctionNodes(
- ArrayRef<TemporalProfTraceTy> Traces) {
+void TemporalProfTraceTy::createBPFunctionNodes(
+ ArrayRef<TemporalProfTraceTy> Traces, std::vector<BPFunctionNode> &Nodes,
+ bool RemoveOutlierUNs) {
using IDT = BPFunctionNode::IDT;
using UtilityNodeT = BPFunctionNode::UtilityNodeT;
- // Collect all function IDs ordered by their smallest timestamp. This will be
- // used as the initial FunctionNode order.
- SetVector<IDT> FunctionIds;
- size_t LargestTraceSize = 0;
- for (auto &Trace : Traces)
- LargestTraceSize =
- std::max(LargestTraceSize, Trace.FunctionNameRefs.size());
- for (size_t Timestamp = 0; Timestamp < LargestTraceSize; Timestamp++)
- for (auto &Trace : Traces)
- if (Timestamp < Trace.FunctionNameRefs.size())
- FunctionIds.insert(Trace.FunctionNameRefs[Timestamp]);
-
- const int N = Log2_64(LargestTraceSize) + 1;
-
+ UtilityNodeT MaxUN = 0;
+ DenseMap<IDT, size_t> IdToFirstTimestamp;
+ DenseMap<IDT, UtilityNodeT> IdToFirstUN;
+ DenseMap<IDT, SmallVector<UtilityNodeT>> IdToUNs;
// TODO: We need to use the Trace.Weight field to give more weight to more
// important utilities
- DenseMap<IDT, SmallVector<UtilityNodeT, 4>> FuncGroups;
- for (size_t TraceIdx = 0; TraceIdx < Traces.size(); TraceIdx++) {
- auto &Trace = Traces[TraceIdx].FunctionNameRefs;
- for (size_t Timestamp = 0; Timestamp < Trace.size(); Timestamp++) {
- for (int I = Log2_64(Timestamp + 1); I < N; I++) {
- auto FunctionId = Trace[Timestamp];
- UtilityNodeT GroupId = TraceIdx * N + I;
- FuncGroups[FunctionId].push_back(GroupId);
+ for (auto &Trace : Traces) {
+ size_t CutoffTimestamp = 1;
+ for (size_t Timestamp = 0; Timestamp < Trace.FunctionNameRefs.size();
+ Timestamp++) {
+ IDT Id = Trace.FunctionNameRefs[Timestamp];
+ auto [It, WasInserted] = IdToFirstTimestamp.try_emplace(Id, Timestamp);
+ if (!WasInserted)
+ It->getSecond() = std::min<size_t>(It->getSecond(), Timestamp);
+ if (Timestamp >= CutoffTimestamp) {
+ ++MaxUN;
+ CutoffTimestamp = 2 * Timestamp;
}
+ IdToFirstUN.try_emplace(Id, MaxUN);
}
+ for (auto &[Id, FirstUN] : IdToFirstUN)
+ for (auto UN = FirstUN; UN <= MaxUN; ++UN)
+ IdToUNs[Id].push_back(UN);
+ ++MaxUN;
+ IdToFirstUN.clear();
}
- std::vector<BPFunctionNode> Nodes;
- for (auto Id : FunctionIds) {
- auto &UNs = FuncGroups[Id];
- llvm::sort(UNs);
- UNs.erase(std::unique(UNs.begin(), UNs.end()), UNs.end());
- Nodes.emplace_back(Id, UNs);
+ if (RemoveOutlierUNs) {
+ DenseMap<UtilityNodeT, unsigned> UNFrequency;
+ for (auto &[Id, UNs] : IdToUNs)
+ for (auto &UN : UNs)
+ ++UNFrequency[UN];
+ // Filter out utility nodes that are too infrequent or too prevalent to make
+ // BalancedPartitioning more effective.
+ for (auto &[Id, UNs] : IdToUNs)
+ llvm::erase_if(UNs, [&](auto &UN) {
+ return UNFrequency[UN] <= 1 || 2 * UNFrequency[UN] > IdToUNs.size();
+ });
}
- return Nodes;
+
+ for (auto &[Id, UNs] : IdToUNs)
+ Nodes.emplace_back(Id, UNs);
+
+ // Since BalancedPartitioning is sensitive to the initial order, we explicitly
+ // order nodes by their earliest timestamp.
+ llvm::sort(Nodes, [&](auto &L, auto &R) {
+ return std::make_pair(IdToFirstTimestamp[L.Id], L.Id) <
+ std::make_pair(IdToFirstTimestamp[R.Id], R.Id);
+ });
}
#define INSTR_PROF_COMMON_API_IMPL
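
A standalone model of the new utility-node assignment (made-up trace data, plain std::map instead of DenseMap): a fresh utility node starts whenever the timestamp crosses a doubling cutoff, and every function keeps all nodes from its first appearance to the end of the trace, so functions seen early share the most nodes.

#include <cstddef>
#include <cstdio>
#include <map>
#include <vector>

int main() {
  std::vector<int> Trace = {7, 3, 7, 9, 3, 5, 7, 2}; // one trace of function ids
  unsigned MaxUN = 0, Cutoff = 1;
  std::map<int, unsigned> IdToFirstUN;
  for (std::size_t T = 0; T < Trace.size(); ++T) {
    if (T >= Cutoff) {        // start a new utility node at timestamps 1, 2, 4, 8, ...
      ++MaxUN;
      Cutoff = 2 * T;
    }
    IdToFirstUN.emplace(Trace[T], MaxUN); // keep the earliest UN per id
  }
  for (auto &[Id, FirstUN] : IdToFirstUN) {
    std::printf("id %d:", Id);
    for (unsigned UN = FirstUN; UN <= MaxUN; ++UN)
      std::printf(" UN%u", UN);
    std::printf("\n");
  }
  return 0;
}
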
@@ -1620,13 +1634,12 @@ inline size_t constexpr offsetOf(T1 T2::*Member) {
return size_t(&(Object.*Member)) - size_t(&Object);
}
+// Read a uint64_t from the specified buffer offset, and swap the bytes in
+// native endianness if necessary.
static inline uint64_t read(const unsigned char *Buffer, size_t Offset) {
- return *reinterpret_cast<const uint64_t *>(Buffer + Offset);
-}
-
-uint64_t Header::formatVersion() const {
- using namespace support;
- return endian::byte_swap<uint64_t, llvm::endianness::little>(Version);
+ using namespace ::support;
+ return endian::read<uint64_t, llvm::endianness::little, unaligned>(Buffer +
+ Offset);
}
Expected<Header> Header::readFromBuffer(const unsigned char *Buffer) {
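
The replacement read() now goes through endian::read<uint64_t, little, unaligned>, so on-disk headers parse identically on big-endian hosts; a portable standalone model of what that call computes:

#include <cassert>
#include <cstddef>
#include <cstdint>

static uint64_t readLittleEndian64(const unsigned char *Buffer, std::size_t Offset) {
  uint64_t V = 0;
  for (unsigned I = 0; I < 8; ++I)        // assemble byte by byte, LSB first
    V |= (uint64_t)Buffer[Offset + I] << (8 * I);
  return V;
}

int main() {
  const unsigned char Buf[8] = {0xEF, 0xCD, 0xAB, 0x89, 0x67, 0x45, 0x23, 0x01};
  assert(readLittleEndian64(Buf, 0) == 0x0123456789ABCDEFULL);
  return 0;
}
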
@@ -1638,18 +1651,15 @@ Expected<Header> Header::readFromBuffer(const unsigned char *Buffer) {
H.Magic = read(Buffer, offsetOf(&Header::Magic));
// Check the magic number.
- uint64_t Magic =
- endian::byte_swap<uint64_t, llvm::endianness::little>(H.Magic);
- if (Magic != IndexedInstrProf::Magic)
+ if (H.Magic != IndexedInstrProf::Magic)
return make_error<InstrProfError>(instrprof_error::bad_magic);
// Read the version.
H.Version = read(Buffer, offsetOf(&Header::Version));
- if (GET_VERSION(H.formatVersion()) >
- IndexedInstrProf::ProfVersion::CurrentVersion)
+ if (GET_VERSION(H.Version) > IndexedInstrProf::ProfVersion::CurrentVersion)
return make_error<InstrProfError>(instrprof_error::unsupported_version);
- switch (GET_VERSION(H.formatVersion())) {
+ switch (GET_VERSION(H.Version)) {
// When a new field is added in the header add a case statement here to
// populate it.
static_assert(
@@ -1680,7 +1690,7 @@ Expected<Header> Header::readFromBuffer(const unsigned char *Buffer) {
}
size_t Header::size() const {
- switch (GET_VERSION(formatVersion())) {
+ switch (GET_VERSION(Version)) {
// When a new field is added to the header add a case statement here to
// compute the size as offset of the new field + size of the new field. This
// relies on the field being added to the end of the list.
diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp
index ba21e01..836206a 100644
--- a/llvm/lib/ProfileData/InstrProfReader.cpp
+++ b/llvm/lib/ProfileData/InstrProfReader.cpp
@@ -1212,7 +1212,6 @@ Error IndexedMemProfReader::deserialize(const unsigned char *Start,
const uint64_t FirstWord =
support::endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
- memprof::IndexedVersion Version = memprof::Version0;
if (FirstWord == memprof::Version1 || FirstWord == memprof::Version2) {
// Everything is good. We can proceed to deserialize the rest.
Version = static_cast<memprof::IndexedVersion>(FirstWord);
@@ -1311,43 +1310,33 @@ Error IndexedInstrProfReader::readHeader() {
const IndexedInstrProf::Header *Header = &HeaderOr.get();
Cur += Header->size();
- Cur = readSummary((IndexedInstrProf::ProfVersion)Header->formatVersion(), Cur,
+ Cur = readSummary((IndexedInstrProf::ProfVersion)Header->Version, Cur,
/* UseCS */ false);
- if (Header->formatVersion() & VARIANT_MASK_CSIR_PROF)
- Cur =
- readSummary((IndexedInstrProf::ProfVersion)Header->formatVersion(), Cur,
- /* UseCS */ true);
+ if (Header->Version & VARIANT_MASK_CSIR_PROF)
+ Cur = readSummary((IndexedInstrProf::ProfVersion)Header->Version, Cur,
+ /* UseCS */ true);
// Read the hash type and start offset.
- IndexedInstrProf::HashT HashType = static_cast<IndexedInstrProf::HashT>(
- endian::byte_swap<uint64_t, llvm::endianness::little>(Header->HashType));
+ IndexedInstrProf::HashT HashType =
+ static_cast<IndexedInstrProf::HashT>(Header->HashType);
if (HashType > IndexedInstrProf::HashT::Last)
return error(instrprof_error::unsupported_hash_type);
- uint64_t HashOffset =
- endian::byte_swap<uint64_t, llvm::endianness::little>(Header->HashOffset);
-
// The hash table with profile counts comes next.
auto IndexPtr = std::make_unique<InstrProfReaderIndex<OnDiskHashTableImplV3>>(
- Start + HashOffset, Cur, Start, HashType, Header->formatVersion());
+ Start + Header->HashOffset, Cur, Start, HashType, Header->Version);
// The MemProfOffset field in the header is only valid when the format
// version is higher than 8 (when it was introduced).
- if (GET_VERSION(Header->formatVersion()) >= 8 &&
- Header->formatVersion() & VARIANT_MASK_MEMPROF) {
- uint64_t MemProfOffset =
- endian::byte_swap<uint64_t, llvm::endianness::little>(
- Header->MemProfOffset);
- if (Error E = MemProfReader.deserialize(Start, MemProfOffset))
+ if (GET_VERSION(Header->Version) >= 8 &&
+ Header->Version & VARIANT_MASK_MEMPROF) {
+ if (Error E = MemProfReader.deserialize(Start, Header->MemProfOffset))
return E;
}
// BinaryIdOffset field in the header is only valid when the format version
// is higher than 9 (when it was introduced).
- if (GET_VERSION(Header->formatVersion()) >= 9) {
- uint64_t BinaryIdOffset =
- endian::byte_swap<uint64_t, llvm::endianness::little>(
- Header->BinaryIdOffset);
- const unsigned char *Ptr = Start + BinaryIdOffset;
+ if (GET_VERSION(Header->Version) >= 9) {
+ const unsigned char *Ptr = Start + Header->BinaryIdOffset;
// Read binary ids size.
BinaryIdsSize =
support::endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
@@ -1360,11 +1349,8 @@ Error IndexedInstrProfReader::readHeader() {
"corrupted binary ids");
}
- if (GET_VERSION(Header->formatVersion()) >= 12) {
- uint64_t VTableNamesOffset =
- endian::byte_swap<uint64_t, llvm::endianness::little>(
- Header->VTableNamesOffset);
- const unsigned char *Ptr = Start + VTableNamesOffset;
+ if (GET_VERSION(Header->Version) >= 12) {
+ const unsigned char *Ptr = Start + Header->VTableNamesOffset;
CompressedVTableNamesLen =
support::endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
@@ -1376,12 +1362,9 @@ Error IndexedInstrProfReader::readHeader() {
return make_error<InstrProfError>(instrprof_error::truncated);
}
- if (GET_VERSION(Header->formatVersion()) >= 10 &&
- Header->formatVersion() & VARIANT_MASK_TEMPORAL_PROF) {
- uint64_t TemporalProfTracesOffset =
- endian::byte_swap<uint64_t, llvm::endianness::little>(
- Header->TemporalProfTracesOffset);
- const unsigned char *Ptr = Start + TemporalProfTracesOffset;
+ if (GET_VERSION(Header->Version) >= 10 &&
+ Header->Version & VARIANT_MASK_TEMPORAL_PROF) {
+ const unsigned char *Ptr = Start + Header->TemporalProfTracesOffset;
const auto *PtrEnd = (const unsigned char *)DataBuffer->getBufferEnd();
// Expect at least two 64 bit fields: NumTraces, and TraceStreamSize
if (Ptr + 2 * sizeof(uint64_t) > PtrEnd)
@@ -1506,6 +1489,55 @@ Expected<InstrProfRecord> IndexedInstrProfReader::getInstrProfRecord(
return error(instrprof_error::unknown_function);
}
+static Expected<memprof::MemProfRecord>
+getMemProfRecordV0(const memprof::IndexedMemProfRecord &IndexedRecord,
+ MemProfFrameHashTable &MemProfFrameTable) {
+ memprof::FrameIdConverter<MemProfFrameHashTable> FrameIdConv(
+ MemProfFrameTable);
+
+ memprof::MemProfRecord Record =
+ memprof::MemProfRecord(IndexedRecord, FrameIdConv);
+
+ // Check that all frame ids were successfully converted to frames.
+ if (FrameIdConv.LastUnmappedId) {
+ return make_error<InstrProfError>(instrprof_error::hash_mismatch,
+ "memprof frame not found for frame id " +
+ Twine(*FrameIdConv.LastUnmappedId));
+ }
+
+ return Record;
+}
+
+static Expected<memprof::MemProfRecord>
+getMemProfRecordV2(const memprof::IndexedMemProfRecord &IndexedRecord,
+ MemProfFrameHashTable &MemProfFrameTable,
+ MemProfCallStackHashTable &MemProfCallStackTable) {
+ memprof::FrameIdConverter<MemProfFrameHashTable> FrameIdConv(
+ MemProfFrameTable);
+
+ memprof::CallStackIdConverter<MemProfCallStackHashTable> CSIdConv(
+ MemProfCallStackTable, FrameIdConv);
+
+ memprof::MemProfRecord Record = IndexedRecord.toMemProfRecord(CSIdConv);
+
+ // Check that all call stack ids were successfully converted to call stacks.
+ if (CSIdConv.LastUnmappedId) {
+ return make_error<InstrProfError>(
+ instrprof_error::hash_mismatch,
+ "memprof call stack not found for call stack id " +
+ Twine(*CSIdConv.LastUnmappedId));
+ }
+
+ // Check that all frame ids were successfully converted to frames.
+ if (FrameIdConv.LastUnmappedId) {
+ return make_error<InstrProfError>(instrprof_error::hash_mismatch,
+ "memprof frame not found for frame id " +
+ Twine(*FrameIdConv.LastUnmappedId));
+ }
+
+ return Record;
+}
+
Expected<memprof::MemProfRecord>
IndexedMemProfReader::getMemProfRecord(const uint64_t FuncNameHash) const {
// TODO: Add memprof specific errors.
@@ -1518,41 +1550,27 @@ IndexedMemProfReader::getMemProfRecord(const uint64_t FuncNameHash) const {
instrprof_error::unknown_function,
"memprof record not found for function hash " + Twine(FuncNameHash));
- // Setup a callback to convert from frame ids to frame using the on-disk
- // FrameData hash table.
- memprof::FrameIdConverter<MemProfFrameHashTable> FrameIdConv(
- *MemProfFrameTable.get());
-
const memprof::IndexedMemProfRecord IndexedRecord = *Iter;
- memprof::MemProfRecord Record;
- if (MemProfCallStackTable) {
- // Setup a callback to convert call stack ids to call stacks using the
- // on-disk hash table.
- memprof::CallStackIdConverter<MemProfCallStackHashTable> CSIdConv(
- *MemProfCallStackTable.get(), FrameIdConv);
-
- Record = IndexedRecord.toMemProfRecord(CSIdConv);
-
- // Check that all call stack ids were successfully converted to call stacks.
- if (CSIdConv.LastUnmappedId) {
- return make_error<InstrProfError>(
- instrprof_error::hash_mismatch,
- "memprof call stack not found for call stack id " +
- Twine(*CSIdConv.LastUnmappedId));
- }
- } else {
- Record = memprof::MemProfRecord(IndexedRecord, FrameIdConv);
- }
-
- // Check that all frame ids were successfully converted to frames.
- if (FrameIdConv.LastUnmappedId) {
- return make_error<InstrProfError>(
- instrprof_error::hash_mismatch,
- "memprof frame not found for frame id " +
- Twine(*FrameIdConv.LastUnmappedId));
+ switch (Version) {
+ case memprof::Version0:
+ case memprof::Version1:
+ assert(MemProfFrameTable && "MemProfFrameTable must be available");
+ assert(!MemProfCallStackTable &&
+ "MemProfCallStackTable must not be available");
+ return getMemProfRecordV0(IndexedRecord, *MemProfFrameTable);
+ case memprof::Version2:
+ assert(MemProfFrameTable && "MemProfFrameTable must be available");
+ assert(MemProfCallStackTable && "MemProfCallStackTable must be available");
+ return getMemProfRecordV2(IndexedRecord, *MemProfFrameTable,
+ *MemProfCallStackTable);
}
- return Record;
+ return make_error<InstrProfError>(
+ instrprof_error::unsupported_version,
+ formatv("MemProf version {} not supported; "
+ "requires version between {} and {}, inclusive",
+ Version, memprof::MinimumSupportedVersion,
+ memprof::MaximumSupportedVersion));
}
Error IndexedInstrProfReader::getFunctionCounts(StringRef FuncName,
diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp
index 101992c..b67a970 100644
--- a/llvm/lib/ProfileData/InstrProfWriter.cpp
+++ b/llvm/lib/ProfileData/InstrProfWriter.cpp
@@ -660,6 +660,37 @@ uint64_t InstrProfWriter::writeHeader(const IndexedInstrProf::Header &Header,
return BackPatchStartOffset;
}
+Error InstrProfWriter::writeVTableNames(ProfOStream &OS) {
+ std::vector<std::string> VTableNameStrs;
+ for (StringRef VTableName : VTableNames.keys())
+ VTableNameStrs.push_back(VTableName.str());
+
+ std::string CompressedVTableNames;
+ if (!VTableNameStrs.empty())
+ if (Error E = collectGlobalObjectNameStrings(
+ VTableNameStrs, compression::zlib::isAvailable(),
+ CompressedVTableNames))
+ return E;
+
+ const uint64_t CompressedStringLen = CompressedVTableNames.length();
+
+ // Record the length of compressed string.
+ OS.write(CompressedStringLen);
+
+ // Write the chars in compressed strings.
+ for (auto &c : CompressedVTableNames)
+ OS.writeByte(static_cast<uint8_t>(c));
+
+ // Pad up to a multiple of 8.
+ // InstrProfReader could read bytes according to 'CompressedStringLen'.
+ const uint64_t PaddedLength = alignTo(CompressedStringLen, 8);
+
+ for (uint64_t K = CompressedStringLen; K < PaddedLength; K++)
+ OS.writeByte(0);
+
+ return Error::success();
+}
+
Error InstrProfWriter::writeImpl(ProfOStream &OS) {
using namespace IndexedInstrProf;
using namespace support;
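
The padding loop in writeVTableNames rounds the compressed blob up to an 8-byte boundary; readers then consume exactly CompressedStringLen bytes and skip the zero padding. The alignTo(Len, 8) arithmetic in isolation (standalone sketch):

#include <cassert>
#include <cstdint>

static uint64_t alignTo8(uint64_t Len) { return (Len + 7) & ~uint64_t(7); }

int main() {
  assert(alignTo8(0) == 0);
  assert(alignTo8(13) == 16); // a 13-byte blob is followed by 3 zero bytes
  assert(alignTo8(16) == 16); // already aligned, no padding emitted
  return 0;
}
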
@@ -682,7 +713,6 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) {
// Write the header.
IndexedInstrProf::Header Header;
- Header.Magic = IndexedInstrProf::Magic;
Header.Version = WritePrevVersion
? IndexedInstrProf::ProfVersion::Version11
: IndexedInstrProf::ProfVersion::CurrentVersion;
@@ -706,14 +736,6 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) {
if (static_cast<bool>(ProfileKind & InstrProfKind::TemporalProfile))
Header.Version |= VARIANT_MASK_TEMPORAL_PROF;
- Header.Unused = 0;
- Header.HashType = static_cast<uint64_t>(IndexedInstrProf::HashType);
- Header.HashOffset = 0;
- Header.MemProfOffset = 0;
- Header.BinaryIdOffset = 0;
- Header.TemporalProfTracesOffset = 0;
- Header.VTableNamesOffset = 0;
-
const uint64_t BackPatchStartOffset =
writeHeader(Header, WritePrevVersion, OS);
@@ -784,34 +806,9 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) {
uint64_t VTableNamesSectionStart = OS.tell();
- if (!WritePrevVersion) {
- std::vector<std::string> VTableNameStrs;
- for (StringRef VTableName : VTableNames.keys())
- VTableNameStrs.push_back(VTableName.str());
-
- std::string CompressedVTableNames;
- if (!VTableNameStrs.empty())
- if (Error E = collectGlobalObjectNameStrings(
- VTableNameStrs, compression::zlib::isAvailable(),
- CompressedVTableNames))
- return E;
-
- const uint64_t CompressedStringLen = CompressedVTableNames.length();
-
- // Record the length of compressed string.
- OS.write(CompressedStringLen);
-
- // Write the chars in compressed strings.
- for (auto &c : CompressedVTableNames)
- OS.writeByte(static_cast<uint8_t>(c));
-
- // Pad up to a multiple of 8.
- // InstrProfReader could read bytes according to 'CompressedStringLen'.
- const uint64_t PaddedLength = alignTo(CompressedStringLen, 8);
-
- for (uint64_t K = CompressedStringLen; K < PaddedLength; K++)
- OS.writeByte(0);
- }
+ if (!WritePrevVersion)
+ if (Error E = writeVTableNames(OS))
+ return E;
uint64_t TemporalProfTracesSectionStart = 0;
if (static_cast<bool>(ProfileKind & InstrProfKind::TemporalProfile)) {
diff --git a/llvm/lib/ProfileData/MemProf.cpp b/llvm/lib/ProfileData/MemProf.cpp
index f578918..e560864 100644
--- a/llvm/lib/ProfileData/MemProf.cpp
+++ b/llvm/lib/ProfileData/MemProf.cpp
@@ -208,6 +208,7 @@ static IndexedMemProfRecord deserializeV2(const MemProfSchema &Schema,
// Read the meminfo nodes.
const uint64_t NumNodes =
endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
+ Record.AllocSites.reserve(NumNodes);
for (uint64_t I = 0; I < NumNodes; I++) {
IndexedAllocationInfo Node;
Node.CSId = endian::readNext<CallStackId, llvm::endianness::little>(Ptr);
@@ -219,6 +220,7 @@ static IndexedMemProfRecord deserializeV2(const MemProfSchema &Schema,
// Read the callsite information.
const uint64_t NumCtxs =
endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
+ Record.CallSiteIds.reserve(NumCtxs);
for (uint64_t J = 0; J < NumCtxs; J++) {
CallStackId CSId =
endian::readNext<CallStackId, llvm::endianness::little>(Ptr);
@@ -247,13 +249,15 @@ MemProfRecord IndexedMemProfRecord::toMemProfRecord(
Callback) const {
MemProfRecord Record;
+ Record.AllocSites.reserve(AllocSites.size());
for (const memprof::IndexedAllocationInfo &IndexedAI : AllocSites) {
memprof::AllocationInfo AI;
AI.Info = IndexedAI.Info;
AI.CallStack = Callback(IndexedAI.CSId);
- Record.AllocSites.push_back(AI);
+ Record.AllocSites.push_back(std::move(AI));
}
+ Record.CallSites.reserve(CallSiteIds.size());
for (memprof::CallStackId CSId : CallSiteIds)
Record.CallSites.push_back(Callback(CSId));
diff --git a/llvm/lib/ProfileData/MemProfReader.cpp b/llvm/lib/ProfileData/MemProfReader.cpp
index c25baba..fc3be71 100644
--- a/llvm/lib/ProfileData/MemProfReader.cpp
+++ b/llvm/lib/ProfileData/MemProfReader.cpp
@@ -587,31 +587,27 @@ Error RawMemProfReader::symbolizeAndFilterStackFrames(
std::vector<std::string>
RawMemProfReader::peekBuildIds(MemoryBuffer *DataBuffer) {
const char *Next = DataBuffer->getBufferStart();
- // Use a set + vector since a profile file may contain multiple raw profile
+ // Use a SetVector since a profile file may contain multiple raw profile
// dumps, each with segment information. We want them unique and in order they
// were stored in the profile; the profiled binary should be the first entry.
// The runtime uses dl_iterate_phdr and the "... first object visited by
// callback is the main program."
// https://man7.org/linux/man-pages/man3/dl_iterate_phdr.3.html
- std::vector<std::string> BuildIds;
- llvm::SmallSet<std::string, 10> BuildIdsSet;
+ llvm::SetVector<std::string, std::vector<std::string>,
+ llvm::SmallSet<std::string, 10>>
+ BuildIds;
while (Next < DataBuffer->getBufferEnd()) {
auto *Header = reinterpret_cast<const memprof::Header *>(Next);
const llvm::SmallVector<SegmentEntry> Entries =
readSegmentEntries(Next + Header->SegmentOffset);
- for (const auto &Entry : Entries) {
- const std::string Id = getBuildIdString(Entry);
- if (BuildIdsSet.contains(Id))
- continue;
- BuildIds.push_back(Id);
- BuildIdsSet.insert(Id);
- }
+ for (const auto &Entry : Entries)
+ BuildIds.insert(getBuildIdString(Entry));
Next += Header->TotalSize;
}
- return BuildIds;
+ return BuildIds.takeVector();
}
Error RawMemProfReader::readRawProfile(
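
Hedged usage sketch of the SetVector instantiation above (made-up build-id strings): insertion order is preserved, duplicates are rejected by the SmallSet, and takeVector() releases the underlying std::vector without copying.

#include <cassert>
#include <string>
#include <vector>
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallSet.h"

int main() {
  llvm::SetVector<std::string, std::vector<std::string>,
                  llvm::SmallSet<std::string, 10>>
      BuildIds;
  BuildIds.insert("aabbccdd"); // main binary, seen first
  BuildIds.insert("11223344"); // a shared library
  BuildIds.insert("aabbccdd"); // duplicate from a later raw profile dump, ignored
  std::vector<std::string> Ids = BuildIds.takeVector();
  assert(Ids.size() == 2 && Ids[0] == "aabbccdd" && Ids[1] == "11223344");
  return 0;
}
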
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index 03e8889..be4badc 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -56,9 +56,6 @@ elseif( CMAKE_HOST_UNIX )
STRING(REGEX REPLACE "^lib" "" Backtrace_LIBFILE ${Backtrace_LIBFILE})
set(system_libs ${system_libs} ${Backtrace_LIBFILE})
endif()
- if( LLVM_ENABLE_TERMINFO )
- set(imported_libs ${imported_libs} Terminfo::terminfo)
- endif()
set(system_libs ${system_libs} ${LLVM_ATOMIC_LIB})
set(system_libs ${system_libs} ${LLVM_PTHREAD_LIB})
if( UNIX AND NOT (BEOS OR HAIKU) )
@@ -325,14 +322,6 @@ if(LLVM_ENABLE_ZSTD)
set(llvm_system_libs ${llvm_system_libs} "${zstd_library}")
endif()
-if(LLVM_ENABLE_TERMINFO)
- if(NOT terminfo_library)
- get_property(terminfo_library TARGET Terminfo::terminfo PROPERTY LOCATION)
- endif()
- get_library_name(${terminfo_library} terminfo_library)
- set(llvm_system_libs ${llvm_system_libs} "${terminfo_library}")
-endif()
-
set_property(TARGET LLVMSupport PROPERTY LLVM_SYSTEM_LIBS "${llvm_system_libs}")
diff --git a/llvm/lib/Support/Error.cpp b/llvm/lib/Support/Error.cpp
index 21d5915..34ec31e 100644
--- a/llvm/lib/Support/Error.cpp
+++ b/llvm/lib/Support/Error.cpp
@@ -135,6 +135,9 @@ StringError::StringError(std::error_code EC, const Twine &S)
StringError::StringError(const Twine &S, std::error_code EC)
: Msg(S.str()), EC(EC), PrintMsgOnly(true) {}
+StringError::StringError(std::string &&S, std::error_code EC, bool PrintMsgOnly)
+ : Msg(S), EC(EC), PrintMsgOnly(PrintMsgOnly) {}
+
void StringError::log(raw_ostream &OS) const {
if (PrintMsgOnly) {
OS << Msg;
@@ -149,7 +152,7 @@ std::error_code StringError::convertToErrorCode() const {
return EC;
}
-Error createStringError(std::error_code EC, char const *Msg) {
+Error createStringError(std::string &&Msg, std::error_code EC) {
return make_error<StringError>(Msg, EC);
}
diff --git a/llvm/lib/Support/Unix/Process.inc b/llvm/lib/Support/Unix/Process.inc
index ae90924..84b10ff 100644
--- a/llvm/lib/Support/Unix/Process.inc
+++ b/llvm/lib/Support/Unix/Process.inc
@@ -341,17 +341,9 @@ unsigned Process::StandardErrColumns() {
return getColumns();
}
-#ifdef LLVM_ENABLE_TERMINFO
-// We manually declare these extern functions because finding the correct
-// headers from various terminfo, curses, or other sources is harder than
-// writing their specs down.
-extern "C" int setupterm(char *term, int filedes, int *errret);
-extern "C" struct term *set_curterm(struct term *termp);
-extern "C" int del_curterm(struct term *termp);
-extern "C" int tigetnum(char *capname);
-#endif
-
-bool checkTerminalEnvironmentForColors() {
+static bool terminalHasColors() {
+ // Check if the current terminal is one of the terminals known to support
+ // ANSI color escape codes.
if (const char *TermStr = std::getenv("TERM")) {
return StringSwitch<bool>(TermStr)
.Case("ansi", true)
@@ -368,54 +360,10 @@ bool checkTerminalEnvironmentForColors() {
return false;
}
-static bool terminalHasColors(int fd) {
-#ifdef LLVM_ENABLE_TERMINFO
- // First, acquire a global lock because these C routines are thread hostile.
- static std::mutex TermColorMutex;
- std::lock_guard<std::mutex> G(TermColorMutex);
-
- struct term *previous_term = set_curterm(nullptr);
- int errret = 0;
- if (setupterm(nullptr, fd, &errret) != 0)
- // Regardless of why, if we can't get terminfo, we shouldn't try to print
- // colors.
- return false;
-
- // Test whether the terminal as set up supports color output. How to do this
- // isn't entirely obvious. We can use the curses routine 'has_colors' but it
- // would be nice to avoid a dependency on curses proper when we can make do
- // with a minimal terminfo parsing library. Also, we don't really care whether
- // the terminal supports the curses-specific color changing routines, merely
- // if it will interpret ANSI color escape codes in a reasonable way. Thus, the
- // strategy here is just to query the baseline colors capability and if it
- // supports colors at all to assume it will translate the escape codes into
- // whatever range of colors it does support. We can add more detailed tests
- // here if users report them as necessary.
- //
- // The 'tigetnum' routine returns -2 or -1 on errors, and might return 0 if
- // the terminfo says that no colors are supported.
- int colors_ti = tigetnum(const_cast<char *>("colors"));
- bool HasColors =
- colors_ti >= 0 ? colors_ti : checkTerminalEnvironmentForColors();
-
- // Now extract the structure allocated by setupterm and free its memory
- // through a really silly dance.
- struct term *termp = set_curterm(previous_term);
- (void)del_curterm(termp); // Drop any errors here.
-
- // Return true if we found a color capabilities for the current terminal.
- return HasColors;
-#else
- // When the terminfo database is not available, check if the current terminal
- // is one of terminals that are known to support ANSI color escape codes.
- return checkTerminalEnvironmentForColors();
-#endif
-}
-
bool Process::FileDescriptorHasColors(int fd) {
// A file descriptor has colors if it is displayed and the terminal has
// colors.
- return FileDescriptorIsDisplayed(fd) && terminalHasColors(fd);
+ return FileDescriptorIsDisplayed(fd) && terminalHasColors();
}
bool Process::StandardOutHasColors() {
diff --git a/llvm/lib/Support/raw_socket_stream.cpp b/llvm/lib/Support/raw_socket_stream.cpp
index 14e2308..549d537 100644
--- a/llvm/lib/Support/raw_socket_stream.cpp
+++ b/llvm/lib/Support/raw_socket_stream.cpp
@@ -204,17 +204,26 @@ ListeningSocket::accept(std::chrono::milliseconds Timeout) {
auto Start = std::chrono::steady_clock::now();
#ifdef _WIN32
PollStatus = WSAPoll(FDs, 2, RemainingTime);
- if (PollStatus == SOCKET_ERROR) {
#else
PollStatus = ::poll(FDs, 2, RemainingTime);
+#endif
+ // If FD equals -1 then ListeningSocket::shutdown has been called and it is
+ // appropriate to return operation_canceled
+ if (FD.load() == -1)
+ return llvm::make_error<StringError>(
+ std::make_error_code(std::errc::operation_canceled),
+ "Accept canceled");
+
+#if _WIN32
+ if (PollStatus == SOCKET_ERROR) {
+#else
if (PollStatus == -1) {
#endif
- // Ignore error if caused by interupting signal
std::error_code PollErrCode = getLastSocketErrorCode();
+ // Ignore EINTR (signal occurred before any requested event) and retry
if (PollErrCode != std::errc::interrupted)
return llvm::make_error<StringError>(PollErrCode, "FD poll failed");
}
-
if (PollStatus == 0)
return llvm::make_error<StringError>(
std::make_error_code(std::errc::timed_out),
@@ -222,13 +231,7 @@ ListeningSocket::accept(std::chrono::milliseconds Timeout) {
if (FDs[0].revents & POLLNVAL)
return llvm::make_error<StringError>(
- std::make_error_code(std::errc::bad_file_descriptor),
- "File descriptor closed by another thread");
-
- if (FDs[1].revents & POLLIN)
- return llvm::make_error<StringError>(
- std::make_error_code(std::errc::operation_canceled),
- "Accept canceled");
+ std::make_error_code(std::errc::bad_file_descriptor));
auto Stop = std::chrono::steady_clock::now();
ElapsedTime +=
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
index bfcafc6..9a804c1 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
@@ -38,6 +38,8 @@ static const MCPhysReg QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
static const MCPhysReg ZRegList[] = {AArch64::Z0, AArch64::Z1, AArch64::Z2,
AArch64::Z3, AArch64::Z4, AArch64::Z5,
AArch64::Z6, AArch64::Z7};
+static const MCPhysReg PRegList[] = {AArch64::P0, AArch64::P1, AArch64::P2,
+ AArch64::P3};
static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
MVT LocVT, ISD::ArgFlagsTy &ArgFlags,
@@ -59,11 +61,17 @@ static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
// CCAssignFn again we want it to behave as if all remaining registers are
// allocated. This will force the code to pass the tuple indirectly in
// accordance with the PCS.
- bool RegsAllocated[8];
+ bool ZRegsAllocated[8];
for (int I = 0; I < 8; I++) {
- RegsAllocated[I] = State.isAllocated(ZRegList[I]);
+ ZRegsAllocated[I] = State.isAllocated(ZRegList[I]);
State.AllocateReg(ZRegList[I]);
}
+ // The same applies to P registers.
+ bool PRegsAllocated[4];
+ for (int I = 0; I < 4; I++) {
+ PRegsAllocated[I] = State.isAllocated(PRegList[I]);
+ State.AllocateReg(PRegList[I]);
+ }
auto &It = PendingMembers[0];
CCAssignFn *AssignFn =
@@ -79,8 +87,11 @@ static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
// Return the register state back to how it was before, leaving any
// unallocated registers available for other smaller types.
for (int I = 0; I < 8; I++)
- if (!RegsAllocated[I])
+ if (!ZRegsAllocated[I])
State.DeallocateReg(ZRegList[I]);
+ for (int I = 0; I < 4; I++)
+ if (!PRegsAllocated[I])
+ State.DeallocateReg(PRegList[I]);
// All pending members have now been allocated
PendingMembers.clear();
@@ -140,9 +151,15 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
RegList = DRegList;
else if (LocVT.SimpleTy == MVT::f128 || LocVT.is128BitVector())
RegList = QRegList;
- else if (LocVT.isScalableVector())
- RegList = ZRegList;
- else {
+ else if (LocVT.isScalableVector()) {
+ // Scalable masks should be passed in Predicate registers.
+ if (LocVT == MVT::nxv1i1 || LocVT == MVT::nxv2i1 || LocVT == MVT::nxv4i1 ||
+ LocVT == MVT::nxv8i1 || LocVT == MVT::nxv16i1 ||
+ LocVT == MVT::aarch64svcount)
+ RegList = PRegList;
+ else
+ RegList = ZRegList;
+ } else {
// Not an array we want to split up after all.
return false;
}
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 10cad6d..1c7f6b8 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -295,5 +295,6 @@ def AArch64PostLegalizerCombiner
ptr_add_immed_chain, overlapping_and,
split_store_zero_128, undef_combines,
select_to_minmax, or_to_bsp, combine_concat_vector,
- commute_constant_to_rhs]> {
+ commute_constant_to_rhs,
+ push_freeze_to_prevent_poison_from_propagating]> {
}
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index ba0b760..ffb899a 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -223,13 +223,6 @@ def FeatureSVE : Extension<"sve", "SVE",
"Enable Scalable Vector Extension (SVE) instructions (FEAT_SVE)", [FeatureFullFP16],
"FEAT_SVE", "+sve,+fullfp16,+fp-armv8,+neon", 310>;
-def FeatureFPMR : Extension<"fpmr", "FPMR",
- "Enable FPMR Register (FEAT_FPMR)">;
-
-let FMVDependencies = "+fpmr" in
-def FeatureFP8 : Extension<"fp8", "FP8",
- "Enable FP8 instructions (FEAT_FP8)">;
-
// This flag is currently still labeled as Experimental, but when fully
// implemented this should tell the compiler to use the zeroing pseudos to
// benefit from the reverse instructions (e.g. SUB vs SUBR) if the inactive
@@ -667,41 +660,44 @@ def FeatureSME2p1 : Extension<"sme2p1", "SME2p1",
def FeatureFAMINMAX: Extension<"faminmax", "FAMINMAX",
"Enable FAMIN and FAMAX instructions (FEAT_FAMINMAX)">;
-let FMVDependencies = "+fpmr" in
+def FeatureLUT: Extension<"lut", "LUT",
+ "Enable Lookup Table instructions (FEAT_LUT)">;
+
+def FeatureFP8 : Extension<"fp8", "FP8",
+ "Enable FP8 instructions (FEAT_FP8)", [FeatureFAMINMAX, FeatureLUT, FeatureBF16]>;
+
def FeatureFP8FMA : Extension<"fp8fma", "FP8FMA",
- "Enable fp8 multiply-add instructions (FEAT_FP8FMA)">;
+ "Enable fp8 multiply-add instructions (FEAT_FP8FMA)", [FeatureFP8]>;
let FMVDependencies = "+sme2" in
def FeatureSSVE_FP8FMA : Extension<"ssve-fp8fma", "SSVE_FP8FMA",
- "Enable SVE2 fp8 multiply-add instructions (FEAT_SSVE_FP8FMA)", [FeatureSME2]>;
+ "Enable SVE2 fp8 multiply-add instructions (FEAT_SSVE_FP8FMA)", [FeatureSME2, FeatureFP8]>;
+def FeatureFP8DOT4: Extension<"fp8dot4", "FP8DOT4",
+ "Enable fp8 4-way dot instructions (FEAT_FP8DOT4)", [FeatureFP8FMA]>;
+
def FeatureFP8DOT2: Extension<"fp8dot2", "FP8DOT2",
- "Enable fp8 2-way dot instructions (FEAT_FP8DOT2)">;
+ "Enable fp8 2-way dot instructions (FEAT_FP8DOT2)", [FeatureFP8DOT4]>;
let FMVDependencies = "+sme2" in
-def FeatureSSVE_FP8DOT2 : Extension<"ssve-fp8dot2", "SSVE_FP8DOT2",
- "Enable SVE2 fp8 2-way dot product instructions (FEAT_SSVE_FP8DOT2)", [FeatureSME2]>;
-
-def FeatureFP8DOT4: Extension<"fp8dot4", "FP8DOT4",
- "Enable fp8 4-way dot instructions (FEAT_FP8DOT4)">;
+def FeatureSSVE_FP8DOT4 : Extension<"ssve-fp8dot4", "SSVE_FP8DOT4",
+ "Enable SVE2 fp8 4-way dot product instructions (FEAT_SSVE_FP8DOT4)", [FeatureSSVE_FP8FMA]>;
let FMVDependencies = "+sme2" in
-def FeatureSSVE_FP8DOT4 : Extension<"ssve-fp8dot4", "SSVE_FP8DOT4",
- "Enable SVE2 fp8 4-way dot product instructions (FEAT_SSVE_FP8DOT4)", [FeatureSME2]>;
-def FeatureLUT: Extension<"lut", "LUT",
- "Enable Lookup Table instructions (FEAT_LUT)">;
+def FeatureSSVE_FP8DOT2 : Extension<"ssve-fp8dot2", "SSVE_FP8DOT2",
+ "Enable SVE2 fp8 2-way dot product instructions (FEAT_SSVE_FP8DOT2)", [FeatureSSVE_FP8DOT4]>;
def FeatureSME_LUTv2 : Extension<"sme-lutv2", "SME_LUTv2",
"Enable Scalable Matrix Extension (SME) LUTv2 instructions (FEAT_SME_LUTv2)">;
-let FMVDependencies = "+fp8,+sme2" in
-def FeatureSMEF8F16 : Extension<"sme-f8f16", "SMEF8F16",
- "Enable Scalable Matrix Extension (SME) F8F16 instructions(FEAT_SME_F8F16)", [FeatureSME2, FeatureFP8]>;
-
let FMVDependencies = "+sme2,+fp8" in
def FeatureSMEF8F32 : Extension<"sme-f8f32", "SMEF8F32",
"Enable Scalable Matrix Extension (SME) F8F32 instructions (FEAT_SME_F8F32)", [FeatureSME2, FeatureFP8]>;
+let FMVDependencies = "+fp8,+sme2" in
+def FeatureSMEF8F16 : Extension<"sme-f8f16", "SMEF8F16",
+ "Enable Scalable Matrix Extension (SME) F8F16 instructions(FEAT_SME_F8F16)", [FeatureSMEF8F32]>;
+
def FeatureAppleA7SysReg : SubtargetFeature<"apple-a7-sysreg", "HasAppleA7SysReg", "true",
"Apple A7 (the CPU formerly known as Cyclone)">;
@@ -869,7 +865,7 @@ def HasV9_4aOps : Architecture64<9, 4, "a", "v9.4a",
FeatureRASv2])>;
def HasV9_5aOps : Architecture64<9, 5, "a", "v9.5a",
[HasV9_4aOps, FeatureCPA],
- !listconcat(HasV9_4aOps.DefaultExts, [FeatureCPA])>;
+ !listconcat(HasV9_4aOps.DefaultExts, [FeatureCPA, FeatureLUT, FeatureFAMINMAX])>;
def HasV8_0rOps : Architecture64<8, 0, "r", "v8r",
[ //v8.1
FeatureCRC, FeaturePAN, FeatureLSE, FeatureCONTEXTIDREL2,
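The AArch64Features.td hunk above turns the previously independent FP8 extensions into a dependency chain: fp8 now implies faminmax, lut and bf16, fp8fma implies fp8, fp8dot4 implies fp8fma, and fp8dot2 implies fp8dot4 (the SSVE/SME variants are chained similarly), so requesting one extension transitively enables everything it needs. A minimal sketch of that transitive expansion, assuming a hypothetical standalone model rather than LLVM's SubtargetFeature machinery (the table below only mirrors the records above):

#include <cstdio>
#include <map>
#include <set>
#include <string>
#include <vector>

// Direct dependencies per feature, mirroring the Extension<> records above.
static const std::map<std::string, std::vector<std::string>> Deps = {
    {"fp8", {"faminmax", "lut", "bf16"}},
    {"fp8fma", {"fp8"}},
    {"fp8dot4", {"fp8fma"}},
    {"fp8dot2", {"fp8dot4"}},
};

// Expand a requested feature into the full set it transitively implies.
void expand(const std::string &Feature, std::set<std::string> &Out) {
  if (!Out.insert(Feature).second)
    return; // already visited
  auto It = Deps.find(Feature);
  if (It == Deps.end())
    return; // leaf feature with no further dependencies
  for (const std::string &Dep : It->second)
    expand(Dep, Out);
}

int main() {
  std::set<std::string> S;
  expand("fp8dot2", S); // yields bf16, faminmax, fp8, fp8dot2, fp8dot4, fp8fma, lut
  for (const std::string &F : S)
    std::printf("%s\n", F.c_str());
  return 0;
}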
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e31a27e..25ba8d8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1871,9 +1871,11 @@ bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const {
if (!Subtarget->hasSVEorSME())
return true;
- // We can only use the BRKB + CNTP sequence with legal predicate types.
+ // We can only use the BRKB + CNTP sequence with legal predicate types. We can
+ // also support fixed-width predicates.
return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
- VT != MVT::nxv2i1;
+ VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
+ VT != MVT::v4i1 && VT != MVT::v2i1;
}
void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
@@ -5838,9 +5840,20 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return SDValue();
}
case Intrinsic::experimental_cttz_elts: {
- SDValue NewCttzElts =
- DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, Op.getOperand(1));
+ SDValue CttzOp = Op.getOperand(1);
+ EVT VT = CttzOp.getValueType();
+ assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1");
+ if (VT.isFixedLengthVector()) {
+ // We can use SVE instructions to lower this intrinsic by first creating
+ // an SVE predicate register mask from the fixed-width vector.
+ EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
+ SDValue Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, NewVT, CttzOp);
+ CttzOp = convertFixedMaskToScalableVector(Mask, DAG);
+ }
+
+ SDValue NewCttzElts =
+ DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, CttzOp);
return DAG.getZExtOrTrunc(NewCttzElts, dl, Op.getValueType());
}
}
@@ -7235,7 +7248,6 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
unsigned NumParts = 1;
if (Ins[i].Flags.isInConsecutiveRegs()) {
- assert(!Ins[i].Flags.isInConsecutiveRegsLast());
while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
++NumParts;
}
@@ -8232,7 +8244,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
uint64_t PartSize = StoreSize;
unsigned NumParts = 1;
if (Outs[i].Flags.isInConsecutiveRegs()) {
- assert(!Outs[i].Flags.isInConsecutiveRegsLast());
while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
++NumParts;
StoreSize *= NumParts;
@@ -13530,11 +13541,9 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
DAG.getConstant(NumElts, dl, MVT::i64));
if (Even && !Odd)
- return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS,
- RHS);
+ return DAG.getNode(AArch64ISD::UZP1, dl, VT, LHS, RHS);
if (Odd && !Even)
- return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS,
- RHS);
+ return DAG.getNode(AArch64ISD::UZP2, dl, VT, LHS, RHS);
}
}
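The experimental_cttz_elts hunk above lets fixed-width i1 vectors reuse the SVE BRKB + CNTP lowering by sign-extending the mask, converting it to a scalable predicate, and feeding it to the existing AArch64ISD::CTTZ_ELTS node. As a reminder of what the intrinsic computes, here is a scalar reference, a sketch that assumes the is_zero_poison=false flavour where an all-false mask yields the element count:

#include <cassert>
#include <cstddef>
#include <vector>

// Reference semantics: index of the first true lane, counting from element 0,
// or the element count if every lane is false (the is_zero_poison=false case).
std::size_t cttzElts(const std::vector<bool> &Mask) {
  for (std::size_t I = 0; I < Mask.size(); ++I)
    if (Mask[I])
      return I;
  return Mask.size();
}

int main() {
  assert(cttzElts({false, false, true, false}) == 2);
  assert(cttzElts({false, false, false, false}) == 4);
  return 0;
}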
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index a39e3b7..4830033 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -171,8 +171,6 @@ def HasSME2 : Predicate<"Subtarget->hasSME2()">,
AssemblerPredicateWithAll<(all_of FeatureSME2), "sme2">;
def HasSME2p1 : Predicate<"Subtarget->hasSME2p1()">,
AssemblerPredicateWithAll<(all_of FeatureSME2p1), "sme2p1">;
-def HasFPMR : Predicate<"Subtarget->hasFPMR()">,
- AssemblerPredicateWithAll<(all_of FeatureFPMR), "fpmr">;
def HasFP8 : Predicate<"Subtarget->hasFP8()">,
AssemblerPredicateWithAll<(all_of FeatureFP8), "fp8">;
def HasFAMINMAX : Predicate<"Subtarget->hasFAMINMAX()">,
diff --git a/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp b/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp
index abde099..90bf089 100644
--- a/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp
+++ b/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp
@@ -231,7 +231,7 @@ MachineMemOperand *createCheckMemOperand(MachineFunction &MF,
} // namespace
-MachineBasicBlock &llvm::AArch64PAuth::checkAuthenticatedRegister(
+void llvm::AArch64PAuth::checkAuthenticatedRegister(
MachineBasicBlock::iterator MBBI, AuthCheckMethod Method,
Register AuthenticatedReg, Register TmpReg, bool UseIKey, unsigned BrkImm) {
@@ -246,13 +246,13 @@ MachineBasicBlock &llvm::AArch64PAuth::checkAuthenticatedRegister(
default:
break;
case AuthCheckMethod::None:
- return MBB;
+ return;
case AuthCheckMethod::DummyLoad:
BuildMI(MBB, MBBI, DL, TII->get(AArch64::LDRWui), getWRegFromXReg(TmpReg))
.addReg(AuthenticatedReg)
.addImm(0)
.addMemOperand(createCheckMemOperand(MF, Subtarget));
- return MBB;
+ return;
}
// Control flow has to be changed, so arrange new MBBs.
@@ -287,7 +287,7 @@ MachineBasicBlock &llvm::AArch64PAuth::checkAuthenticatedRegister(
.addReg(TmpReg)
.addImm(62)
.addMBB(BreakBlock);
- return *SuccessBlock;
+ return;
case AuthCheckMethod::XPACHint:
assert(AuthenticatedReg == AArch64::LR &&
"XPACHint mode is only compatible with checking the LR register");
@@ -304,7 +304,7 @@ MachineBasicBlock &llvm::AArch64PAuth::checkAuthenticatedRegister(
BuildMI(CheckBlock, DL, TII->get(AArch64::Bcc))
.addImm(AArch64CC::NE)
.addMBB(BreakBlock);
- return *SuccessBlock;
+ return;
}
llvm_unreachable("Unknown AuthCheckMethod enum");
}
diff --git a/llvm/lib/Target/AArch64/AArch64PointerAuth.h b/llvm/lib/Target/AArch64/AArch64PointerAuth.h
index e1ceaed..4ffda74 100644
--- a/llvm/lib/Target/AArch64/AArch64PointerAuth.h
+++ b/llvm/lib/Target/AArch64/AArch64PointerAuth.h
@@ -98,14 +98,10 @@ enum class AuthCheckMethod {
/// using an I-key or D-key and which register can be used as temporary.
/// If an explicit BRK instruction is used to generate an exception, BrkImm
/// specifies its immediate operand.
-///
-/// \returns The machine basic block containing the code that is executed
-/// after the check succeeds.
-MachineBasicBlock &checkAuthenticatedRegister(MachineBasicBlock::iterator MBBI,
- AuthCheckMethod Method,
- Register AuthenticatedReg,
- Register TmpReg, bool UseIKey,
- unsigned BrkImm);
+void checkAuthenticatedRegister(MachineBasicBlock::iterator MBBI,
+ AuthCheckMethod Method,
+ Register AuthenticatedReg, Register TmpReg,
+ bool UseIKey, unsigned BrkImm);
/// Returns the number of bytes added by checkAuthenticatedRegister.
unsigned getCheckerSizeInBytes(AuthCheckMethod Method);
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 5d185fc..8bc26ee 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -64,12 +64,6 @@ ReservedRegsForRA("reserve-regs-for-regalloc", cl::desc("Reserve physical "
"Should only be used for testing register allocator."),
cl::CommaSeparated, cl::Hidden);
-static cl::opt<bool> ForceStreamingCompatibleSVE(
- "force-streaming-compatible-sve",
- cl::desc(
- "Force the use of streaming-compatible SVE code for all functions"),
- cl::Hidden);
-
static cl::opt<AArch64PAuth::AuthCheckMethod>
AuthenticatedLRCheckMethod("aarch64-authenticated-lr-check-method",
cl::Hidden,
@@ -316,15 +310,14 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, StringRef CPU,
const TargetMachine &TM, bool LittleEndian,
unsigned MinSVEVectorSizeInBitsOverride,
unsigned MaxSVEVectorSizeInBitsOverride,
- bool StreamingSVEMode,
- bool StreamingCompatibleSVEMode,
+ bool IsStreaming, bool IsStreamingCompatible,
bool HasMinSize)
: AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS),
ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
ReserveXRegisterForRA(AArch64::GPR64commonRegClass.getNumRegs()),
CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
- IsLittle(LittleEndian), StreamingSVEMode(StreamingSVEMode),
- StreamingCompatibleSVEMode(StreamingCompatibleSVEMode),
+ IsLittle(LittleEndian), IsStreaming(IsStreaming),
+ IsStreamingCompatible(IsStreamingCompatible),
MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU, HasMinSize)),
@@ -547,20 +540,6 @@ void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
bool AArch64Subtarget::useAA() const { return UseAA; }
-bool AArch64Subtarget::isStreamingCompatible() const {
- return StreamingCompatibleSVEMode || ForceStreamingCompatibleSVE;
-}
-
-bool AArch64Subtarget::isNeonAvailable() const {
- return hasNEON() &&
- (hasSMEFA64() || (!isStreaming() && !isStreamingCompatible()));
-}
-
-bool AArch64Subtarget::isSVEAvailable() const {
- return hasSVE() &&
- (hasSMEFA64() || (!isStreaming() && !isStreamingCompatible()));
-}
-
// If return address signing is enabled, tail calls are emitted as follows:
//
// ```
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 3f3eefc..7ef7a89 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -79,8 +79,8 @@ protected:
bool IsLittle;
- bool StreamingSVEMode;
- bool StreamingCompatibleSVEMode;
+ bool IsStreaming;
+ bool IsStreamingCompatible;
unsigned MinSVEVectorSizeInBits;
unsigned MaxSVEVectorSizeInBits;
unsigned VScaleForTuning = 2;
@@ -120,8 +120,7 @@ public:
StringRef FS, const TargetMachine &TM, bool LittleEndian,
unsigned MinSVEVectorSizeInBitsOverride = 0,
unsigned MaxSVEVectorSizeInBitsOverride = 0,
- bool StreamingSVEMode = false,
- bool StreamingCompatibleSVEMode = false,
+ bool IsStreaming = false, bool IsStreamingCompatible = false,
bool HasMinSize = false);
// Getters for SubtargetFeatures defined in tablegen
@@ -165,20 +164,26 @@ public:
bool isXRaySupported() const override { return true; }
/// Returns true if the function has a streaming body.
- bool isStreaming() const { return StreamingSVEMode; }
+ bool isStreaming() const { return IsStreaming; }
/// Returns true if the function has a streaming-compatible body.
- bool isStreamingCompatible() const;
+ bool isStreamingCompatible() const { return IsStreamingCompatible; }
/// Returns true if the target has NEON and the function at runtime is known
/// to have NEON enabled (e.g. the function is known not to be in streaming-SVE
/// mode, which disables NEON instructions).
- bool isNeonAvailable() const;
+ bool isNeonAvailable() const {
+ return hasNEON() &&
+ (hasSMEFA64() || (!isStreaming() && !isStreamingCompatible()));
+ }
/// Returns true if the target has SVE and can use the full range of SVE
/// instructions, for example because it knows the function is known not to be
/// in streaming-SVE mode or when the target has FEAT_FA64 enabled.
- bool isSVEAvailable() const;
+ bool isSVEAvailable() const {
+ return hasSVE() &&
+ (hasSMEFA64() || (!isStreaming() && !isStreamingCompatible()));
+ }
unsigned getMinVectorRegisterBitWidth() const {
// Don't assume any minimum vector size when PSTATE.SM may not be 0, because
diff --git a/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/llvm/lib/Target/AArch64/AArch64SystemOperands.td
index 0564741..0b5bc97 100644
--- a/llvm/lib/Target/AArch64/AArch64SystemOperands.td
+++ b/llvm/lib/Target/AArch64/AArch64SystemOperands.td
@@ -1943,11 +1943,9 @@ def : RWSysReg<"PM", 0b11, 0b000, 0b0100, 0b0011, 0b001>;
// 2023 ISA Extension
// AArch64 Floating-point Mode Register controls behaviors of the FP8
// instructions (FEAT_FPMR)
-let Requires = [{ {AArch64::FeatureFPMR} }] in {
// Op0 Op1 CRn CRm Op2
def : ROSysReg<"ID_AA64FPFR0_EL1", 0b11, 0b000, 0b0000, 0b0100, 0b111>;
def : RWSysReg<"FPMR", 0b11, 0b011, 0b0100, 0b0100, 0b010>;
-}
// v9.5a Software Stepping Enhancements (FEAT_STEP2)
// Op0 Op1 CRn CRm Op2
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index df802cf..945ab5c 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -187,6 +187,11 @@ static cl::opt<unsigned> SVEVectorBitsMinOpt(
"with zero meaning no minimum size is assumed."),
cl::init(0), cl::Hidden);
+static cl::opt<bool> ForceStreamingCompatible(
+ "force-streaming-compatible",
+ cl::desc("Force the use of streaming-compatible code for all functions"),
+ cl::init(false), cl::Hidden);
+
extern cl::opt<bool> EnableHomogeneousPrologEpilog;
static cl::opt<bool> EnableGISelLoadStoreOptPreLegal(
@@ -408,10 +413,11 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
StringRef FS = FSAttr.isValid() ? FSAttr.getValueAsString() : TargetFS;
bool HasMinSize = F.hasMinSize();
- bool StreamingSVEMode = F.hasFnAttribute("aarch64_pstate_sm_enabled") ||
- F.hasFnAttribute("aarch64_pstate_sm_body");
- bool StreamingCompatibleSVEMode =
- F.hasFnAttribute("aarch64_pstate_sm_compatible");
+ bool IsStreaming = F.hasFnAttribute("aarch64_pstate_sm_enabled") ||
+ F.hasFnAttribute("aarch64_pstate_sm_body");
+ bool IsStreamingCompatible =
+ F.hasFnAttribute("aarch64_pstate_sm_compatible") ||
+ ForceStreamingCompatible;
unsigned MinSVEVectorSize = 0;
unsigned MaxSVEVectorSize = 0;
@@ -439,10 +445,9 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
SmallString<512> Key;
raw_svector_ostream(Key) << "SVEMin" << MinSVEVectorSize << "SVEMax"
- << MaxSVEVectorSize
- << "StreamingSVEMode=" << StreamingSVEMode
- << "StreamingCompatibleSVEMode="
- << StreamingCompatibleSVEMode << CPU << TuneCPU << FS
+ << MaxSVEVectorSize << "IsStreaming=" << IsStreaming
+ << "IsStreamingCompatible=" << IsStreamingCompatible
+ << CPU << TuneCPU << FS
<< "HasMinSize=" << HasMinSize;
auto &I = SubtargetMap[Key];
@@ -453,12 +458,10 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
resetTargetOptions(F);
I = std::make_unique<AArch64Subtarget>(
TargetTriple, CPU, TuneCPU, FS, *this, isLittle, MinSVEVectorSize,
- MaxSVEVectorSize, StreamingSVEMode, StreamingCompatibleSVEMode,
- HasMinSize);
+ MaxSVEVectorSize, IsStreaming, IsStreamingCompatible, HasMinSize);
}
- assert((!StreamingSVEMode || I->hasSME()) &&
- "Expected SME to be available");
+ assert((!IsStreaming || I->hasSME()) && "Expected SME to be available");
return I.get();
}
diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index c9bba9b..13a68b7 100644
--- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -3718,7 +3718,6 @@ static const struct Extension {
{"sb", {AArch64::FeatureSB}},
{"ssbs", {AArch64::FeatureSSBS}},
{"tme", {AArch64::FeatureTME}},
- {"fpmr", {AArch64::FeatureFPMR}},
{"fp8", {AArch64::FeatureFP8}},
{"faminmax", {AArch64::FeatureFAMINMAX}},
{"fp8fma", {AArch64::FeatureFP8FMA}},
@@ -3731,7 +3730,7 @@ static const struct Extension {
{"sme-lutv2", {AArch64::FeatureSME_LUTv2}},
{"sme-f8f16", {AArch64::FeatureSMEF8F16}},
{"sme-f8f32", {AArch64::FeatureSMEF8F32}},
- {"sme-fa64", {AArch64::FeatureSMEFA64}},
+ {"sme-fa64", {AArch64::FeatureSMEFA64}},
{"cpa", {AArch64::FeatureCPA}},
{"tlbiw", {AArch64::FeatureTLBIW}},
};
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index 0dd4a78..6493a2e 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -430,6 +430,55 @@ public:
return false;
}
+ bool clearsSuperRegisters(const MCRegisterInfo &MRI, const MCInst &Inst,
+ APInt &Mask) const override {
+ const MCInstrDesc &Desc = Info->get(Inst.getOpcode());
+ unsigned NumDefs = Desc.getNumDefs();
+ unsigned NumImplicitDefs = Desc.implicit_defs().size();
+ assert(Mask.getBitWidth() == NumDefs + NumImplicitDefs &&
+ "Unexpected number of bits in the mask!");
+ // 32-bit General Purpose Register class.
+ const MCRegisterClass &GPR32RC = MRI.getRegClass(AArch64::GPR32RegClassID);
+ // Floating Point Register classes.
+ const MCRegisterClass &FPR8RC = MRI.getRegClass(AArch64::FPR8RegClassID);
+ const MCRegisterClass &FPR16RC = MRI.getRegClass(AArch64::FPR16RegClassID);
+ const MCRegisterClass &FPR32RC = MRI.getRegClass(AArch64::FPR32RegClassID);
+ const MCRegisterClass &FPR64RC = MRI.getRegClass(AArch64::FPR64RegClassID);
+ const MCRegisterClass &FPR128RC =
+ MRI.getRegClass(AArch64::FPR128RegClassID);
+
+ auto ClearsSuperReg = [=](unsigned RegID) {
+ // An update to the lower 32 bits of a 64-bit integer register is
+ // architecturally defined to zero extend the upper 32 bits on a write.
+ if (GPR32RC.contains(RegID))
+ return true;
+ // SIMD&FP instructions operating on scalar data only acccess the lower
+ // bits of a register, the upper bits are zero extended on a write. For
+ // SIMD vector registers smaller than 128-bits, the upper 64-bits of the
+ // register are zero extended on a write.
+ // When VL is higher than 128 bits, any write to a SIMD&FP register sets
+ // bits higher than 128 to zero.
+ return FPR8RC.contains(RegID) || FPR16RC.contains(RegID) ||
+ FPR32RC.contains(RegID) || FPR64RC.contains(RegID) ||
+ FPR128RC.contains(RegID);
+ };
+
+ Mask.clearAllBits();
+ for (unsigned I = 0, E = NumDefs; I < E; ++I) {
+ const MCOperand &Op = Inst.getOperand(I);
+ if (ClearsSuperReg(Op.getReg()))
+ Mask.setBit(I);
+ }
+
+ for (unsigned I = 0, E = NumImplicitDefs; I < E; ++I) {
+ const MCPhysReg Reg = Desc.implicit_defs()[I];
+ if (ClearsSuperReg(Reg))
+ Mask.setBit(NumDefs + I);
+ }
+
+ return Mask.getBoolValue();
+ }
+
std::vector<std::pair<uint64_t, uint64_t>>
findPltEntries(uint64_t PltSectionVA, ArrayRef<uint8_t> PltContents,
const Triple &TargetTriple) const override {
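The clearsSuperRegisters override added above reports, per definition of an MCInst, whether the write also zeroes the rest of the super-register (32-bit GPR writes zero-extend to 64 bits; scalar and sub-128-bit SIMD&FP writes zero the remaining lanes). A hedged sketch of how a client of MCInstrAnalysis might consume the hook; the helper name and the idea of requiring every def to clear its super-register are assumptions of this example, not part of the patch:

#include "llvm/ADT/APInt.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
using namespace llvm;

// Returns true if every definition of Inst also clears the high bits of its
// super-register, i.e. the instruction leaves no stale upper lanes behind.
bool allDefsClearSuperRegs(const MCInstrAnalysis &MCIA, const MCInstrInfo &MCII,
                           const MCRegisterInfo &MRI, const MCInst &Inst) {
  const MCInstrDesc &Desc = MCII.get(Inst.getOpcode());
  unsigned NumDefs = Desc.getNumDefs() + Desc.implicit_defs().size();
  APInt Cleared(NumDefs, 0); // one bit per explicit and implicit def
  if (!MCIA.clearsSuperRegisters(MRI, Inst, Cleared))
    return false;
  return Cleared.popcount() == NumDefs;
}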
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index bd48a5f..cad4a34 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -19,7 +19,6 @@
#include "AMDGPU.h"
#include "AMDGPUHSAMetadataStreamer.h"
#include "AMDGPUResourceUsageAnalysis.h"
-#include "AMDKernelCodeT.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCExpr.h"
@@ -29,6 +28,7 @@
#include "SIMachineFunctionInfo.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "Utils/AMDKernelCodeTUtils.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -205,8 +205,9 @@ void AMDGPUAsmPrinter::emitFunctionBodyStart() {
if (STM.isMesaKernel(F) &&
(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
- amd_kernel_code_t KernelCode;
+ AMDGPUMCKernelCodeT KernelCode;
getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
+ KernelCode.validate(&STM, MF->getContext());
getTargetStreamer()->EmitAMDKernelCodeT(KernelCode);
}
@@ -1317,7 +1318,7 @@ static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
}
}
-void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
+void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out,
const SIProgramInfo &CurrentProgramInfo,
const MachineFunction &MF) const {
const Function &F = MF.getFunction();
@@ -1328,24 +1329,22 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
MCContext &Ctx = MF.getContext();
- AMDGPU::initDefaultAMDKernelCodeT(Out, &STM);
+ Out.initDefault(&STM, Ctx, /*InitMCExpr=*/false);
- Out.compute_pgm_resource_registers =
- CurrentProgramInfo.getComputePGMRSrc1(STM) |
- (CurrentProgramInfo.getComputePGMRSrc2() << 32);
+ Out.compute_pgm_resource1_registers =
+ CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx);
+ Out.compute_pgm_resource2_registers =
+ CurrentProgramInfo.getComputePGMRSrc2(Ctx);
Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;
- if (getMCExprValue(CurrentProgramInfo.DynamicCallStack, Ctx))
- Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK;
+ Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack;
- AMD_HSA_BITS_SET(Out.code_properties,
- AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
+ AMD_HSA_BITS_SET(Out.code_properties, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
- Out.code_properties |=
- AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
+ Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
}
if (UserSGPRInfo.hasDispatchPtr())
@@ -1371,10 +1370,9 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
Align MaxKernArgAlign;
Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
- Out.wavefront_sgpr_count = getMCExprValue(CurrentProgramInfo.NumSGPR, Ctx);
- Out.workitem_vgpr_count = getMCExprValue(CurrentProgramInfo.NumVGPR, Ctx);
- Out.workitem_private_segment_byte_size =
- getMCExprValue(CurrentProgramInfo.ScratchSize, Ctx);
+ Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
+ Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
+ Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
// kernarg_segment_alignment is specified as log of the alignment.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 16d8952..87156f2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -17,8 +17,6 @@
#include "SIProgramInfo.h"
#include "llvm/CodeGen/AsmPrinter.h"
-struct amd_kernel_code_t;
-
namespace llvm {
class AMDGPUMachineFunction;
@@ -29,6 +27,7 @@ class MCOperand;
namespace AMDGPU {
struct MCKernelDescriptor;
+struct AMDGPUMCKernelCodeT;
namespace HSAMD {
class MetadataStreamer;
}
@@ -50,7 +49,8 @@ private:
uint64_t getFunctionCodeSize(const MachineFunction &MF) const;
void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF);
- void getAmdKernelCode(amd_kernel_code_t &Out, const SIProgramInfo &KernelInfo,
+ void getAmdKernelCode(AMDGPU::AMDGPUMCKernelCodeT &Out,
+ const SIProgramInfo &KernelInfo,
const MachineFunction &MF) const;
/// Emit register usage information so that the GPU driver
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index c11c7a5..e359573 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2526,6 +2526,14 @@ void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) {
CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}
+void AMDGPUDAGToDAGISel::SelectPOPSExitingWaveID(SDNode *N) {
+ // TODO: Select this with a tablegen pattern. This is tricky because the
+ // intrinsic is IntrReadMem/IntrWriteMem but the instruction is not marked
+ // mayLoad/mayStore and tablegen complains about the mismatch.
+ SDValue Reg = CurDAG->getRegister(AMDGPU::SRC_POPS_EXITING_WAVE_ID, MVT::i32);
+ CurDAG->SelectNodeTo(N, AMDGPU::S_MOV_B32, N->getVTList(), Reg);
+}
+
static unsigned gwsIntrinToOpcode(unsigned IntrID) {
switch (IntrID) {
case Intrinsic::amdgcn_ds_gws_init:
@@ -2682,6 +2690,9 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
SelectDSBvhStackIntrinsic(N);
return;
+ case Intrinsic::amdgcn_pops_exiting_wave_id:
+ SelectPOPSExitingWaveID(N);
+ return;
}
SelectCode(N);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index f987b74..53d25b4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -274,6 +274,7 @@ private:
void SelectFP_EXTEND(SDNode *N);
void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
void SelectDSBvhStackIntrinsic(SDNode *N);
+ void SelectPOPSExitingWaveID(SDNode *N);
void SelectDS_GWS(SDNode *N, unsigned IntrID);
void SelectInterpP1F16(SDNode *N);
void SelectINTRINSIC_W_CHAIN(SDNode *N);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index b48a094..04d9bb5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2079,6 +2079,21 @@ bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}
+bool AMDGPUInstructionSelector::selectPOPSExitingWaveID(
+ MachineInstr &MI) const {
+ Register Dst = MI.getOperand(0).getReg();
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock *MBB = MI.getParent();
+
+ // TODO: Select this with a tablegen pattern. This is tricky because the
+ // intrinsic is IntrReadMem/IntrWriteMem but the instruction is not marked
+ // mayLoad/mayStore and tablegen complains about the mismatch.
+ auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst)
+ .addReg(AMDGPU::SRC_POPS_EXITING_WAVE_ID);
+ MI.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
+
bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
MachineInstr &I) const {
unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
@@ -2129,6 +2144,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
return selectSBarrierSignalIsfirst(I, IntrinsicID);
case Intrinsic::amdgcn_s_barrier_leave:
return selectSBarrierLeave(I);
+ case Intrinsic::amdgcn_pops_exiting_wave_id:
+ return selectPOPSExitingWaveID(I);
}
return selectImpl(I, *CoverageInfo);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index f561d5d..48f3b18 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -125,6 +125,7 @@ private:
bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
bool selectSBarrier(MachineInstr &MI) const;
bool selectDSBvhStackIntrinsic(MachineInstr &MI) const;
+ bool selectPOPSExitingWaveID(MachineInstr &MI) const;
bool selectImageIntrinsic(MachineInstr &MI,
const AMDGPU::ImageDimIntrinsicInfo *Intr) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index aab79ce..c515138 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -1215,16 +1215,36 @@ bool AMDGPULibCalls::fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B,
"__rootn2div");
replaceCall(FPOp, nval);
return true;
- } else if (ci_opr1 == -2) { // rootn(x, -2) = rsqrt(x)
- if (FunctionCallee FPExpr =
- getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_RSQRT, FInfo))) {
- LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> rsqrt(" << *opr0
- << ")\n");
- Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2rsqrt");
- replaceCall(FPOp, nval);
- return true;
- }
}
+
+ if (ci_opr1 == -2 &&
+ shouldReplaceLibcallWithIntrinsic(CI,
+ /*AllowMinSizeF32=*/true,
+ /*AllowF64=*/true)) {
+ // rootn(x, -2) = rsqrt(x)
+
+ // The original rootn had looser ulp requirements than the resultant sqrt
+ // and fdiv.
+ MDBuilder MDHelper(M->getContext());
+ MDNode *FPMD = MDHelper.createFPMath(std::max(FPOp->getFPAccuracy(), 2.0f));
+
+ // TODO: Could handle strictfp but need to fix strict sqrt emission
+ FastMathFlags FMF = FPOp->getFastMathFlags();
+ FMF.setAllowContract(true);
+
+ CallInst *Sqrt = B.CreateUnaryIntrinsic(Intrinsic::sqrt, opr0, CI);
+ Instruction *RSqrt = cast<Instruction>(
+ B.CreateFDiv(ConstantFP::get(opr0->getType(), 1.0), Sqrt));
+ Sqrt->setFastMathFlags(FMF);
+ RSqrt->setFastMathFlags(FMF);
+ RSqrt->setMetadata(LLVMContext::MD_fpmath, FPMD);
+
+ LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> rsqrt(" << *opr0
+ << ")\n");
+ replaceCall(CI, RSqrt);
+ return true;
+ }
+
return false;
}
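The fold_rootn change above rewrites rootn(x, -2) into 1.0 / sqrt(x) built inline with contract fast-math flags and a relaxed !fpmath bound, instead of calling a library rsqrt. The identity it relies on is x^(-1/2) == 1 / sqrt(x); a quick numeric check in plain C++ (illustrative only, unrelated to the OpenCL builtin library):

#include <cmath>
#include <cstdio>

int main() {
  for (double X : {0.25, 2.0, 9.0}) {
    double Rootn = std::pow(X, -0.5);  // rootn(x, -2) = x^(1/-2)
    double RSqrt = 1.0 / std::sqrt(X); // the replacement sequence
    std::printf("x=%g  rootn=%g  1/sqrt=%g\n", X, Rootn, RSqrt);
  }
  return 0;
}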
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 56345d1..dbb42a6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5132,6 +5132,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
break;
}
+ case Intrinsic::amdgcn_pops_exiting_wave_id:
+ return getDefaultMappingSOP(MI);
default:
return getInvalidInstructionMapping();
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 84320d2..437e01c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1129,31 +1129,56 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
int Index, VectorType *SubTp,
ArrayRef<const Value *> Args,
const Instruction *CxtI) {
+ if (!isa<FixedVectorType>(VT))
+ return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);
+
Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp);
- // Treat extractsubvector as single op permutation.
- bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
- if (IsExtractSubvector)
- Kind = TTI::SK_PermuteSingleSrc;
-
- if (ST->hasVOP3PInsts()) {
- if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
- DL.getTypeSizeInBits(VT->getElementType()) == 16) {
- // With op_sel VOP3P instructions freely can access the low half or high
- // half of a register, so any swizzle is free.
- switch (Kind) {
- case TTI::SK_Broadcast:
- case TTI::SK_Reverse:
- case TTI::SK_PermuteSingleSrc:
+ // Larger vector widths may require additional instructions, but are
+ // typically cheaper than scalarized versions.
+ unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
+ if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
+ DL.getTypeSizeInBits(VT->getElementType()) == 16) {
+ bool HasVOP3P = ST->hasVOP3PInsts();
+ unsigned RequestedElts =
+ count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
+ if (RequestedElts == 0)
+ return 0;
+ switch (Kind) {
+ case TTI::SK_Broadcast:
+ case TTI::SK_Reverse:
+ case TTI::SK_PermuteSingleSrc: {
+ // With op_sel, VOP3P instructions can freely access the low half or high
+ // half of a register, so any swizzle of two elements is free.
+ if (HasVOP3P && NumVectorElts == 2)
return 0;
- default:
- break;
- }
+ unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
+ // SK_Broadcast just reuses the same mask
+ unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
+ return NumPerms + NumPermMasks;
+ }
+ case TTI::SK_ExtractSubvector:
+ case TTI::SK_InsertSubvector: {
+ // Even-aligned accesses are free
+ if (!(Index % 2))
+ return 0;
+ // Insert/extract subvectors only require shifts / extract code to get the
+ // relevant bits
+ return alignTo(RequestedElts, 2) / 2;
+ }
+ case TTI::SK_PermuteTwoSrc:
+ case TTI::SK_Splice:
+ case TTI::SK_Select: {
+ unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
+ // SK_Select just reuses the same mask
+ unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
+ return NumPerms + NumPermMasks;
+ }
+
+ default:
+ break;
}
}
- // Restore optimal kind.
- if (IsExtractSubvector)
- Kind = TTI::SK_ExtractSubvector;
return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);
}
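The reworked getShuffleCost above prices 16-bit element shuffles by the number of 32-bit permutes they need, alignTo(RequestedElts, 2) / 2, plus one mask constant per permute (or a single shared mask for broadcast and select). A standalone re-computation of that heuristic, using a hypothetical helper name rather than anything from the patch:

#include <cstdio>

// Cost of a 16-bit element shuffle under the heuristic above: one permute per
// pair of requested elements, plus the mask constants feeding those permutes.
unsigned shuffle16BitCost(unsigned RequestedElts, bool SharedMask) {
  unsigned NumPerms = (RequestedElts + 1) / 2; // alignTo(RequestedElts, 2) / 2
  unsigned NumMasks = SharedMask ? 1 : NumPerms;
  return NumPerms + NumMasks;
}

int main() {
  // Reversing all 8 lanes of a <8 x half>: 4 permutes + 4 masks = 8.
  std::printf("reverse   v8f16: %u\n", shuffle16BitCost(8, /*SharedMask=*/false));
  // Broadcasting one lane reuses a single mask: 4 permutes + 1 mask = 5.
  std::printf("broadcast v8f16: %u\n", shuffle16BitCost(8, /*SharedMask=*/true));
  return 0;
}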
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index c08c35c..dcd4b22 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1340,7 +1340,7 @@ private:
bool ParseDirectiveAMDGCNTarget();
bool ParseDirectiveAMDHSACodeObjectVersion();
bool ParseDirectiveAMDHSAKernel();
- bool ParseAMDKernelCodeTValue(StringRef ID, amd_kernel_code_t &Header);
+ bool ParseAMDKernelCodeTValue(StringRef ID, AMDGPUMCKernelCodeT &Header);
bool ParseDirectiveAMDKernelCodeT();
// TODO: Possibly make subtargetHasRegister const.
bool subtargetHasRegister(const MCRegisterInfo &MRI, unsigned RegNo);
@@ -5863,7 +5863,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSACodeObjectVersion() {
}
bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID,
- amd_kernel_code_t &Header) {
+ AMDGPUMCKernelCodeT &C) {
// max_scratch_backing_memory_byte_size is deprecated. Ignore it while parsing
// assembly for backwards compatibility.
if (ID == "max_scratch_backing_memory_byte_size") {
@@ -5873,25 +5873,13 @@ bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID,
SmallString<40> ErrStr;
raw_svector_ostream Err(ErrStr);
- if (!parseAmdKernelCodeField(ID, getParser(), Header, Err)) {
+ if (!C.ParseKernelCodeT(ID, getParser(), Err)) {
return TokError(Err.str());
}
Lex();
- if (ID == "enable_dx10_clamp") {
- if (G_00B848_DX10_CLAMP(Header.compute_pgm_resource_registers) &&
- isGFX12Plus())
- return TokError("enable_dx10_clamp=1 is not allowed on GFX12+");
- }
-
- if (ID == "enable_ieee_mode") {
- if (G_00B848_IEEE_MODE(Header.compute_pgm_resource_registers) &&
- isGFX12Plus())
- return TokError("enable_ieee_mode=1 is not allowed on GFX12+");
- }
-
if (ID == "enable_wavefront_size32") {
- if (Header.code_properties & AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32) {
+ if (C.code_properties & AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32) {
if (!isGFX10Plus())
return TokError("enable_wavefront_size32=1 is only allowed on GFX10+");
if (!getFeatureBits()[AMDGPU::FeatureWavefrontSize32])
@@ -5903,41 +5891,23 @@ bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID,
}
if (ID == "wavefront_size") {
- if (Header.wavefront_size == 5) {
+ if (C.wavefront_size == 5) {
if (!isGFX10Plus())
return TokError("wavefront_size=5 is only allowed on GFX10+");
if (!getFeatureBits()[AMDGPU::FeatureWavefrontSize32])
return TokError("wavefront_size=5 requires +WavefrontSize32");
- } else if (Header.wavefront_size == 6) {
+ } else if (C.wavefront_size == 6) {
if (!getFeatureBits()[AMDGPU::FeatureWavefrontSize64])
return TokError("wavefront_size=6 requires +WavefrontSize64");
}
}
- if (ID == "enable_wgp_mode") {
- if (G_00B848_WGP_MODE(Header.compute_pgm_resource_registers) &&
- !isGFX10Plus())
- return TokError("enable_wgp_mode=1 is only allowed on GFX10+");
- }
-
- if (ID == "enable_mem_ordered") {
- if (G_00B848_MEM_ORDERED(Header.compute_pgm_resource_registers) &&
- !isGFX10Plus())
- return TokError("enable_mem_ordered=1 is only allowed on GFX10+");
- }
-
- if (ID == "enable_fwd_progress") {
- if (G_00B848_FWD_PROGRESS(Header.compute_pgm_resource_registers) &&
- !isGFX10Plus())
- return TokError("enable_fwd_progress=1 is only allowed on GFX10+");
- }
-
return false;
}
bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() {
- amd_kernel_code_t Header;
- AMDGPU::initDefaultAMDKernelCodeT(Header, &getSTI());
+ AMDGPUMCKernelCodeT KernelCode;
+ KernelCode.initDefault(&getSTI(), getContext());
while (true) {
// Lex EndOfStatement. This is in a while loop, because lexing a comment
@@ -5951,11 +5921,12 @@ bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() {
if (ID == ".end_amd_kernel_code_t")
break;
- if (ParseAMDKernelCodeTValue(ID, Header))
+ if (ParseAMDKernelCodeTValue(ID, KernelCode))
return true;
}
- getTargetStreamer().EmitAMDKernelCodeT(Header);
+ KernelCode.validate(&getSTI(), getContext());
+ getTargetStreamer().EmitAMDKernelCodeT(KernelCode);
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index b754867..db5b467 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1312,6 +1312,9 @@ public:
// \returns true if the target has IEEE fminimum/fmaximum instructions
bool hasIEEEMinMax() const { return getGeneration() >= GFX12; }
+ // \returns true if the target has IEEE fminimum3/fmaximum3 instructions
+ bool hasIEEEMinMax3() const { return hasIEEEMinMax(); }
+
// \returns true if the target has WG_RR_MODE kernel descriptor mode bit
bool hasRrWGMode() const { return getGeneration() >= GFX12; }
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 02fe7be..00e64e3 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -13,7 +13,6 @@
#include "AMDGPUTargetStreamer.h"
#include "AMDGPUMCKernelDescriptor.h"
#include "AMDGPUPTNote.h"
-#include "AMDKernelCodeT.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDKernelCodeTUtils.h"
#include "llvm/BinaryFormat/AMDGPUMetadataVerifier.h"
@@ -240,10 +239,9 @@ void AMDGPUTargetAsmStreamer::EmitDirectiveAMDHSACodeObjectVersion(
OS << "\t.amdhsa_code_object_version " << COV << '\n';
}
-void
-AMDGPUTargetAsmStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) {
+void AMDGPUTargetAsmStreamer::EmitAMDKernelCodeT(AMDGPUMCKernelCodeT &Header) {
OS << "\t.amd_kernel_code_t\n";
- dumpAmdKernelCode(&Header, OS, "\t\t");
+ Header.EmitKernelCodeT(OS, getContext());
OS << "\t.end_amd_kernel_code_t\n";
}
@@ -789,12 +787,10 @@ unsigned AMDGPUTargetELFStreamer::getEFlagsV6() {
void AMDGPUTargetELFStreamer::EmitDirectiveAMDGCNTarget() {}
-void
-AMDGPUTargetELFStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) {
-
+void AMDGPUTargetELFStreamer::EmitAMDKernelCodeT(AMDGPUMCKernelCodeT &Header) {
MCStreamer &OS = getStreamer();
OS.pushSection();
- OS.emitBytes(StringRef((const char*)&Header, sizeof(Header)));
+ Header.EmitKernelCodeT(OS, getContext());
OS.popSection();
}
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index 706897a..e5c9006 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -13,8 +13,6 @@
#include "Utils/AMDGPUPALMetadata.h"
#include "llvm/MC/MCStreamer.h"
-struct amd_kernel_code_t;
-
namespace llvm {
class MCELFStreamer;
@@ -23,6 +21,7 @@ class formatted_raw_ostream;
namespace AMDGPU {
+struct AMDGPUMCKernelCodeT;
struct MCKernelDescriptor;
namespace HSAMD {
struct Metadata;
@@ -54,7 +53,7 @@ public:
CodeObjectVersion = COV;
}
- virtual void EmitAMDKernelCodeT(const amd_kernel_code_t &Header){};
+ virtual void EmitAMDKernelCodeT(AMDGPU::AMDGPUMCKernelCodeT &Header) {};
virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type){};
@@ -130,7 +129,7 @@ public:
void EmitDirectiveAMDHSACodeObjectVersion(unsigned COV) override;
- void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override;
+ void EmitAMDKernelCodeT(AMDGPU::AMDGPUMCKernelCodeT &Header) override;
void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override;
@@ -186,7 +185,7 @@ public:
void EmitDirectiveAMDGCNTarget() override;
- void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override;
+ void EmitAMDKernelCodeT(AMDGPU::AMDGPUMCKernelCodeT &Header) override;
void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override;
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 6d0e0b3..1e9bfc7 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -1111,7 +1111,7 @@ enum Type { TRAP = -2, WORKGROUP = -1 };
#define C_00B84C_LDS_SIZE 0xFF007FFF
#define S_00B84C_EXCP_EN(x) (((x) & 0x7F) << 24)
#define G_00B84C_EXCP_EN(x) (((x) >> 24) & 0x7F)
-#define C_00B84C_EXCP_EN
+#define C_00B84C_EXCP_EN 0x80FFFFFF
#define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC
#define R_0286D0_SPI_PS_INPUT_ADDR 0x0286D0
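The SIDefines.h hunk above gives C_00B84C_EXCP_EN the value it was missing; the C_* clear masks are simply the bitwise complement of the corresponding field mask, here the 7-bit EXCP_EN field at bit 24. A short compile-time check of the constant:

#include <cstdint>

// The C_* masks clear a field: they are the bitwise complement of the field mask.
constexpr std::uint32_t ExcpEnField = 0x7Fu << 24; // EXCP_EN occupies bits [30:24]
static_assert(static_cast<std::uint32_t>(~ExcpEnField) == 0x80FFFFFFu,
              "C_00B84C_EXCP_EN must clear exactly the EXCP_EN bits");

int main() { return 0; }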
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 42e1c1c..1d2a5ff 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -957,6 +957,11 @@ const GCNSubtarget *SITargetLowering::getSubtarget() const {
return Subtarget;
}
+ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {
+ static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
+ return RCRegs;
+}
+
//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//
@@ -7588,8 +7593,7 @@ static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
? (ReqRetNumElts + 1) / 2
: ReqRetNumElts;
- int MaskPopDwords = (!IsD16 || (IsD16 && Unpacked)) ?
- DMaskPop : (DMaskPop + 1) / 2;
+ int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
MVT DataDwordVT = NumDataDwords == 1 ?
MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
@@ -13195,6 +13199,33 @@ SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
return SDValue();
}
+/// \return true if the subtarget supports minimum3 and maximum3 with the given
+/// base min/max opcode \p Opc for type \p VT.
+static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
+ EVT VT) {
+ switch (Opc) {
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ case ISD::FMINNUM_IEEE:
+ case ISD::FMAXNUM_IEEE:
+ case AMDGPUISD::FMIN_LEGACY:
+ case AMDGPUISD::FMAX_LEGACY:
+ return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
+ case ISD::FMINIMUM:
+ case ISD::FMAXIMUM:
+ return (VT == MVT::f32 || VT == MVT::f16) && Subtarget.hasIEEEMinMax3();
+ case ISD::SMAX:
+ case ISD::SMIN:
+ case ISD::UMAX:
+ case ISD::UMIN:
+ return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
+ default:
+ return false;
+ }
+
+ llvm_unreachable("not a min/max opcode");
+}
+
SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -13207,10 +13238,7 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
// Only do this if the inner op has one use since this will just increase
// register pressure for no benefit.
- if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
- !VT.isVector() &&
- (VT == MVT::i32 || VT == MVT::f32 ||
- ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->hasMin3Max3_16()))) {
+ if (supportsMin3Max3(*Subtarget, Opc, VT)) {
// max(max(a, b), c) -> max3(a, b, c)
// min(min(a, b), c) -> min3(a, b, c)
if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 08aa2a5..fed73f4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -287,6 +287,8 @@ public:
const GCNSubtarget *getSubtarget() const;
+ ArrayRef<MCPhysReg> getRoundingControlRegisters() const override;
+
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT,
EVT SrcVT) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 08351c4..bb5f166 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2031,50 +2031,57 @@ MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
MachineInstr &MI,
const DebugLoc &DL) const {
MachineFunction *MF = MBB.getParent();
- MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
- MachineBasicBlock *HaltLoop = MF->CreateMachineBasicBlock();
- MF->push_back(HaltLoop);
-
constexpr unsigned DoorbellIDMask = 0x3ff;
constexpr unsigned ECQueueWaveAbort = 0x400;
+ MachineBasicBlock *TrapBB = &MBB;
+ MachineBasicBlock *ContBB = &MBB;
+ MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
+
+ if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
+ ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
+ TrapBB = MF->CreateMachineBasicBlock();
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
+ MF->push_back(TrapBB);
+ MBB.addSuccessor(TrapBB);
+ }
+
// Start with a `s_trap 2`, if we're in PRIV=1 and we need the workaround this
// will be a nop.
- BuildMI(MBB, MI, DL, get(AMDGPU::S_TRAP))
+ BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
.addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(MBB, MI, DL, get(AMDGPU::S_SENDMSG_RTN_B32), DoorbellReg)
+ BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
+ DoorbellReg)
.addImm(AMDGPU::SendMsg::ID_RTN_GET_DOORBELL);
- BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
+ BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
.addUse(AMDGPU::M0);
Register DoorbellRegMasked =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(MBB, MI, DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
+ BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
.addUse(DoorbellReg)
.addImm(DoorbellIDMask);
Register SetWaveAbortBit =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(MBB, MI, DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
+ BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
.addUse(DoorbellRegMasked)
.addImm(ECQueueWaveAbort);
- BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
.addUse(SetWaveAbortBit);
- BuildMI(MBB, MI, DL, get(AMDGPU::S_SENDMSG))
+ BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
.addImm(AMDGPU::SendMsg::ID_INTERRUPT);
- BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
.addUse(AMDGPU::TTMP2);
- BuildMI(MBB, MI, DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoop);
-
- BuildMI(*HaltLoop, HaltLoop->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
- BuildMI(*HaltLoop, HaltLoop->end(), DL, get(AMDGPU::S_BRANCH))
- .addMBB(HaltLoop);
+ BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
+ TrapBB->addSuccessor(HaltLoopBB);
- if (SplitBB != &MBB)
- MBB.removeSuccessor(SplitBB);
- MBB.addSuccessor(HaltLoop);
- HaltLoop->addSuccessor(HaltLoop);
+ BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
+ BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
+ .addMBB(HaltLoopBB);
+ MF->push_back(HaltLoopBB);
+ HaltLoopBB->addSuccessor(HaltLoopBB);
- return SplitBB;
+ return ContBB;
}
unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 2beaf90..4b34fb2 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -11,6 +11,7 @@
#include "AMDGPUAsmUtils.h"
#include "AMDKernelCodeT.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "Utils/AMDKernelCodeTUtils.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/IR/Attributes.h"
@@ -1218,39 +1219,37 @@ unsigned getAllocatedNumVGPRBlocks(const MCSubtargetInfo *STI,
}
} // end namespace IsaInfo
-void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
+void initDefaultAMDKernelCodeT(AMDGPUMCKernelCodeT &KernelCode,
const MCSubtargetInfo *STI) {
IsaVersion Version = getIsaVersion(STI->getCPU());
-
- memset(&Header, 0, sizeof(Header));
-
- Header.amd_kernel_code_version_major = 1;
- Header.amd_kernel_code_version_minor = 2;
- Header.amd_machine_kind = 1; // AMD_MACHINE_KIND_AMDGPU
- Header.amd_machine_version_major = Version.Major;
- Header.amd_machine_version_minor = Version.Minor;
- Header.amd_machine_version_stepping = Version.Stepping;
- Header.kernel_code_entry_byte_offset = sizeof(Header);
- Header.wavefront_size = 6;
+ KernelCode.amd_kernel_code_version_major = 1;
+ KernelCode.amd_kernel_code_version_minor = 2;
+ KernelCode.amd_machine_kind = 1; // AMD_MACHINE_KIND_AMDGPU
+ KernelCode.amd_machine_version_major = Version.Major;
+ KernelCode.amd_machine_version_minor = Version.Minor;
+ KernelCode.amd_machine_version_stepping = Version.Stepping;
+ KernelCode.kernel_code_entry_byte_offset = sizeof(amd_kernel_code_t);
+ if (STI->getFeatureBits().test(FeatureWavefrontSize32)) {
+ KernelCode.wavefront_size = 5;
+ KernelCode.code_properties |= AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
+ } else {
+ KernelCode.wavefront_size = 6;
+ }
// If the code object does not support indirect functions, then the value must
// be 0xffffffff.
- Header.call_convention = -1;
+ KernelCode.call_convention = -1;
// These alignment values are specified in powers of two, so alignment =
// 2^n. The minimum alignment is 2^4 = 16.
- Header.kernarg_segment_alignment = 4;
- Header.group_segment_alignment = 4;
- Header.private_segment_alignment = 4;
+ KernelCode.kernarg_segment_alignment = 4;
+ KernelCode.group_segment_alignment = 4;
+ KernelCode.private_segment_alignment = 4;
if (Version.Major >= 10) {
- if (STI->getFeatureBits().test(FeatureWavefrontSize32)) {
- Header.wavefront_size = 5;
- Header.code_properties |= AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
- }
- Header.compute_pgm_resource_registers |=
- S_00B848_WGP_MODE(STI->getFeatureBits().test(FeatureCuMode) ? 0 : 1) |
- S_00B848_MEM_ORDERED(1);
+ KernelCode.compute_pgm_resource_registers |=
+ S_00B848_WGP_MODE(STI->getFeatureBits().test(FeatureCuMode) ? 0 : 1) |
+ S_00B848_MEM_ORDERED(1);
}
}
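As the comments in initDefaultAMDKernelCodeT note, several kernel-code fields are stored as log2 values: wavefront_size 5 and 6 select 32- and 64-lane waves, and the segment alignment values of 4 mean 16-byte alignment. A trivial check of those encodings:

#include <cassert>

int main() {
  assert((1u << 5) == 32u); // wavefront_size = 5 -> 32-lane waves
  assert((1u << 6) == 64u); // wavefront_size = 6 -> 64-lane waves
  assert((1u << 4) == 16u); // *_segment_alignment = 4 -> 16-byte alignment
  return 0;
}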
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index fc4147d..3cfc42a 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -37,6 +37,7 @@ class raw_ostream;
namespace AMDGPU {
+struct AMDGPUMCKernelCodeT;
struct IsaVersion;
/// Generic target versions emitted by this version of LLVM.
@@ -860,7 +861,7 @@ unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc);
LLVM_READONLY
unsigned mapWMMA3AddrTo2AddrOpcode(unsigned Opc);
-void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
+void initDefaultAMDKernelCodeT(AMDGPUMCKernelCodeT &Header,
const MCSubtargetInfo *STI);
bool isGroupSegment(const GlobalValue *GV);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
index 95ad3f3..75cb6cf 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
@@ -12,34 +12,51 @@
//
//===----------------------------------------------------------------------===//
-#define QNAME(name) amd_kernel_code_t::name
+#define QNAME(name) AMDGPUMCKernelCodeT::name
#define FLD_T(name) decltype(QNAME(name)), &QNAME(name)
-#define FIELD2(sname, aname, name) \
- RECORD(sname, aname, printField<FLD_T(name)>, parseField<FLD_T(name)>)
+#ifndef PRINTFIELD
+#define PRINTFIELD(sname, aname, name) printField<FLD_T(name)>
+#endif
-#define FIELD(name) FIELD2(name, name, name)
+#ifndef FIELD2
+#define FIELD2(sname, aname, name) \
+ RECORD(sname, aname, PRINTFIELD(sname, aname, name), parseField<FLD_T(name)>)
+#endif
+#ifndef FIELD
+#define FIELD(name) FIELD2(name, name, name)
+#endif
+#ifndef PRINTCODEPROP
#define PRINTCODEPROP(name) \
printBitField<FLD_T(code_properties),\
AMD_CODE_PROPERTY_##name##_SHIFT,\
AMD_CODE_PROPERTY_##name##_WIDTH>
+#endif
+#ifndef PARSECODEPROP
#define PARSECODEPROP(name) \
parseBitField<FLD_T(code_properties),\
AMD_CODE_PROPERTY_##name##_SHIFT,\
AMD_CODE_PROPERTY_##name##_WIDTH>
+#endif
+#ifndef CODEPROP
#define CODEPROP(name, shift) \
RECORD(name, name, PRINTCODEPROP(shift), PARSECODEPROP(shift))
+#endif
// have to define these lambdas because of Set/GetMacro
+#ifndef PRINTCOMP
#define PRINTCOMP(GetMacro, Shift) \
[](StringRef Name, const amd_kernel_code_t &C, raw_ostream &OS) { \
printName(OS, Name) << \
(int)GetMacro(C.compute_pgm_resource_registers >> Shift); \
}
+#endif
+
+#ifndef PARSECOMP
#define PARSECOMP(SetMacro, Shift) \
[](amd_kernel_code_t &C, MCAsmParser &MCParser, raw_ostream &Err) { \
int64_t Value = 0; \
@@ -49,15 +66,22 @@
C.compute_pgm_resource_registers |= SetMacro(Value) << Shift; \
return true; \
}
+#endif
+#ifndef COMPPGM
#define COMPPGM(name, aname, GetMacro, SetMacro, Shift) \
RECORD(name, aname, PRINTCOMP(GetMacro, Shift), PARSECOMP(SetMacro, Shift))
+#endif
+#ifndef COMPPGM1
#define COMPPGM1(name, aname, AccMacro) \
COMPPGM(name, aname, G_00B848_##AccMacro, S_00B848_##AccMacro, 0)
+#endif
+#ifndef COMPPGM2
#define COMPPGM2(name, aname, AccMacro) \
COMPPGM(name, aname, G_00B84C_##AccMacro, S_00B84C_##AccMacro, 32)
+#endif
///////////////////////////////////////////////////////////////////////////////
// Begin of the table
@@ -143,13 +167,14 @@ FIELD(runtime_loader_kernel_symbol)
#undef QNAME
#undef FLD_T
+#undef PRINTFIELD
#undef FIELD2
#undef FIELD
#undef PRINTCODEPROP
#undef PARSECODEPROP
#undef CODEPROP
#undef PRINTCOMP
-#undef PAPSECOMP
+#undef PARSECOMP
#undef COMPPGM
#undef COMPPGM1
#undef COMPPGM2
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
index 6bbc8c3..eaee1a2 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
@@ -6,44 +6,205 @@
//
//===----------------------------------------------------------------------===//
//
-/// \file - utility functions to parse/print amd_kernel_code_t structure
+/// \file - utility functions to parse/print AMDGPUMCKernelCodeT structure
//
//===----------------------------------------------------------------------===//
#include "AMDKernelCodeTUtils.h"
#include "AMDKernelCodeT.h"
#include "SIDefines.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/StringMap.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/IndexedMap.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
+using namespace llvm::AMDGPU;
-static ArrayRef<StringRef> get_amd_kernel_code_t_FldNames() {
- static StringRef const Table[] = {
- "", // not found placeholder
+// Generates the following for AMDGPUMCKernelCodeT struct members:
+// - HasMemberXXXXX class
+// A check to see if AMDGPUMCKernelCodeT has a specific member so it can
+// determine which of the original amd_kernel_code_t members are duplicated
+// (if the names don't match, the table driven strategy won't work).
+// - IsMCExprXXXXX class
+// Check whether an AMDGPUMCKernelCodeT struct member is MCExpr-ified or not.
+// - GetMemberXXXXX class
+// A retrieval helper for said member (of type const MCExpr *&). Will return
+// a `Phony` const MCExpr * initialized to nullptr to preserve reference
+// returns.
+#define GEN_HAS_MEMBER(member) \
+ class HasMember##member { \
+ private: \
+ struct KnownWithMember { \
+ int member; \
+ }; \
+ class AmbiguousDerived : public AMDGPUMCKernelCodeT, \
+ public KnownWithMember {}; \
+ template <typename U> \
+ static constexpr std::false_type Test(decltype(U::member) *); \
+ template <typename U> static constexpr std::true_type Test(...); \
+ \
+ public: \
+ static constexpr bool RESULT = \
+ std::is_same_v<decltype(Test<AmbiguousDerived>(nullptr)), \
+ std::true_type>; \
+ }; \
+ class IsMCExpr##member { \
+ template <typename U, \
+ typename std::enable_if_t< \
+ HasMember##member::RESULT && \
+ std::is_same_v<decltype(U::member), const MCExpr *>, \
+ U> * = nullptr> \
+ static constexpr std::true_type HasMCExprType(decltype(U::member) *); \
+ template <typename U> static constexpr std::false_type HasMCExprType(...); \
+ \
+ public: \
+ static constexpr bool RESULT = \
+ std::is_same_v<decltype(HasMCExprType<AMDGPUMCKernelCodeT>(nullptr)), \
+ std::true_type>; \
+ }; \
+ class GetMember##member { \
+ public: \
+ static const MCExpr *Phony; \
+ template <typename U, typename std::enable_if_t<IsMCExpr##member::RESULT, \
+ U> * = nullptr> \
+ static const MCExpr *&Get(U &C) { \
+ assert(IsMCExpr##member::RESULT && \
+ "Trying to retrieve member that does not exist."); \
+ return C.member; \
+ } \
+ template <typename U, typename std::enable_if_t<!IsMCExpr##member::RESULT, \
+ U> * = nullptr> \
+ static const MCExpr *&Get(U &C) { \
+ return Phony; \
+ } \
+ }; \
+ const MCExpr *GetMember##member::Phony = nullptr;
+
+// Class declarations cannot be generated with the table-driven approach (see
+// the table in AMDKernelCodeTInfo.h). Luckily, if a table entry is missing a
+// matching GEN_HAS_MEMBER here (or is added later without one), a compile
+// error should occur when building the table in getMCExprIndexTable.
+GEN_HAS_MEMBER(amd_code_version_major)
+GEN_HAS_MEMBER(amd_code_version_minor)
+GEN_HAS_MEMBER(amd_machine_kind)
+GEN_HAS_MEMBER(amd_machine_version_major)
+GEN_HAS_MEMBER(amd_machine_version_minor)
+GEN_HAS_MEMBER(amd_machine_version_stepping)
+
+GEN_HAS_MEMBER(kernel_code_entry_byte_offset)
+GEN_HAS_MEMBER(kernel_code_prefetch_byte_size)
+
+GEN_HAS_MEMBER(granulated_workitem_vgpr_count)
+GEN_HAS_MEMBER(granulated_wavefront_sgpr_count)
+GEN_HAS_MEMBER(priority)
+GEN_HAS_MEMBER(float_mode)
+GEN_HAS_MEMBER(priv)
+GEN_HAS_MEMBER(enable_dx10_clamp)
+GEN_HAS_MEMBER(debug_mode)
+GEN_HAS_MEMBER(enable_ieee_mode)
+GEN_HAS_MEMBER(enable_wgp_mode)
+GEN_HAS_MEMBER(enable_mem_ordered)
+GEN_HAS_MEMBER(enable_fwd_progress)
+
+GEN_HAS_MEMBER(enable_sgpr_private_segment_wave_byte_offset)
+GEN_HAS_MEMBER(user_sgpr_count)
+GEN_HAS_MEMBER(enable_trap_handler)
+GEN_HAS_MEMBER(enable_sgpr_workgroup_id_x)
+GEN_HAS_MEMBER(enable_sgpr_workgroup_id_y)
+GEN_HAS_MEMBER(enable_sgpr_workgroup_id_z)
+GEN_HAS_MEMBER(enable_sgpr_workgroup_info)
+GEN_HAS_MEMBER(enable_vgpr_workitem_id)
+GEN_HAS_MEMBER(enable_exception_msb)
+GEN_HAS_MEMBER(granulated_lds_size)
+GEN_HAS_MEMBER(enable_exception)
+
+GEN_HAS_MEMBER(enable_sgpr_private_segment_buffer)
+GEN_HAS_MEMBER(enable_sgpr_dispatch_ptr)
+GEN_HAS_MEMBER(enable_sgpr_queue_ptr)
+GEN_HAS_MEMBER(enable_sgpr_kernarg_segment_ptr)
+GEN_HAS_MEMBER(enable_sgpr_dispatch_id)
+GEN_HAS_MEMBER(enable_sgpr_flat_scratch_init)
+GEN_HAS_MEMBER(enable_sgpr_private_segment_size)
+GEN_HAS_MEMBER(enable_sgpr_grid_workgroup_count_x)
+GEN_HAS_MEMBER(enable_sgpr_grid_workgroup_count_y)
+GEN_HAS_MEMBER(enable_sgpr_grid_workgroup_count_z)
+GEN_HAS_MEMBER(enable_wavefront_size32)
+GEN_HAS_MEMBER(enable_ordered_append_gds)
+GEN_HAS_MEMBER(private_element_size)
+GEN_HAS_MEMBER(is_ptr64)
+GEN_HAS_MEMBER(is_dynamic_callstack)
+GEN_HAS_MEMBER(is_debug_enabled)
+GEN_HAS_MEMBER(is_xnack_enabled)
+
+GEN_HAS_MEMBER(workitem_private_segment_byte_size)
+GEN_HAS_MEMBER(workgroup_group_segment_byte_size)
+GEN_HAS_MEMBER(gds_segment_byte_size)
+GEN_HAS_MEMBER(kernarg_segment_byte_size)
+GEN_HAS_MEMBER(workgroup_fbarrier_count)
+GEN_HAS_MEMBER(wavefront_sgpr_count)
+GEN_HAS_MEMBER(workitem_vgpr_count)
+GEN_HAS_MEMBER(reserved_vgpr_first)
+GEN_HAS_MEMBER(reserved_vgpr_count)
+GEN_HAS_MEMBER(reserved_sgpr_first)
+GEN_HAS_MEMBER(reserved_sgpr_count)
+GEN_HAS_MEMBER(debug_wavefront_private_segment_offset_sgpr)
+GEN_HAS_MEMBER(debug_private_segment_buffer_sgpr)
+GEN_HAS_MEMBER(kernarg_segment_alignment)
+GEN_HAS_MEMBER(group_segment_alignment)
+GEN_HAS_MEMBER(private_segment_alignment)
+GEN_HAS_MEMBER(wavefront_size)
+GEN_HAS_MEMBER(call_convention)
+GEN_HAS_MEMBER(runtime_loader_kernel_symbol)
+
+static ArrayRef<StringLiteral> get_amd_kernel_code_t_FldNames() {
+ static constexpr StringLiteral const Table[] = {
+ "", // not found placeholder
#define RECORD(name, altName, print, parse) #name
-#include "AMDKernelCodeTInfo.h"
+#include "Utils/AMDKernelCodeTInfo.h"
#undef RECORD
};
return ArrayRef(Table);
}
-static ArrayRef<StringRef> get_amd_kernel_code_t_FldAltNames() {
- static StringRef const Table[] = {
- "", // not found placeholder
+static ArrayRef<StringLiteral> get_amd_kernel_code_t_FldAltNames() {
+ static constexpr StringLiteral const Table[] = {
+ "", // not found placeholder
#define RECORD(name, altName, print, parse) #altName
-#include "AMDKernelCodeTInfo.h"
+#include "Utils/AMDKernelCodeTInfo.h"
+#undef RECORD
+ };
+ return ArrayRef(Table);
+}
+
+static ArrayRef<bool> hasMCExprVersionTable() {
+ static bool const Table[] = {
+#define RECORD(name, altName, print, parse) (IsMCExpr##name::RESULT)
+#include "Utils/AMDKernelCodeTInfo.h"
#undef RECORD
};
return ArrayRef(Table);
}
-static StringMap<int> createIndexMap(const ArrayRef<StringRef> &names,
- const ArrayRef<StringRef> &altNames) {
+using RetrieveFx = const MCExpr *&(*)(AMDGPUMCKernelCodeT &);
+
+static ArrayRef<RetrieveFx> getMCExprIndexTable() {
+ static const RetrieveFx Table[] = {
+#define RECORD(name, altName, print, parse) GetMember##name::Get
+#include "Utils/AMDKernelCodeTInfo.h"
+#undef RECORD
+ };
+ return ArrayRef(Table);
+}
+
+static StringMap<int> createIndexMap(ArrayRef<StringLiteral> names,
+ ArrayRef<StringLiteral> altNames) {
StringMap<int> map;
assert(names.size() == altNames.size());
for (unsigned i = 0; i < names.size(); ++i) {
@@ -59,62 +220,111 @@ static int get_amd_kernel_code_t_FieldIndex(StringRef name) {
return map.lookup(name) - 1; // returns -1 if not found
}
-static StringRef get_amd_kernel_code_t_FieldName(int index) {
- return get_amd_kernel_code_t_FldNames()[index + 1];
-}
+static constexpr std::pair<unsigned, unsigned> getShiftMask(unsigned Value) {
+ unsigned Shift = 0;
+ unsigned Mask = 0;
-// Field printing
+ Mask = ~Value;
+ for (; !(Mask & 1); Shift++, Mask >>= 1) {
+ }
-static raw_ostream &printName(raw_ostream &OS, StringRef Name) {
- return OS << Name << " = ";
+ return std::make_pair(Shift, Mask);
}
-template <typename T, T amd_kernel_code_t::*ptr>
-static void printField(StringRef Name, const amd_kernel_code_t &C,
- raw_ostream &OS) {
- printName(OS, Name) << (int)(C.*ptr);
+static const MCExpr *MaskShiftSet(const MCExpr *Val, uint32_t Mask,
+ uint32_t Shift, MCContext &Ctx) {
+ if (Mask) {
+ const MCExpr *MaskExpr = MCConstantExpr::create(Mask, Ctx);
+ Val = MCBinaryExpr::createAnd(Val, MaskExpr, Ctx);
+ }
+ if (Shift) {
+ const MCExpr *ShiftExpr = MCConstantExpr::create(Shift, Ctx);
+ Val = MCBinaryExpr::createShl(Val, ShiftExpr, Ctx);
+ }
+ return Val;
}
-template <typename T, T amd_kernel_code_t::*ptr, int shift, int width = 1>
-static void printBitField(StringRef Name, const amd_kernel_code_t &c,
- raw_ostream &OS) {
+static const MCExpr *MaskShiftGet(const MCExpr *Val, uint32_t Mask,
+ uint32_t Shift, MCContext &Ctx) {
+ if (Shift) {
+ const MCExpr *ShiftExpr = MCConstantExpr::create(Shift, Ctx);
+ Val = MCBinaryExpr::createLShr(Val, ShiftExpr, Ctx);
+ }
+ if (Mask) {
+ const MCExpr *MaskExpr = MCConstantExpr::create(Mask, Ctx);
+ Val = MCBinaryExpr::createAnd(Val, MaskExpr, Ctx);
+ }
+ return Val;
+}
+
+class PrintField {
+public:
+ template <typename T, T AMDGPUMCKernelCodeT::*ptr,
+ typename std::enable_if_t<!std::is_integral_v<T>, T> * = nullptr>
+ static void printField(StringRef Name, const AMDGPUMCKernelCodeT &C,
+ raw_ostream &OS, MCContext &Ctx) {
+ OS << Name << " = ";
+ const MCExpr *Value = C.*ptr;
+ int64_t Val;
+ if (Value->evaluateAsAbsolute(Val))
+ OS << Val;
+ else
+ Value->print(OS, Ctx.getAsmInfo());
+ }
+
+ template <typename T, T AMDGPUMCKernelCodeT::*ptr,
+ typename std::enable_if_t<std::is_integral_v<T>, T> * = nullptr>
+ static void printField(StringRef Name, const AMDGPUMCKernelCodeT &C,
+ raw_ostream &OS, MCContext &) {
+ OS << Name << " = " << (int)(C.*ptr);
+ }
+};
+
+template <typename T, T AMDGPUMCKernelCodeT::*ptr, int shift, int width = 1>
+static void printBitField(StringRef Name, const AMDGPUMCKernelCodeT &C,
+ raw_ostream &OS, MCContext &) {
const auto Mask = (static_cast<T>(1) << width) - 1;
- printName(OS, Name) << (int)((c.*ptr >> shift) & Mask);
+ OS << Name << " = " << (int)((C.*ptr >> shift) & Mask);
}
-using PrintFx = void(*)(StringRef, const amd_kernel_code_t &, raw_ostream &);
+using PrintFx = void (*)(StringRef, const AMDGPUMCKernelCodeT &, raw_ostream &,
+ MCContext &);
static ArrayRef<PrintFx> getPrinterTable() {
static const PrintFx Table[] = {
+#define COMPPGM1(name, aname, AccMacro) \
+ COMPPGM(name, aname, C_00B848_##AccMacro, S_00B848_##AccMacro, 0)
+#define COMPPGM2(name, aname, AccMacro) \
+ COMPPGM(name, aname, C_00B84C_##AccMacro, S_00B84C_##AccMacro, 32)
+#define PRINTFIELD(sname, aname, name) PrintField::printField<FLD_T(name)>
+#define PRINTCOMP(Complement, PGMType) \
+ [](StringRef Name, const AMDGPUMCKernelCodeT &C, raw_ostream &OS, \
+ MCContext &Ctx) { \
+ OS << Name << " = "; \
+ auto [Shift, Mask] = getShiftMask(Complement); \
+ const MCExpr *Value; \
+ if (PGMType == 0) { \
+ Value = \
+ MaskShiftGet(C.compute_pgm_resource1_registers, Mask, Shift, Ctx); \
+ } else { \
+ Value = \
+ MaskShiftGet(C.compute_pgm_resource2_registers, Mask, Shift, Ctx); \
+ } \
+ int64_t Val; \
+ if (Value->evaluateAsAbsolute(Val)) \
+ OS << Val; \
+ else \
+ Value->print(OS, Ctx.getAsmInfo()); \
+ }
#define RECORD(name, altName, print, parse) print
-#include "AMDKernelCodeTInfo.h"
+#include "Utils/AMDKernelCodeTInfo.h"
#undef RECORD
};
return ArrayRef(Table);
}
-void llvm::printAmdKernelCodeField(const amd_kernel_code_t &C,
- int FldIndex,
- raw_ostream &OS) {
- auto Printer = getPrinterTable()[FldIndex];
- if (Printer)
- Printer(get_amd_kernel_code_t_FieldName(FldIndex), C, OS);
-}
-
-void llvm::dumpAmdKernelCode(const amd_kernel_code_t *C,
- raw_ostream &OS,
- const char *tab) {
- const int Size = getPrinterTable().size();
- for (int i = 0; i < Size; ++i) {
- OS << tab;
- printAmdKernelCodeField(*C, i, OS);
- OS << '\n';
- }
-}
-
-// Field parsing
-
-static bool expectAbsExpression(MCAsmParser &MCParser, int64_t &Value, raw_ostream& Err) {
+static bool expectAbsExpression(MCAsmParser &MCParser, int64_t &Value,
+ raw_ostream &Err) {
if (MCParser.getLexer().isNot(AsmToken::Equal)) {
Err << "expected '='";
@@ -129,8 +339,8 @@ static bool expectAbsExpression(MCAsmParser &MCParser, int64_t &Value, raw_ostre
return true;
}
-template <typename T, T amd_kernel_code_t::*ptr>
-static bool parseField(amd_kernel_code_t &C, MCAsmParser &MCParser,
+template <typename T, T AMDGPUMCKernelCodeT::*ptr>
+static bool parseField(AMDGPUMCKernelCodeT &C, MCAsmParser &MCParser,
raw_ostream &Err) {
int64_t Value = 0;
if (!expectAbsExpression(MCParser, Value, Err))
@@ -139,39 +349,241 @@ static bool parseField(amd_kernel_code_t &C, MCAsmParser &MCParser,
return true;
}
-template <typename T, T amd_kernel_code_t::*ptr, int shift, int width = 1>
-static bool parseBitField(amd_kernel_code_t &C, MCAsmParser &MCParser,
+template <typename T, T AMDGPUMCKernelCodeT::*ptr, int shift, int width = 1>
+static bool parseBitField(AMDGPUMCKernelCodeT &C, MCAsmParser &MCParser,
raw_ostream &Err) {
int64_t Value = 0;
if (!expectAbsExpression(MCParser, Value, Err))
return false;
- const uint64_t Mask = ((UINT64_C(1) << width) - 1) << shift;
+ const uint64_t Mask = ((UINT64_C(1) << width) - 1) << shift;
C.*ptr &= (T)~Mask;
C.*ptr |= (T)((Value << shift) & Mask);
return true;
}
-using ParseFx = bool(*)(amd_kernel_code_t &, MCAsmParser &MCParser,
- raw_ostream &Err);
+static bool parseExpr(MCAsmParser &MCParser, const MCExpr *&Value,
+ raw_ostream &Err) {
+ if (MCParser.getLexer().isNot(AsmToken::Equal)) {
+ Err << "expected '='";
+ return false;
+ }
+ MCParser.getLexer().Lex();
+
+ if (MCParser.parseExpression(Value)) {
+ Err << "Could not parse expression";
+ return false;
+ }
+ return true;
+}
+
+using ParseFx = bool (*)(AMDGPUMCKernelCodeT &, MCAsmParser &, raw_ostream &);
static ArrayRef<ParseFx> getParserTable() {
static const ParseFx Table[] = {
+#define COMPPGM1(name, aname, AccMacro) \
+ COMPPGM(name, aname, G_00B848_##AccMacro, C_00B848_##AccMacro, 0)
+#define COMPPGM2(name, aname, AccMacro) \
+ COMPPGM(name, aname, G_00B84C_##AccMacro, C_00B84C_##AccMacro, 32)
+#define PARSECOMP(Complement, PGMType) \
+ [](AMDGPUMCKernelCodeT &C, MCAsmParser &MCParser, \
+ raw_ostream &Err) -> bool { \
+ MCContext &Ctx = MCParser.getContext(); \
+ const MCExpr *Value; \
+ if (!parseExpr(MCParser, Value, Err)) \
+ return false; \
+ auto [Shift, Mask] = getShiftMask(Complement); \
+ Value = MaskShiftSet(Value, Mask, Shift, Ctx); \
+ const MCExpr *Compl = MCConstantExpr::create(Complement, Ctx); \
+ if (PGMType == 0) { \
+ C.compute_pgm_resource1_registers = MCBinaryExpr::createAnd( \
+ C.compute_pgm_resource1_registers, Compl, Ctx); \
+ C.compute_pgm_resource1_registers = MCBinaryExpr::createOr( \
+ C.compute_pgm_resource1_registers, Value, Ctx); \
+ } else { \
+ C.compute_pgm_resource2_registers = MCBinaryExpr::createAnd( \
+ C.compute_pgm_resource2_registers, Compl, Ctx); \
+ C.compute_pgm_resource2_registers = MCBinaryExpr::createOr( \
+ C.compute_pgm_resource2_registers, Value, Ctx); \
+ } \
+ return true; \
+ }
#define RECORD(name, altName, print, parse) parse
-#include "AMDKernelCodeTInfo.h"
+#include "Utils/AMDKernelCodeTInfo.h"
#undef RECORD
};
return ArrayRef(Table);
}
-bool llvm::parseAmdKernelCodeField(StringRef ID,
- MCAsmParser &MCParser,
- amd_kernel_code_t &C,
- raw_ostream &Err) {
+static void printAmdKernelCodeField(const AMDGPUMCKernelCodeT &C, int FldIndex,
+ raw_ostream &OS, MCContext &Ctx) {
+ auto Printer = getPrinterTable()[FldIndex];
+ if (Printer)
+ Printer(get_amd_kernel_code_t_FldNames()[FldIndex + 1], C, OS, Ctx);
+}
+
+void AMDGPUMCKernelCodeT::initDefault(const MCSubtargetInfo *STI,
+ MCContext &Ctx, bool InitMCExpr) {
+ AMDGPUMCKernelCodeT();
+
+ AMDGPU::initDefaultAMDKernelCodeT(*this, STI);
+
+ if (InitMCExpr) {
+ const MCExpr *ZeroExpr = MCConstantExpr::create(0, Ctx);
+ compute_pgm_resource1_registers =
+ MCConstantExpr::create(Lo_32(compute_pgm_resource_registers), Ctx);
+ compute_pgm_resource2_registers =
+ MCConstantExpr::create(Hi_32(compute_pgm_resource_registers), Ctx);
+ is_dynamic_callstack = ZeroExpr;
+ wavefront_sgpr_count = ZeroExpr;
+ workitem_vgpr_count = ZeroExpr;
+ workitem_private_segment_byte_size = ZeroExpr;
+ }
+}
+
+void AMDGPUMCKernelCodeT::validate(const MCSubtargetInfo *STI, MCContext &Ctx) {
+ int64_t Value;
+ if (!compute_pgm_resource1_registers->evaluateAsAbsolute(Value))
+ return;
+
+ if (G_00B848_DX10_CLAMP(Value) && AMDGPU::isGFX12Plus(*STI)) {
+ Ctx.reportError({}, "enable_dx10_clamp=1 is not allowed on GFX12+");
+ return;
+ }
+
+ if (G_00B848_IEEE_MODE(Value) && AMDGPU::isGFX12Plus(*STI)) {
+ Ctx.reportError({}, "enable_ieee_mode=1 is not allowed on GFX12+");
+ return;
+ }
+
+ if (G_00B848_WGP_MODE(Value) && !AMDGPU::isGFX10Plus(*STI)) {
+ Ctx.reportError({}, "enable_wgp_mode=1 is only allowed on GFX10+");
+ return;
+ }
+
+ if (G_00B848_MEM_ORDERED(Value) && !AMDGPU::isGFX10Plus(*STI)) {
+ Ctx.reportError({}, "enable_mem_ordered=1 is only allowed on GFX10+");
+ return;
+ }
+
+ if (G_00B848_FWD_PROGRESS(Value) && !AMDGPU::isGFX10Plus(*STI)) {
+ Ctx.reportError({}, "enable_fwd_progress=1 is only allowed on GFX10+");
+ return;
+ }
+}
+
+const MCExpr *&AMDGPUMCKernelCodeT::getMCExprForIndex(int Index) {
+ static const auto IndexTable = getMCExprIndexTable();
+ return IndexTable[Index](*this);
+}
+
+bool AMDGPUMCKernelCodeT::ParseKernelCodeT(StringRef ID, MCAsmParser &MCParser,
+ raw_ostream &Err) {
const int Idx = get_amd_kernel_code_t_FieldIndex(ID);
if (Idx < 0) {
Err << "unexpected amd_kernel_code_t field name " << ID;
return false;
}
+
+ if (hasMCExprVersionTable()[Idx]) {
+ const MCExpr *Value;
+ if (!parseExpr(MCParser, Value, Err))
+ return false;
+ getMCExprForIndex(Idx) = Value;
+ return true;
+ }
auto Parser = getParserTable()[Idx];
- return Parser ? Parser(C, MCParser, Err) : false;
+ return Parser ? Parser(*this, MCParser, Err) : false;
+}
+
+void AMDGPUMCKernelCodeT::EmitKernelCodeT(raw_ostream &OS, MCContext &Ctx) {
+ const int Size = hasMCExprVersionTable().size();
+ for (int i = 0; i < Size; ++i) {
+ OS << "\t\t";
+ if (hasMCExprVersionTable()[i]) {
+ OS << get_amd_kernel_code_t_FldNames()[i + 1] << " = ";
+ int64_t Val;
+ const MCExpr *Value = getMCExprForIndex(i);
+ if (Value->evaluateAsAbsolute(Val))
+ OS << Val;
+ else
+ Value->print(OS, Ctx.getAsmInfo());
+ } else {
+ printAmdKernelCodeField(*this, i, OS, Ctx);
+ }
+ OS << '\n';
+ }
+}
+
+void AMDGPUMCKernelCodeT::EmitKernelCodeT(MCStreamer &OS, MCContext &Ctx) {
+ OS.emitIntValue(amd_kernel_code_version_major, /*Size=*/4);
+ OS.emitIntValue(amd_kernel_code_version_minor, /*Size=*/4);
+ OS.emitIntValue(amd_machine_kind, /*Size=*/2);
+ OS.emitIntValue(amd_machine_version_major, /*Size=*/2);
+ OS.emitIntValue(amd_machine_version_minor, /*Size=*/2);
+ OS.emitIntValue(amd_machine_version_stepping, /*Size=*/2);
+ OS.emitIntValue(kernel_code_entry_byte_offset, /*Size=*/8);
+ OS.emitIntValue(kernel_code_prefetch_byte_offset, /*Size=*/8);
+ OS.emitIntValue(kernel_code_prefetch_byte_size, /*Size=*/8);
+ OS.emitIntValue(reserved0, /*Size=*/8);
+
+ if (compute_pgm_resource1_registers != nullptr)
+ OS.emitValue(compute_pgm_resource1_registers, /*Size=*/4);
+ else
+ OS.emitIntValue(Lo_32(compute_pgm_resource_registers),
+ /*Size=*/4);
+
+ if (compute_pgm_resource2_registers != nullptr)
+ OS.emitValue(compute_pgm_resource2_registers, /*Size=*/4);
+ else
+ OS.emitIntValue(Hi_32(compute_pgm_resource_registers),
+ /*Size=*/4);
+
+ if (is_dynamic_callstack != nullptr) {
+ const MCExpr *CodeProps = MCConstantExpr::create(code_properties, Ctx);
+ CodeProps = MCBinaryExpr::createOr(
+ CodeProps,
+ MaskShiftSet(is_dynamic_callstack,
+ (1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1,
+ AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT, Ctx),
+ Ctx);
+ OS.emitValue(CodeProps, /*Size=*/4);
+ } else
+ OS.emitIntValue(code_properties, /*Size=*/4);
+
+ if (workitem_private_segment_byte_size != nullptr)
+ OS.emitValue(workitem_private_segment_byte_size, /*Size=*/4);
+ else
+ OS.emitIntValue(0, /*Size=*/4);
+
+ OS.emitIntValue(workgroup_group_segment_byte_size, /*Size=*/4);
+ OS.emitIntValue(gds_segment_byte_size, /*Size=*/4);
+ OS.emitIntValue(kernarg_segment_byte_size, /*Size=*/8);
+ OS.emitIntValue(workgroup_fbarrier_count, /*Size=*/4);
+
+ if (wavefront_sgpr_count != nullptr)
+ OS.emitValue(wavefront_sgpr_count, /*Size=*/2);
+ else
+ OS.emitIntValue(0, /*Size=*/2);
+
+ if (workitem_vgpr_count != nullptr)
+ OS.emitValue(workitem_vgpr_count, /*Size=*/2);
+ else
+ OS.emitIntValue(0, /*Size=*/2);
+
+ OS.emitIntValue(reserved_vgpr_first, /*Size=*/2);
+ OS.emitIntValue(reserved_vgpr_count, /*Size=*/2);
+ OS.emitIntValue(reserved_sgpr_first, /*Size=*/2);
+ OS.emitIntValue(reserved_sgpr_count, /*Size=*/2);
+ OS.emitIntValue(debug_wavefront_private_segment_offset_sgpr,
+ /*Size=*/2);
+ OS.emitIntValue(debug_private_segment_buffer_sgpr, /*Size=*/2);
+ OS.emitIntValue(kernarg_segment_alignment, /*Size=*/1);
+ OS.emitIntValue(group_segment_alignment, /*Size=*/1);
+ OS.emitIntValue(private_segment_alignment, /*Size=*/1);
+ OS.emitIntValue(wavefront_size, /*Size=*/1);
+
+ OS.emitIntValue(call_convention, /*Size=*/4);
+ OS.emitBytes(StringRef((const char *)reserved3, /*Size=*/12));
+ OS.emitIntValue(runtime_loader_kernel_symbol, /*Size=*/8);
+ OS.emitBytes(StringRef((const char *)control_directives, /*Size=*/16 * 8));
}
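
A note on the new bit-field helpers in this file: getShiftMask takes a field's complement mask (all bits set except the field, as supplied through the COMPPGM/PRINTCOMP/PARSECOMP plumbing) and recovers the field's bit position plus its width mask already shifted down to bit 0; MaskShiftSet and MaskShiftGet then build the same pack/unpack steps as MCExprs. A minimal plain-integer sketch of that round trip, using an invented 2-bit field at bit 6 rather than any real SI register definition:

#include <cassert>
#include <cstdint>
#include <utility>

// Same shape as the helper above: complement mask in, (shift, width mask) out.
static constexpr std::pair<unsigned, unsigned> getShiftMask(unsigned Value) {
  unsigned Shift = 0;
  unsigned Mask = ~Value;
  for (; !(Mask & 1); Shift++, Mask >>= 1) {
  }
  return {Shift, Mask};
}

int main() {
  constexpr unsigned Complement = 0xFFFFFF3F; // every bit set except [7:6]
  constexpr auto SM = getShiftMask(Complement);
  static_assert(SM.first == 6 && SM.second == 0x3, "2-bit field at bit 6");

  // Pack: clear the field with the complement, then OR in the masked and
  // shifted value (the integer version of what MaskShiftSet builds as MCExprs).
  uint32_t Reg = 0xDEADBEEF; // arbitrary register contents
  uint32_t FieldVal = 0x2;
  Reg = (Reg & Complement) | ((FieldVal & SM.second) << SM.first);

  // Unpack: shift down, then mask (the integer version of MaskShiftGet).
  assert(((Reg >> SM.first) & SM.second) == FieldVal);
  return 0;
}
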
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h
index 41d0e0d..6aeb98f 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h
@@ -7,29 +7,84 @@
//===----------------------------------------------------------------------===//
//
/// \file AMDKernelCodeTUtils.h
+/// MC layer struct for AMDGPUMCKernelCodeT; provides MCExpr functionality where
+/// required.
+///
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDKERNELCODETUTILS_H
-#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDKERNELCODETUTILS_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCKERNELCODET_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCKERNELCODET_H
-struct amd_kernel_code_t;
+#include "AMDKernelCodeT.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
namespace llvm {
-
class MCAsmParser;
+class MCContext;
+class MCExpr;
+class MCStreamer;
+class MCSubtargetInfo;
class raw_ostream;
-class StringRef;
+namespace AMDGPU {
+
+struct AMDGPUMCKernelCodeT {
+ AMDGPUMCKernelCodeT() = default;
+
+ // Names of most (if not all) members should match the ones used for table
+ // driven (array) generation in AMDKernelCodeTInfo.h.
+ uint32_t amd_kernel_code_version_major = 0;
+ uint32_t amd_kernel_code_version_minor = 0;
+ uint16_t amd_machine_kind = 0;
+ uint16_t amd_machine_version_major = 0;
+ uint16_t amd_machine_version_minor = 0;
+ uint16_t amd_machine_version_stepping = 0;
+ int64_t kernel_code_entry_byte_offset = 0;
+ int64_t kernel_code_prefetch_byte_offset = 0;
+ uint64_t kernel_code_prefetch_byte_size = 0;
+ uint64_t reserved0 = 0;
+ uint64_t compute_pgm_resource_registers = 0;
+ uint32_t code_properties = 0;
+ uint32_t workgroup_group_segment_byte_size = 0;
+ uint32_t gds_segment_byte_size = 0;
+ uint64_t kernarg_segment_byte_size = 0;
+ uint32_t workgroup_fbarrier_count = 0;
+ uint16_t reserved_vgpr_first = 0;
+ uint16_t reserved_vgpr_count = 0;
+ uint16_t reserved_sgpr_first = 0;
+ uint16_t reserved_sgpr_count = 0;
+ uint16_t debug_wavefront_private_segment_offset_sgpr = 0;
+ uint16_t debug_private_segment_buffer_sgpr = 0;
+ uint8_t kernarg_segment_alignment = 0;
+ uint8_t group_segment_alignment = 0;
+ uint8_t private_segment_alignment = 0;
+ uint8_t wavefront_size = 0;
+ int32_t call_convention = 0;
+ uint8_t reserved3[12] = {0};
+ uint64_t runtime_loader_kernel_symbol = 0;
+ uint64_t control_directives[16] = {0};
+
+ const MCExpr *compute_pgm_resource1_registers = nullptr;
+ const MCExpr *compute_pgm_resource2_registers = nullptr;
+
+ const MCExpr *is_dynamic_callstack = nullptr;
+ const MCExpr *wavefront_sgpr_count = nullptr;
+ const MCExpr *workitem_vgpr_count = nullptr;
+ const MCExpr *workitem_private_segment_byte_size = nullptr;
-void printAmdKernelCodeField(const amd_kernel_code_t &C, int FldIndex,
- raw_ostream &OS);
+ void initDefault(const MCSubtargetInfo *STI, MCContext &Ctx,
+ bool InitMCExpr = true);
+ void validate(const MCSubtargetInfo *STI, MCContext &Ctx);
-void dumpAmdKernelCode(const amd_kernel_code_t *C, raw_ostream &OS,
- const char *tab);
+ const MCExpr *&getMCExprForIndex(int Index);
-bool parseAmdKernelCodeField(StringRef ID, MCAsmParser &Parser,
- amd_kernel_code_t &C, raw_ostream &Err);
+ bool ParseKernelCodeT(StringRef ID, MCAsmParser &MCParser, raw_ostream &Err);
+ void EmitKernelCodeT(raw_ostream &OS, MCContext &Ctx);
+ void EmitKernelCodeT(MCStreamer &OS, MCContext &Ctx);
+};
+} // end namespace AMDGPU
} // end namespace llvm
-#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDKERNELCODETUTILS_H
+#endif // LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCKERNELCODET_H
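
The GEN_HAS_MEMBER machinery in the .cpp above probes this struct for every name recorded in AMDKernelCodeTInfo.h. It uses a classic member-detection idiom: derive from both the probed type and a helper struct that certainly has the member; if the probed type has it too, the name lookup is ambiguous, substitution for the pointer overload fails, and the variadic fallback reports "present". A stripped-down sketch with a toy struct standing in for AMDGPUMCKernelCodeT (the names below are invented for illustration):

#include <type_traits>

struct Probed { // toy stand-in for AMDGPUMCKernelCodeT
  int width = 0;
};

#define GEN_HAS_MEMBER(member)                                                 \
  class HasMember##member {                                                    \
    struct KnownWithMember {                                                   \
      int member;                                                              \
    };                                                                         \
    struct AmbiguousDerived : Probed, KnownWithMember {};                      \
    /* If Probed also has `member`, U::member is ambiguous, this overload   */ \
    /* drops out via SFINAE and the variadic true_type overload is chosen. */  \
    template <typename U>                                                      \
    static constexpr std::false_type Test(decltype(U::member) *);              \
    template <typename U> static constexpr std::true_type Test(...);           \
                                                                               \
  public:                                                                      \
    static constexpr bool RESULT =                                             \
        std::is_same_v<decltype(Test<AmbiguousDerived>(nullptr)),              \
                       std::true_type>;                                        \
  };

GEN_HAS_MEMBER(width)  // Probed::width exists  -> RESULT == true
GEN_HAS_MEMBER(height) // Probed::height absent -> RESULT == false

static_assert(HasMemberwidth::RESULT, "width detected");
static_assert(!HasMemberheight::RESULT, "height correctly not detected");
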
diff --git a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
index 19d3b69..2f4ce8e 100644
--- a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
@@ -11,6 +11,7 @@ add_llvm_component_library(LLVMAMDGPUUtils
CodeGenTypes
Core
MC
+ MCParser
Support
TargetParser
diff --git a/llvm/lib/Target/Mips/Mips32r6InstrInfo.td b/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
index f609305..91ffbc4 100644
--- a/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
+++ b/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
@@ -1119,18 +1119,24 @@ def : MipsPat<(select i32:$cond, immz, i32:$f),
// llvm.fmin/fmax operations.
let AdditionalPredicates = [NotInMicroMips] in {
- def : MipsPat<(fmaxnum f32:$lhs, f32:$rhs),
+ def : MipsPat<(fmaxnum_ieee f32:$lhs, f32:$rhs),
(MAX_S f32:$lhs, f32:$rhs)>,
ISA_MIPS32R6;
- def : MipsPat<(fmaxnum f64:$lhs, f64:$rhs),
+ def : MipsPat<(fmaxnum_ieee f64:$lhs, f64:$rhs),
(MAX_D f64:$lhs, f64:$rhs)>,
ISA_MIPS32R6;
- def : MipsPat<(fminnum f32:$lhs, f32:$rhs),
+ def : MipsPat<(fminnum_ieee f32:$lhs, f32:$rhs),
(MIN_S f32:$lhs, f32:$rhs)>,
ISA_MIPS32R6;
- def : MipsPat<(fminnum f64:$lhs, f64:$rhs),
+ def : MipsPat<(fminnum_ieee f64:$lhs, f64:$rhs),
(MIN_D f64:$lhs, f64:$rhs)>,
ISA_MIPS32R6;
+ def : MipsPat<(f32 (fcanonicalize f32:$src)),
+ (MIN_S f32:$src, f32:$src)>,
+ ISA_MIPS32R6;
+ def : MipsPat<(f64 (fcanonicalize f64:$src)),
+ (MIN_D f64:$src, f64:$src)>,
+ ISA_MIPS32R6;
}
// Pseudo instructions
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index 459164f..c2be8c8 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -360,11 +360,15 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
// Lower fmin and fmax operations for MIPS R6.
// Instructions are defined but never used.
- if (Subtarget.hasMips32r6() || Subtarget.hasMips64r6()) {
- setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
- setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
- setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
- setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
+ if (Subtarget.hasMips32r6()) {
+ setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);
+ setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::f32, Expand);
+ setOperationAction(ISD::FMAXNUM, MVT::f32, Expand);
+ setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);
+ setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::f64, Expand);
+ setOperationAction(ISD::FMAXNUM, MVT::f64, Expand);
}
if (Subtarget.isGP64bit()) {
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index ac48dc5..f4e84ad 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -1157,12 +1157,12 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
MCSymbolRefExpr::VariantKind VK = GetVKForMO(MO);
- // If the symbol isn't toc-data then use the TOC on AIX.
// Map the global address operand to be a reference to the TOC entry we
// will synthesize later. 'TOCEntry' is a label used to reference the
// storage allocated in the TOC which contains the address of 'MOSymbol'.
- // If the toc-data attribute is used, the TOC entry contains the data
- // rather than the address of the MOSymbol.
+ // If the symbol does not have the toc-data attribute, then we create the
+ // TOC entry on AIX. If the toc-data attribute is used, the TOC entry
+ // contains the data rather than the address of the MOSymbol.
if (![](const MachineOperand &MO) {
if (!MO.isGlobal())
return false;
@@ -1170,7 +1170,6 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
const GlobalVariable *GV = dyn_cast<GlobalVariable>(MO.getGlobal());
if (!GV)
return false;
-
return GV->hasAttribute("toc-data");
}(MO)) {
MOSymbol = lookUpOrCreateTOCEntry(MOSymbol, getTOCEntryTypeForMO(MO), VK);
@@ -1301,8 +1300,10 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
unsigned Op = MI->getOpcode();
- // Change the opcode to load address for tocdata
- TmpInst.setOpcode(Op == PPC::ADDItocL8 ? PPC::ADDI8 : PPC::LA);
+ // Change the opcode to load address for toc-data.
+ // ADDItocL is only used for 32-bit toc-data on AIX and will always use LA.
+ TmpInst.setOpcode(Op == PPC::ADDItocL8 ? (IsAIX ? PPC::LA8 : PPC::ADDI8)
+ : PPC::LA);
const MachineOperand &MO = MI->getOperand(2);
assert((Op == PPC::ADDItocL8)
@@ -1316,8 +1317,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
const MCExpr *Exp = MCSymbolRefExpr::create(
MOSymbol,
- Op == PPC::ADDItocL8 ? MCSymbolRefExpr::VK_PPC_TOC_LO
- : MCSymbolRefExpr::VK_PPC_L,
+ IsAIX ? MCSymbolRefExpr::VK_PPC_L : MCSymbolRefExpr::VK_PPC_TOC_LO,
OutContext);
TmpInst.getOperand(2) = MCOperand::createExpr(Exp);
@@ -2831,8 +2831,10 @@ void PPCAIXAsmPrinter::emitGlobalVariableHelper(const GlobalVariable *GV) {
// When -fdata-sections is enabled, every GlobalVariable will
// be put into its own csect; therefore, label is not necessary here.
- if (!TM.getDataSections() || GV->hasSection())
- OutStreamer->emitLabel(EmittedInitSym);
+ if (!TM.getDataSections() || GV->hasSection()) {
+ if (Csect->getMappingClass() != XCOFF::XMC_TD)
+ OutStreamer->emitLabel(EmittedInitSym);
+ }
// No alias to emit.
if (!GOAliasMap[GV].size()) {
diff --git a/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/llvm/lib/Target/PowerPC/PPCFastISel.cpp
index 6e31cda..7350506 100644
--- a/llvm/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/llvm/lib/Target/PowerPC/PPCFastISel.cpp
@@ -2074,16 +2074,15 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
if (GV->isThreadLocal())
return 0;
- // If the global has the toc-data attribute then fallback to DAG-ISEL.
- if (TM.getTargetTriple().isOSAIX())
- if (const GlobalVariable *Var = dyn_cast_or_null<GlobalVariable>(GV))
- if (Var->hasAttribute("toc-data"))
- return false;
-
PPCFuncInfo->setUsesTOCBasePtr();
+ bool IsAIXTocData = TM.getTargetTriple().isOSAIX() &&
+ isa<GlobalVariable>(GV) &&
+ cast<GlobalVariable>(GV)->hasAttribute("toc-data");
+
// For small code model, generate a simple TOC load.
if (CModel == CodeModel::Small)
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(PPC::LDtoc),
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
+ IsAIXTocData ? TII.get(PPC::ADDItoc8) : TII.get(PPC::LDtoc),
DestReg)
.addGlobalAddress(GV)
.addReg(PPC::X2);
@@ -2101,6 +2100,7 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
HighPartReg).addReg(PPC::X2).addGlobalAddress(GV);
if (Subtarget->isGVIndirectSymbol(GV)) {
+ assert(!IsAIXTocData && "TOC data should always be direct.");
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(PPC::LDtocL),
DestReg).addGlobalAddress(GV).addReg(HighPartReg);
} else {
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 6862155..26560dc 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -6143,23 +6143,22 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
" ELF/AIX or 32-bit AIX in the following.");
// Transforms the ISD::TOC_ENTRY node for 32-bit AIX large code model mode,
- // or 64-bit medium (ELF-only), or large (ELF and AIX) code model code that
- // does not conain TOC data symbols.
- // We generate two instructions as described below. The first source
- // operand is a symbol reference. If it must be referenced via the toc
- // according to Subtarget, we generate:
+ // 64-bit medium (ELF-only), or 64-bit large (ELF and AIX) code model code
+ // that does not contain TOC data symbols. We generate two instructions as
+ // described below. The first source operand is a symbol reference. If it
+ // must be referenced via the TOC according to Subtarget, we generate:
// [32-bit AIX]
// LWZtocL(@sym, ADDIStocHA(%r2, @sym))
// [64-bit ELF/AIX]
// LDtocL(@sym, ADDIStocHA8(%x2, @sym))
- // Otherwise we generate:
+ // Otherwise for medium code model ELF we generate:
// ADDItocL8(ADDIStocHA8(%x2, @sym), @sym)
- // For large code model with TOC data symbols we generate:
+ // And finally for AIX with toc-data we generate:
// [32-bit AIX]
// ADDItocL(ADDIStocHA(%x2, @sym), @sym)
// [64-bit AIX]
- // Currently not supported.
+ // ADDItocL8(ADDIStocHA8(%x2, @sym), @sym)
SDValue GA = N->getOperand(0);
SDValue TOCbase = N->getOperand(1);
@@ -6171,12 +6170,9 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
// On AIX, if the symbol has the toc-data attribute it will be defined
// in the TOC entry, so we use an ADDItocL/ADDItocL8.
if (isAIXABI && hasTocDataAttr(GA)) {
- if (isPPC64)
- report_fatal_error(
- "64-bit large code model toc-data not yet supported");
-
- ReplaceNode(N, CurDAG->getMachineNode(PPC::ADDItocL, dl, VT,
- SDValue(Tmp, 0), GA));
+ ReplaceNode(
+ N, CurDAG->getMachineNode(isPPC64 ? PPC::ADDItocL8 : PPC::ADDItocL,
+ dl, VT, SDValue(Tmp, 0), GA));
return;
}
@@ -6191,6 +6187,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
return;
}
+ assert(isPPC64 && "TOC_ENTRY already handled for 32-bit.");
// Build the address relative to the TOC-pointer.
ReplaceNode(N, CurDAG->getMachineNode(PPC::ADDItocL8, dl, MVT::i64,
SDValue(Tmp, 0), GA));
@@ -7777,6 +7774,10 @@ void PPCDAGToDAGISel::PeepholePPC64() {
Flags = PPCII::MO_TLSLD_LO;
break;
case PPC::ADDItocL8:
+ // Skip the following peephole optimizations for ADDItocL8 on AIX which
+ // is used for toc-data access.
+ if (Subtarget->isAIXABI())
+ continue;
Flags = PPCII::MO_TOC_LO;
break;
}
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index 9e56de7..85bbfab 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -4438,6 +4438,12 @@ bool PPCInstrInfo::isDefMIElgibleForForwarding(MachineInstr &DefMI,
if (Opc != PPC::ADDItocL8 && Opc != PPC::ADDI && Opc != PPC::ADDI8)
return false;
+ // Skip the optimization of transformTo[NewImm|Imm]FormFedByAdd for ADDItocL8
+ // on AIX which is used for toc-data access. TODO: Follow up to see if it can
+ // apply for AIX toc-data as well.
+ if (Opc == PPC::ADDItocL8 && Subtarget.isAIX())
+ return false;
+
assert(DefMI.getNumOperands() >= 3 &&
"Add inst must have at least three operands");
RegMO = &DefMI.getOperand(1);
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index 7929a78..e3d6d2f 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -3346,7 +3346,7 @@ def ADDIStocHA : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, tocentr
"#ADDIStocHA",
[(set i32:$rD,
(PPCtoc_entry i32:$reg, tglobaladdr:$disp))]>;
-// TOC Data Transform AIX
+// TOC Data Transform on AIX
def ADDItoc : PPCEmitTimePseudo<(outs gprc:$rD), (ins tocentry32:$disp, gprc:$reg),
"#ADDItoc",
[(set i32:$rD,
diff --git a/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp b/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp
index 4c9f5ff..d10fe11 100644
--- a/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp
+++ b/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp
@@ -29,8 +29,10 @@ using namespace llvm;
namespace {
static StringRef MASSVFuncs[] = {
-#define TLI_DEFINE_MASSV_VECFUNCS_NAMES
+#define TLI_DEFINE_MASSV_VECFUNCS
+#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, VABI_PREFIX) VEC,
#include "llvm/Analysis/VecFuncs.def"
+#undef TLI_DEFINE_MASSV_VECFUNCS
};
class PPCLowerMASSVEntries : public ModulePass {
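
The change above stops relying on the removed TLI_DEFINE_MASSV_VECFUNCS_NAMES shortcut and instead expands the TLI_DEFINE_VECFUNC X-macro directly, keeping only the vector-function-name column of VecFuncs.def. A small self-contained sketch of that X-macro pattern, with a placeholder table instead of the real MASSV entries:

#include <cstdio>

// Placeholder table in the same (SCAL, VEC, VF, VABI_PREFIX) shape as
// VecFuncs.def; the entries are made up for illustration.
#define DEMO_VECFUNCS                                                          \
  DEMO_VECFUNC("sin", "__sind2", 2, "demo_prefix")                             \
  DEMO_VECFUNC("cos", "__cosd2", 2, "demo_prefix")                             \
  DEMO_VECFUNC("exp", "__expd2", 2, "demo_prefix")

// Define the per-entry macro to expand to just the VEC column, then pull the
// whole table into an array initializer, exactly like the MASSVFuncs change.
static const char *VecNames[] = {
#define DEMO_VECFUNC(SCAL, VEC, VF, VABI_PREFIX) VEC,
    DEMO_VECFUNCS
#undef DEMO_VECFUNC
};

int main() {
  for (const char *Name : VecNames)
    std::printf("%s\n", Name); // prints __sind2, __cosd2, __expd2
  return 0;
}
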
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
index c73fe2c..dbfcab7 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
@@ -130,6 +130,10 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
getActionDefinitionsBuilder({G_SADDO, G_SSUBO}).minScalar(0, sXLen).lower();
+ // TODO: Use Vector Single-Width Saturating Instructions for vector types.
+ getActionDefinitionsBuilder({G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT})
+ .lower();
+
auto &ShiftActions = getActionDefinitionsBuilder({G_ASHR, G_LSHR, G_SHL});
if (ST.is64Bit())
ShiftActions.customFor({{s32, s32}});
@@ -137,7 +141,8 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
.widenScalarToNextPow2(0)
.clampScalar(1, s32, sXLen)
.clampScalar(0, s32, sXLen)
- .minScalarSameAs(1, 0);
+ .minScalarSameAs(1, 0)
+ .widenScalarToNextPow2(1);
auto &ExtActions =
getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
@@ -344,6 +349,9 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
.widenScalarToNextPow2(0);
}
+ // TODO: Use libcall for sDoubleXLen.
+ getActionDefinitionsBuilder({G_UDIVREM, G_SDIVREM}).lower();
+
auto &AbsActions = getActionDefinitionsBuilder(G_ABS);
if (ST.hasStdExtZbb())
AbsActions.customFor({s32, sXLen}).minScalar(0, sXLen);
@@ -367,6 +375,11 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
G_FABS, G_FSQRT, G_FMAXNUM, G_FMINNUM})
.legalIf(typeIsScalarFPArith(0, ST));
+ getActionDefinitionsBuilder(G_FREM)
+ .libcallFor({s32, s64})
+ .minScalar(0, s32)
+ .scalarize(0);
+
getActionDefinitionsBuilder(G_FCOPYSIGN)
.legalIf(all(typeIsScalarFPArith(0, ST), typeIsScalarFPArith(1, ST)));
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index b099496..a78d789 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -152,7 +152,8 @@ def HasStdExtZimop : Predicate<"Subtarget->hasStdExtZimop()">,
def FeatureStdExtZicfilp
: RISCVExperimentalExtension<"zicfilp", 0, 4,
- "'Zicfilp' (Landing pad)">;
+ "'Zicfilp' (Landing pad)",
+ [FeatureStdExtZicsr]>;
def HasStdExtZicfilp : Predicate<"Subtarget->hasStdExtZicfilp()">,
AssemblerPredicate<(all_of FeatureStdExtZicfilp),
"'Zicfilp' (Landing pad)">;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 06f8569..f0e5a7d 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -688,7 +688,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
ISD::VP_ADD, ISD::VP_SUB, ISD::VP_MUL,
ISD::VP_SDIV, ISD::VP_UDIV, ISD::VP_SREM,
ISD::VP_UREM, ISD::VP_AND, ISD::VP_OR,
- ISD::VP_XOR, ISD::VP_ASHR, ISD::VP_LSHR,
+ ISD::VP_XOR, ISD::VP_SRA, ISD::VP_SRL,
ISD::VP_SHL, ISD::VP_REDUCE_ADD, ISD::VP_REDUCE_AND,
ISD::VP_REDUCE_OR, ISD::VP_REDUCE_XOR, ISD::VP_REDUCE_SMAX,
ISD::VP_REDUCE_SMIN, ISD::VP_REDUCE_UMAX, ISD::VP_REDUCE_UMIN,
@@ -1919,7 +1919,7 @@ bool RISCVTargetLowering::hasAndNotCompare(SDValue Y) const {
return false;
return (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtZbkb()) &&
- !isa<ConstantSDNode>(Y);
+ (!isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque());
}
bool RISCVTargetLowering::hasBitTest(SDValue X, SDValue Y) const {
@@ -5341,7 +5341,7 @@ RISCVTargetLowering::lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op,
SDValue Exp;
// Restore back to original type. Truncation after SRL is to generate vnsrl.
if (Op->isVPOpcode()) {
- Exp = DAG.getNode(ISD::VP_LSHR, DL, IntVT, Bitcast,
+ Exp = DAG.getNode(ISD::VP_SRL, DL, IntVT, Bitcast,
DAG.getConstant(ShiftAmt, DL, IntVT), Mask, VL);
Exp = DAG.getVPZExtOrTrunc(DL, VT, Exp, Mask, VL);
} else {
@@ -5923,9 +5923,9 @@ static unsigned getRISCVVLOp(SDValue Op) {
case ISD::VP_SELECT:
case ISD::VP_MERGE:
return RISCVISD::VMERGE_VL;
- case ISD::VP_ASHR:
+ case ISD::VP_SRA:
return RISCVISD::SRA_VL;
- case ISD::VP_LSHR:
+ case ISD::VP_SRL:
return RISCVISD::SRL_VL;
case ISD::VP_SQRT:
return RISCVISD::FSQRT_VL;
@@ -7010,8 +7010,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
!Subtarget.hasVInstructionsF16()))
return SplitVPOp(Op, DAG);
[[fallthrough]];
- case ISD::VP_ASHR:
- case ISD::VP_LSHR:
+ case ISD::VP_SRA:
+ case ISD::VP_SRL:
case ISD::VP_SHL:
return lowerVPOp(Op, DAG);
case ISD::VP_IS_FPCLASS:
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 9d574ed..ce50fe6 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1560,8 +1560,8 @@ def PseudoJump : Pseudo<(outs GPR:$rd), (ins pseudo_jump_symbol:$target), [],
// -riscv-use-rematerializable-movimm in RISCVISelDAGToDAG.cpp
// It will be expanded after register allocation.
// FIXME: The scheduling information does not reflect the multiple instructions.
-let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Size = 8, isCodeGenOnly = 1,
- isPseudo = 1, isReMaterializable = 1, IsSignExtendingOpW = 1 in
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Size = 8,
+ isReMaterializable = 1 in
def PseudoMovImm : Pseudo<(outs GPR:$dst), (ins i32imm:$imm), []>,
Sched<[WriteIALU]>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
index 0bbf715..b581723 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
@@ -1680,8 +1680,9 @@ let Predicates = [HasVInstructions] in {
let Constraints = "@earlyclobber $vd", RVVConstraint = Vrgather in {
defm VRGATHER_V : VGTR_IV_V_X_I<"vrgather", 0b001100>;
def VRGATHEREI16_VV : VALUVV<0b001110, OPIVV, "vrgatherei16.vv">,
- SchedBinaryMC<"WriteVRGatherVV", "ReadVRGatherVV_data",
- "ReadVRGatherVV_index">;
+ SchedBinaryMC<"WriteVRGatherEI16VV",
+ "ReadVRGatherEI16VV_data",
+ "ReadVRGatherEI16VV_index">;
} // Constraints = "@earlyclobber $vd", RVVConstraint = Vrgather
// Vector Compress Instruction
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 8bf0f25..f2c867a 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -2249,13 +2249,13 @@ multiclass VPseudoBinaryFV_VV_RM<LMULInfo m, string Constraint = "", int sew = 0
UsesVXRM=0>;
}
-multiclass VPseudoVGTR_VV_EEW<int eew, string Constraint = ""> {
+multiclass VPseudoVGTR_EI16_VV<string Constraint = ""> {
foreach m = MxList in {
defvar mx = m.MX;
foreach sew = EEWList in {
defvar dataEMULOctuple = m.octuple;
- // emul = lmul * eew / sew
- defvar idxEMULOctuple = !srl(!mul(dataEMULOctuple, eew), !logtwo(sew));
+ // emul = lmul * 16 / sew
+ defvar idxEMULOctuple = !srl(!mul(dataEMULOctuple, 16), !logtwo(sew));
if !and(!ge(idxEMULOctuple, 1), !le(idxEMULOctuple, 64)) then {
defvar emulMX = octuple_to_str<idxEMULOctuple>.ret;
defvar emul = !cast<LMULInfo>("V_" # emulMX);
@@ -2264,8 +2264,8 @@ multiclass VPseudoVGTR_VV_EEW<int eew, string Constraint = ""> {
defm _VV
: VPseudoBinaryEmul<m.vrclass, m.vrclass, emul.vrclass, m, emul,
Constraint, e>,
- SchedBinary<"WriteVRGatherVV", "ReadVRGatherVV_data",
- "ReadVRGatherVV_index", mx, e, forceMergeOpRead=true>;
+ SchedBinary<"WriteVRGatherEI16VV", "ReadVRGatherEI16VV_data",
+ "ReadVRGatherEI16VV_index", mx, e, forceMergeOpRead=true>;
}
}
}
@@ -6879,8 +6879,7 @@ let Predicates = [HasVInstructionsAnyF] in {
//===----------------------------------------------------------------------===//
let Predicates = [HasVInstructions] in {
defm PseudoVRGATHER : VPseudoVGTR_VV_VX_VI<uimm5, "@earlyclobber $rd">;
-defm PseudoVRGATHEREI16 : VPseudoVGTR_VV_EEW<eew=16,
- Constraint="@earlyclobber $rd">;
+defm PseudoVRGATHEREI16 : VPseudoVGTR_EI16_VV<Constraint = "@earlyclobber $rd">;
//===----------------------------------------------------------------------===//
// 16.5. Vector Compress Instruction
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index a4a5d9e..6ebf9f1 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -85,7 +85,7 @@ def ROCKET : RISCVTuneProcessorModel<"rocket",
def SIFIVE_7 : RISCVTuneProcessorModel<"sifive-7-series",
SiFive7Model,
- [TuneSiFive7]>;
+ [TuneSiFive7, FeaturePostRAScheduler]>;
def SIFIVE_E20 : RISCVProcessorModel<"sifive-e20",
RocketModel,
@@ -145,7 +145,7 @@ def SIFIVE_E76 : RISCVProcessorModel<"sifive-e76",
FeatureStdExtA,
FeatureStdExtF,
FeatureStdExtC],
- [TuneSiFive7]>;
+ [TuneSiFive7, FeaturePostRAScheduler]>;
def SIFIVE_S21 : RISCVProcessorModel<"sifive-s21",
RocketModel,
@@ -189,7 +189,7 @@ def SIFIVE_S76 : RISCVProcessorModel<"sifive-s76",
FeatureStdExtD,
FeatureStdExtC,
FeatureStdExtZihintpause],
- [TuneSiFive7]>;
+ [TuneSiFive7, FeaturePostRAScheduler]>;
def SIFIVE_U54 : RISCVProcessorModel<"sifive-u54",
RocketModel,
@@ -212,7 +212,7 @@ def SIFIVE_U74 : RISCVProcessorModel<"sifive-u74",
FeatureStdExtF,
FeatureStdExtD,
FeatureStdExtC],
- [TuneSiFive7]>;
+ [TuneSiFive7, FeaturePostRAScheduler]>;
def SIFIVE_X280 : RISCVProcessorModel<"sifive-x280", SiFive7Model,
[Feature64Bit,
@@ -230,6 +230,7 @@ def SIFIVE_X280 : RISCVProcessorModel<"sifive-x280", SiFive7Model,
FeatureStdExtZba,
FeatureStdExtZbb],
[TuneSiFive7,
+ FeaturePostRAScheduler,
TuneDLenFactor2]>;
def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", SiFiveP400Model,
@@ -262,7 +263,8 @@ def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", SiFiveP400Model,
[TuneNoDefaultUnroll,
TuneConditionalCompressedMoveFusion,
TuneLUIADDIFusion,
- TuneAUIPCADDIFusion]>;
+ TuneAUIPCADDIFusion,
+ FeaturePostRAScheduler]>;
def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", SiFiveP600Model,
[Feature64Bit,
@@ -302,7 +304,8 @@ def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", SiFiveP600Model,
TuneConditionalCompressedMoveFusion,
TuneLUIADDIFusion,
TuneAUIPCADDIFusion,
- TuneNoSinkSplatOperands]>;
+ TuneNoSinkSplatOperands,
+ FeaturePostRAScheduler]>;
def SYNTACORE_SCR1_BASE : RISCVProcessorModel<"syntacore-scr1-base",
SyntacoreSCR1Model,
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
index 83fb757..b299114 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
@@ -199,7 +199,6 @@ def SiFive7Model : SchedMachineModel {
let LoadLatency = 3;
let MispredictPenalty = 3;
let CompleteModel = 0;
- let PostRAScheduler = true;
let EnableIntervals = true;
let UnsupportedFeatures = [HasStdExtZbkb, HasStdExtZbkc, HasStdExtZbkx,
HasStdExtZcmt, HasStdExtZknd, HasStdExtZkne,
@@ -928,6 +927,7 @@ foreach mx = SchedMxList in {
defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
let Latency = !add(Cycles, 3), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
}
}
@@ -1273,6 +1273,8 @@ defm "" : LMULReadAdvance<"ReadVFSlideV", 0>;
defm "" : LMULReadAdvance<"ReadVFSlideF", 0>;
defm "" : LMULSEWReadAdvance<"ReadVRGatherVV_data", 0>;
defm "" : LMULSEWReadAdvance<"ReadVRGatherVV_index", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVRGatherEI16VV_data", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVRGatherEI16VV_index", 0>;
defm "" : LMULReadAdvance<"ReadVRGatherVX_data", 0>;
defm "" : LMULReadAdvance<"ReadVRGatherVX_index", 0>;
defm "" : LMULReadAdvance<"ReadVRGatherVI_data", 0>;
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td
index a379588..80362ca 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td
@@ -13,7 +13,6 @@ def SiFiveP400Model : SchedMachineModel {
let MicroOpBufferSize = 56; // Max micro-ops that can be buffered.
let LoadLatency = 4; // Cycles for loads to access the cache.
let MispredictPenalty = 9; // Extra cycles for a mispredicted branch.
- let PostRAScheduler = true;
let UnsupportedFeatures = [HasStdExtZbkb, HasStdExtZbkc, HasStdExtZbkx,
HasStdExtZcmt, HasStdExtZknd, HasStdExtZkne,
HasStdExtZknh, HasStdExtZksed, HasStdExtZksh,
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td
index 07d72b6..f0697a1b 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td
@@ -56,7 +56,6 @@ def SiFiveP600Model : SchedMachineModel {
let MicroOpBufferSize = 160; // Max micro-ops that can be buffered.
let LoadLatency = 4; // Cycles for loads to access the cache.
let MispredictPenalty = 9; // Extra cycles for a mispredicted branch.
- let PostRAScheduler = true;
let UnsupportedFeatures = [HasStdExtZbkb, HasStdExtZbkc, HasStdExtZbkx,
HasStdExtZknd, HasStdExtZkne, HasStdExtZknh,
HasStdExtZksed, HasStdExtZksh, HasStdExtZkr,
@@ -716,6 +715,7 @@ foreach mx = ["MF8", "MF4", "MF2", "M1"] in {
defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
let Latency = 3, ReleaseAtCycles = [1] in {
defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [SiFiveP600VEXQ1], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [SiFiveP600VEXQ1], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [SiFiveP600VEXQ1], mx, sew, IsWorstCase>;
}
}
@@ -736,6 +736,7 @@ foreach mx = ["M2", "M4", "M8"] in {
defvar IsWorstCase = SiFiveP600IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
let Latency = 6, ReleaseAtCycles = [LMulLat] in {
defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [SiFiveP600VEXQ1], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [SiFiveP600VEXQ1], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [SiFiveP600VEXQ1], mx, sew, IsWorstCase>;
}
}
@@ -1071,6 +1072,8 @@ defm "" : LMULReadAdvance<"ReadVFSlideV", 0>;
defm "" : LMULReadAdvance<"ReadVFSlideF", 0>;
defm "" : LMULSEWReadAdvance<"ReadVRGatherVV_data", 0>;
defm "" : LMULSEWReadAdvance<"ReadVRGatherVV_index", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVRGatherEI16VV_data", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVRGatherEI16VV_index", 0>;
defm "" : LMULReadAdvance<"ReadVRGatherVX_data", 0>;
defm "" : LMULReadAdvance<"ReadVRGatherVX_index", 0>;
defm "" : LMULReadAdvance<"ReadVRGatherVI_data", 0>;
diff --git a/llvm/lib/Target/RISCV/RISCVScheduleV.td b/llvm/lib/Target/RISCV/RISCVScheduleV.td
index e452418..449611c 100644
--- a/llvm/lib/Target/RISCV/RISCVScheduleV.td
+++ b/llvm/lib/Target/RISCV/RISCVScheduleV.td
@@ -521,6 +521,7 @@ defm "" : LMULSchedWrites<"WriteVISlide1X">;
defm "" : LMULSchedWrites<"WriteVFSlide1F">;
// 16.4. Vector Register Gather Instructions
defm "" : LMULSEWSchedWrites<"WriteVRGatherVV">;
+defm "" : LMULSEWSchedWrites<"WriteVRGatherEI16VV">;
defm "" : LMULSchedWrites<"WriteVRGatherVX">;
defm "" : LMULSchedWrites<"WriteVRGatherVI">;
// 16.5. Vector Compress Instruction
@@ -749,6 +750,8 @@ defm "" : LMULSchedReads<"ReadVFSlideF">;
// 16.4. Vector Register Gather Instructions
defm "" : LMULSEWSchedReads<"ReadVRGatherVV_data">;
defm "" : LMULSEWSchedReads<"ReadVRGatherVV_index">;
+defm "" : LMULSEWSchedReads<"ReadVRGatherEI16VV_data">;
+defm "" : LMULSEWSchedReads<"ReadVRGatherEI16VV_index">;
defm "" : LMULSchedReads<"ReadVRGatherVX_data">;
defm "" : LMULSchedReads<"ReadVRGatherVX_index">;
defm "" : LMULSchedReads<"ReadVRGatherVI_data">;
@@ -956,6 +959,7 @@ defm "" : LMULWriteRes<"WriteVSlideI", []>;
defm "" : LMULWriteRes<"WriteVISlide1X", []>;
defm "" : LMULWriteRes<"WriteVFSlide1F", []>;
defm "" : LMULSEWWriteRes<"WriteVRGatherVV", []>;
+defm "" : LMULSEWWriteRes<"WriteVRGatherEI16VV", []>;
defm "" : LMULWriteRes<"WriteVRGatherVX", []>;
defm "" : LMULWriteRes<"WriteVRGatherVI", []>;
defm "" : LMULSEWWriteRes<"WriteVCompressV", []>;
@@ -1120,6 +1124,8 @@ defm "" : LMULReadAdvance<"ReadVFSlideV", 0>;
defm "" : LMULReadAdvance<"ReadVFSlideF", 0>;
defm "" : LMULSEWReadAdvance<"ReadVRGatherVV_data", 0>;
defm "" : LMULSEWReadAdvance<"ReadVRGatherVV_index", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVRGatherEI16VV_data", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVRGatherEI16VV_index", 0>;
defm "" : LMULReadAdvance<"ReadVRGatherVX_data", 0>;
defm "" : LMULReadAdvance<"ReadVRGatherVX_index", 0>;
defm "" : LMULReadAdvance<"ReadVRGatherVI_data", 0>;
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index c880c9e..347c1bc 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -121,9 +121,7 @@ public:
}
bool enableMachineScheduler() const override { return true; }
- bool enablePostRAScheduler() const override {
- return getSchedModel().PostRAScheduler || UsePostRAScheduler;
- }
+ bool enablePostRAScheduler() const override { return UsePostRAScheduler; }
Align getPrefFunctionAlignment() const {
return Align(TuneInfo->PrefFunctionAlignment);
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index ca82796..176d0e7 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1881,10 +1881,14 @@ unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
const TargetTransformInfo::LSRCost &C2) {
// RISC-V specific here are "instruction number 1st priority".
- return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
+ // If we need to emit adds inside the loop to add up base registers, then
+ // we need at least one extra temporary register.
+ unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
+ unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
+ return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost,
C1.NumIVMuls, C1.NumBaseAdds,
C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
- std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
+ std::tie(C2.Insns, C2NumRegs, C2.AddRecCost,
C2.NumIVMuls, C2.NumBaseAdds,
C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}
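
The tweak above only changes the register component of this lexicographic comparison: a formula that needs base-register adds inside the loop is charged one extra register before std::tie compares the tuples. A standalone sketch with invented numbers (the struct below only mirrors the fields the comparison reads, not the full TargetTransformInfo::LSRCost):

#include <cassert>
#include <tuple>

struct Cost {
  unsigned Insns, NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,
      ImmCost, SetupCost;
};

static bool isLSRCostLess(const Cost &C1, const Cost &C2) {
  // Base-register adds in the loop body need a temporary register, so any
  // candidate with NumBaseAdds != 0 pays for one more register up front.
  unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
  unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
  return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost, C1.NumIVMuls,
                  C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
         std::tie(C2.Insns, C2NumRegs, C2.AddRecCost, C2.NumIVMuls,
                  C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}

int main() {
  // Same instruction and raw register counts, but A needs a base add:
  // after the +1 adjustment, B now wins the register comparison.
  Cost A{/*Insns=*/4, /*NumRegs=*/3, 0, 0, /*NumBaseAdds=*/1, 0, 0, 0};
  Cost B{/*Insns=*/4, /*NumRegs=*/3, 0, 0, /*NumBaseAdds=*/0, 0, 0, 0};
  assert(!isLSRCostLess(A, B) && isLSRCostLess(B, A));
  return 0;
}
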
diff --git a/llvm/lib/Target/SPIRV/CMakeLists.txt b/llvm/lib/Target/SPIRV/CMakeLists.txt
index 7001ac3..fe09d59 100644
--- a/llvm/lib/Target/SPIRV/CMakeLists.txt
+++ b/llvm/lib/Target/SPIRV/CMakeLists.txt
@@ -17,6 +17,7 @@ add_llvm_target(SPIRVCodeGen
SPIRVAsmPrinter.cpp
SPIRVBuiltins.cpp
SPIRVCallLowering.cpp
+ SPIRVInlineAsmLowering.cpp
SPIRVCommandLine.cpp
SPIRVDuplicatesTracker.cpp
SPIRVEmitIntrinsics.cpp
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
index b468b71..5c286ac 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
@@ -321,14 +321,19 @@ void SPIRVInstPrinter::printStringImm(const MCInst *MI, unsigned OpNo,
if (MI->getOperand(StrStartIndex).isReg())
break;
- std::string Str = getSPIRVStringOperand(*MI, OpNo);
+ std::string Str = getSPIRVStringOperand(*MI, StrStartIndex);
if (StrStartIndex != OpNo)
O << ' '; // Add a space if we're starting a new string/argument.
O << '"';
for (char c : Str) {
- if (c == '"')
- O.write('\\'); // Escape " characters (might break for complex UTF-8).
- O.write(c);
+ // Escape ", \n characters (might break for complex UTF-8).
+ if (c == '\n') {
+ O.write("\\n", 2);
+ } else {
+ if (c == '"')
+ O.write('\\');
+ O.write(c);
+ }
}
O << '"';
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
index 9fde26c..424087f 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
@@ -1118,6 +1118,39 @@ static bool generateGroupUniformInst(const SPIRV::IncomingCall *Call,
return true;
}
+static bool generateKernelClockInst(const SPIRV::IncomingCall *Call,
+ MachineIRBuilder &MIRBuilder,
+ SPIRVGlobalRegistry *GR) {
+ const SPIRV::DemangledBuiltin *Builtin = Call->Builtin;
+ MachineFunction &MF = MIRBuilder.getMF();
+ const auto *ST = static_cast<const SPIRVSubtarget *>(&MF.getSubtarget());
+ if (!ST->canUseExtension(SPIRV::Extension::SPV_KHR_shader_clock)) {
+ std::string DiagMsg = std::string(Builtin->Name) +
+ ": the builtin requires the following SPIR-V "
+ "extension: SPV_KHR_shader_clock";
+ report_fatal_error(DiagMsg.c_str(), false);
+ }
+
+ MachineRegisterInfo *MRI = MIRBuilder.getMRI();
+ Register ResultReg = Call->ReturnRegister;
+ MRI->setRegClass(ResultReg, &SPIRV::IDRegClass);
+
+ // Deduce the `Scope` operand from the builtin function name.
+ SPIRV::Scope::Scope ScopeArg =
+ StringSwitch<SPIRV::Scope::Scope>(Builtin->Name)
+ .EndsWith("device", SPIRV::Scope::Scope::Device)
+ .EndsWith("work_group", SPIRV::Scope::Scope::Workgroup)
+ .EndsWith("sub_group", SPIRV::Scope::Scope::Subgroup);
+ Register ScopeReg = buildConstantIntReg(ScopeArg, MIRBuilder, GR);
+
+ MIRBuilder.buildInstr(SPIRV::OpReadClockKHR)
+ .addDef(ResultReg)
+ .addUse(GR->getSPIRVTypeID(Call->ReturnType))
+ .addUse(ScopeReg);
+
+ return true;
+}
+
// These queries ask for a single size_t result for a given dimension index, e.g.
// size_t get_global_id(uint dimindex). In SPIR-V, the builtins corresponding to
// these values are all vec3 types, so we need to extract the correct index or
@@ -2290,6 +2323,8 @@ std::optional<bool> lowerBuiltin(const StringRef DemangledCall,
return generateIntelSubgroupsInst(Call.get(), MIRBuilder, GR);
case SPIRV::GroupUniform:
return generateGroupUniformInst(Call.get(), MIRBuilder, GR);
+ case SPIRV::KernelClock:
+ return generateKernelClockInst(Call.get(), MIRBuilder, GR);
}
return false;
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
index 5640285..692234c 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
@@ -58,6 +58,7 @@ def LoadStore : BuiltinGroup;
def IntelSubgroups : BuiltinGroup;
def AtomicFloating : BuiltinGroup;
def GroupUniform : BuiltinGroup;
+def KernelClock : BuiltinGroup;
//===----------------------------------------------------------------------===//
// Class defining a demangled builtin record. The information in the record
@@ -952,6 +953,14 @@ defm : DemangledGroupBuiltin<"group_scan_exclusive_logical_xor", OnlyWork, OpGro
defm : DemangledGroupBuiltin<"group_scan_inclusive_logical_xor", OnlyWork, OpGroupLogicalXorKHR>;
defm : DemangledGroupBuiltin<"group_reduce_logical_xor", OnlyWork, OpGroupLogicalXorKHR>;
+// cl_khr_kernel_clock / SPV_KHR_shader_clock
+defm : DemangledNativeBuiltin<"clock_read_device", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>;
+defm : DemangledNativeBuiltin<"clock_read_work_group", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>;
+defm : DemangledNativeBuiltin<"clock_read_sub_group", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>;
+defm : DemangledNativeBuiltin<"clock_read_hilo_device", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>;
+defm : DemangledNativeBuiltin<"clock_read_hilo_work_group", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>;
+defm : DemangledNativeBuiltin<"clock_read_hilo_sub_group", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>;
+
//===----------------------------------------------------------------------===//
// Class defining an atomic instruction on floating-point numbers.
//
diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
index 691e6ee..7f53154 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
@@ -47,6 +47,8 @@ static const std::map<std::string, SPIRV::Extension::Extension>
SPIRV::Extension::Extension::SPV_KHR_bit_instructions},
{"SPV_KHR_linkonce_odr",
SPIRV::Extension::Extension::SPV_KHR_linkonce_odr},
+ {"SPV_INTEL_inline_assembly",
+ SPIRV::Extension::Extension::SPV_INTEL_inline_assembly},
{"SPV_INTEL_bfloat16_conversion",
SPIRV::Extension::Extension::SPV_INTEL_bfloat16_conversion},
{"SPV_KHR_subgroup_rotate",
@@ -55,6 +57,8 @@ static const std::map<std::string, SPIRV::Extension::Extension>
SPIRV::Extension::Extension::SPV_INTEL_variable_length_array},
{"SPV_INTEL_function_pointers",
SPIRV::Extension::Extension::SPV_INTEL_function_pointers},
+ {"SPV_KHR_shader_clock",
+ SPIRV::Extension::Extension::SPV_KHR_shader_clock},
};
bool SPIRVExtensionsParser::parse(cl::Option &O, llvm::StringRef ArgName,
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index a1a08c5..ea53fe5 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -140,6 +140,7 @@ public:
Instruction *visitAllocaInst(AllocaInst &I);
Instruction *visitAtomicCmpXchgInst(AtomicCmpXchgInst &I);
Instruction *visitUnreachableInst(UnreachableInst &I);
+ Instruction *visitCallInst(CallInst &I);
StringRef getPassName() const override { return "SPIRV emit intrinsics"; }
@@ -629,6 +630,28 @@ void SPIRVEmitIntrinsics::preprocessCompositeConstants(IRBuilder<> &B) {
}
}
+Instruction *SPIRVEmitIntrinsics::visitCallInst(CallInst &Call) {
+ if (!Call.isInlineAsm())
+ return &Call;
+
+ const InlineAsm *IA = cast<InlineAsm>(Call.getCalledOperand());
+ LLVMContext &Ctx = F->getContext();
+
+ Constant *TyC = UndefValue::get(IA->getFunctionType());
+ MDString *ConstraintString = MDString::get(Ctx, IA->getConstraintString());
+ SmallVector<Value *> Args = {
+ MetadataAsValue::get(Ctx,
+ MDNode::get(Ctx, ValueAsMetadata::getConstant(TyC))),
+ MetadataAsValue::get(Ctx, MDNode::get(Ctx, ConstraintString))};
+ for (unsigned OpIdx = 0; OpIdx < Call.arg_size(); OpIdx++)
+ Args.push_back(Call.getArgOperand(OpIdx));
+
+ IRBuilder<> B(Call.getParent());
+ B.SetInsertPoint(&Call);
+ B.CreateIntrinsic(Intrinsic::spv_inline_asm, {}, {Args});
+ return &Call;
+}
+
Instruction *SPIRVEmitIntrinsics::visitSwitchInst(SwitchInst &I) {
BasicBlock *ParentBB = I.getParent();
IRBuilder<> B(ParentBB);
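visitCallInst above only rewrites inline-asm call sites: the asm's function type and constraint string travel as the first two (metadata) operands of the spv_inline_asm intrinsic, followed by the original call arguments, and are consumed later by insertInlineAsmProcess in SPIRVPreLegalizer.cpp (further down in this patch). A rough sketch, not code from the patch, of how those two operands can be unpacked from a call produced exactly as above:

    #include "llvm/ADT/StringRef.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/Metadata.h"

    using namespace llvm;

    // Illustrative only; mirrors the packing done in visitCallInst.
    static void unpackInlineAsmOperands(const CallInst &CI, Type *&AsmFnTy,
                                        StringRef &Constraints) {
      // Operand 0: MDNode wrapping a constant whose type is the asm FunctionType.
      Metadata *TyMD = cast<MetadataAsValue>(CI.getArgOperand(0))->getMetadata();
      AsmFnTy = cast<ValueAsMetadata>(cast<MDNode>(TyMD)->getOperand(0))->getType();
      // Operand 1: MDNode wrapping the constraint string.
      Metadata *StrMD = cast<MetadataAsValue>(CI.getArgOperand(1))->getMetadata();
      Constraints = cast<MDString>(cast<MDNode>(StrMD)->getOperand(0))->getString();
      // Operands 2..N-1 are the original inline-asm call arguments.
    }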
diff --git a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
index 96b4a570a..2bd22bb 100644
--- a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
@@ -82,6 +82,28 @@ bool SPIRVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return false;
}
+std::pair<unsigned, const TargetRegisterClass *>
+SPIRVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint,
+ MVT VT) const {
+ const TargetRegisterClass *RC = nullptr;
+ if (Constraint.starts_with("{"))
+ return std::make_pair(0u, RC);
+
+ if (VT.isFloatingPoint())
+ RC = VT.isVector() ? &SPIRV::vfIDRegClass
+ : (VT.getScalarSizeInBits() > 32 ? &SPIRV::fID64RegClass
+ : &SPIRV::fIDRegClass);
+ else if (VT.isInteger())
+ RC = VT.isVector() ? &SPIRV::vIDRegClass
+ : (VT.getScalarSizeInBits() > 32 ? &SPIRV::ID64RegClass
+ : &SPIRV::IDRegClass);
+ else
+ RC = &SPIRV::IDRegClass;
+
+ return std::make_pair(0u, RC);
+}
+
// Insert a bitcast before the instruction to keep SPIR-V code valid
// when there is a type mismatch between results and operand types.
static void validatePtrTypes(const SPIRVSubtarget &STI,
diff --git a/llvm/lib/Target/SPIRV/SPIRVISelLowering.h b/llvm/lib/Target/SPIRV/SPIRVISelLowering.h
index 8c1de7d..6fc200a 100644
--- a/llvm/lib/Target/SPIRV/SPIRVISelLowering.h
+++ b/llvm/lib/Target/SPIRV/SPIRVISelLowering.h
@@ -55,6 +55,15 @@ public:
MachineFunction &MF,
unsigned Intrinsic) const override;
+ std::pair<unsigned, const TargetRegisterClass *>
+ getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint, MVT VT) const override;
+ unsigned
+ getNumRegisters(LLVMContext &Context, EVT VT,
+ std::optional<MVT> RegisterVT = std::nullopt) const override {
+ return 1;
+ }
+
// Call the default implementation and finalize target lowering by inserting
// extra instructions required to preserve validity of SPIR-V code imposed by
// the standard.
diff --git a/llvm/lib/Target/SPIRV/SPIRVInlineAsmLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVInlineAsmLowering.cpp
new file mode 100644
index 0000000..8bd4fb6
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVInlineAsmLowering.cpp
@@ -0,0 +1,46 @@
+//===--- SPIRVInlineAsmLowering.cpp - Inline Asm lowering -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the lowering of LLVM inline asm calls to machine code
+// calls for GlobalISel.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPIRVInlineAsmLowering.h"
+#include "SPIRVSubtarget.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsSPIRV.h"
+
+using namespace llvm;
+
+SPIRVInlineAsmLowering::SPIRVInlineAsmLowering(const SPIRVTargetLowering &TLI)
+ : InlineAsmLowering(&TLI) {}
+
+bool SPIRVInlineAsmLowering::lowerAsmOperandForConstraint(
+ Value *Val, StringRef Constraint, std::vector<MachineOperand> &Ops,
+ MachineIRBuilder &MIRBuilder) const {
+ Value *ValOp = nullptr;
+ if (isa<ConstantInt>(Val)) {
+ ValOp = Val;
+ } else if (ConstantFP *CFP = dyn_cast<ConstantFP>(Val)) {
+ Ops.push_back(MachineOperand::CreateFPImm(CFP));
+ return true;
+ } else if (auto *II = dyn_cast<IntrinsicInst>(Val)) {
+ if (II->getIntrinsicID() == Intrinsic::spv_track_constant) {
+ if (isa<ConstantInt>(II->getOperand(0))) {
+ ValOp = II->getOperand(0);
+ } else if (ConstantFP *CFP = dyn_cast<ConstantFP>(II->getOperand(0))) {
+ Ops.push_back(MachineOperand::CreateFPImm(CFP));
+ return true;
+ }
+ }
+ }
+ return ValOp ? InlineAsmLowering::lowerAsmOperandForConstraint(
+ ValOp, Constraint, Ops, MIRBuilder)
+ : false;
+}
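The hook above only needs special handling for constants: floating-point constants (possibly wrapped in an spv_track_constant intrinsic) become FP immediate machine operands directly, while integer constants are handed back to the generic InlineAsmLowering::lowerAsmOperandForConstraint. The FP case in isolation, as a minimal sketch under those assumptions:

    #include "llvm/CodeGen/MachineOperand.h"
    #include "llvm/IR/Constants.h"
    #include <vector>

    using namespace llvm;

    // Returns true if Val was consumed as an FP immediate operand; anything
    // else is left for the generic constraint lowering.
    static bool tryLowerFPConstraint(Value *Val, std::vector<MachineOperand> &Ops) {
      if (auto *CFP = dyn_cast<ConstantFP>(Val)) {
        Ops.push_back(MachineOperand::CreateFPImm(CFP));
        return true;
      }
      return false;
    }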
diff --git a/llvm/lib/Target/SPIRV/SPIRVInlineAsmLowering.h b/llvm/lib/Target/SPIRV/SPIRVInlineAsmLowering.h
new file mode 100644
index 0000000..7229185
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVInlineAsmLowering.h
@@ -0,0 +1,33 @@
+//===--- SPIRVInlineAsmLowering.h - Inline Asm lowering ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes how to lower LLVM inline asm calls to machine
+// code calls for GlobalISel.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPIRV_SPIRVINLINEASMLOWERING_H
+#define LLVM_LIB_TARGET_SPIRV_SPIRVINLINEASMLOWERING_H
+
+#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
+
+namespace llvm {
+
+class SPIRVTargetLowering;
+
+class SPIRVInlineAsmLowering : public InlineAsmLowering {
+public:
+ SPIRVInlineAsmLowering(const SPIRVTargetLowering &TLI);
+ bool
+ lowerAsmOperandForConstraint(Value *Val, StringRef Constraint,
+ std::vector<MachineOperand> &Ops,
+ MachineIRBuilder &MIRBuilder) const override;
+};
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_SPIRV_SPIRVINLINEASMLOWERING_H
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
index af98f2f..12cf761 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
@@ -47,6 +47,16 @@ bool SPIRVInstrInfo::isConstantInstr(const MachineInstr &MI) const {
}
}
+bool SPIRVInstrInfo::isInlineAsmDefInstr(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ case SPIRV::OpAsmTargetINTEL:
+ case SPIRV::OpAsmINTEL:
+ return true;
+ default:
+ return false;
+ }
+}
+
bool SPIRVInstrInfo::isTypeDeclInstr(const MachineInstr &MI) const {
auto &MRI = MI.getMF()->getRegInfo();
if (MI.getNumDefs() >= 1 && MI.getOperand(0).isReg()) {
@@ -246,7 +256,8 @@ void SPIRVInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
bool SPIRVInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
- if (MI.getOpcode() == SPIRV::GET_ID || MI.getOpcode() == SPIRV::GET_fID ||
+ if (MI.getOpcode() == SPIRV::GET_ID || MI.getOpcode() == SPIRV::GET_ID64 ||
+ MI.getOpcode() == SPIRV::GET_fID || MI.getOpcode() == SPIRV::GET_fID64 ||
MI.getOpcode() == SPIRV::GET_pID32 ||
MI.getOpcode() == SPIRV::GET_pID64 || MI.getOpcode() == SPIRV::GET_vfID ||
MI.getOpcode() == SPIRV::GET_vID || MI.getOpcode() == SPIRV::GET_vpID32 ||
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h
index 4f2781c..95f3874 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h
@@ -30,6 +30,7 @@ public:
const SPIRVRegisterInfo &getRegisterInfo() const { return RI; }
bool isHeaderInstr(const MachineInstr &MI) const;
bool isConstantInstr(const MachineInstr &MI) const;
+ bool isInlineAsmDefInstr(const MachineInstr &MI) const;
bool isTypeDeclInstr(const MachineInstr &MI) const;
bool isDecorationInstr(const MachineInstr &MI) const;
bool canUseFastMathFlags(const MachineInstr &MI) const;
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
index 151d0ec..7c9b84a 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
@@ -18,7 +18,9 @@ let isCodeGenOnly=1 in {
def ASSIGN_TYPE: Pseudo<(outs ANYID:$dst_id), (ins ANYID:$src_id, TYPE:$src_ty)>;
def DECL_TYPE: Pseudo<(outs ANYID:$dst_id), (ins ANYID:$src_id, TYPE:$src_ty)>;
def GET_ID: Pseudo<(outs ID:$dst_id), (ins ANYID:$src)>;
+ def GET_ID64: Pseudo<(outs ID64:$dst_id), (ins ANYID:$src)>;
def GET_fID: Pseudo<(outs fID:$dst_id), (ins ANYID:$src)>;
+ def GET_fID64: Pseudo<(outs fID64:$dst_id), (ins ANYID:$src)>;
def GET_pID32: Pseudo<(outs pID32:$dst_id), (ins ANYID:$src)>;
def GET_pID64: Pseudo<(outs pID64:$dst_id), (ins ANYID:$src)>;
def GET_vID: Pseudo<(outs vID:$dst_id), (ins ANYID:$src)>;
@@ -802,6 +804,11 @@ def OpGroupNonUniformRotateKHR: Op<4431, (outs ID:$res),
(ins TYPE:$type, ID:$scope, ID:$value, ID:$delta, variable_ops),
"$res = OpGroupNonUniformRotateKHR $type $scope $value $delta">;
+// SPV_KHR_shader_clock
+def OpReadClockKHR: Op<5056, (outs ID:$res),
+ (ins TYPE:$type, ID:$scope),
+ "$res = OpReadClockKHR $type $scope">;
+
// 3.49.7, Constant-Creation Instructions
// - SPV_INTEL_function_pointers
@@ -849,3 +856,11 @@ def OpGroupLogicalOrKHR: Op<6407, (outs ID:$res), (ins TYPE:$type, ID:$scope, i3
"$res = OpGroupLogicalOrKHR $type $scope $groupOp $value">;
def OpGroupLogicalXorKHR: Op<6408, (outs ID:$res), (ins TYPE:$type, ID:$scope, i32imm:$groupOp, ID:$value),
"$res = OpGroupLogicalXorKHR $type $scope $groupOp $value">;
+
+// Inline Assembly Instructions
+def OpAsmTargetINTEL: Op<5609, (outs ID:$res), (ins StringImm:$str), "$res = OpAsmTargetINTEL $str">;
+def OpAsmINTEL: Op<5610, (outs ID:$res), (ins TYPE:$type, TYPE:$asm_type, ID:$target,
+ StringImm:$asm, StringImm:$constraints),
+ "$res = OpAsmINTEL $type $asm_type $target $asm">;
+def OpAsmCallINTEL: Op<5611, (outs ID:$res), (ins TYPE:$type, ID:$asm, variable_ops),
+ "$res = OpAsmCallINTEL $type $asm">;
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index 235f947..c86ab28 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -1117,6 +1117,14 @@ void addInstrRequirements(const MachineInstr &MI,
Reqs.addCapability(SPIRV::Capability::GroupUniformArithmeticKHR);
}
break;
+ case SPIRV::OpReadClockKHR:
+ if (!ST.canUseExtension(SPIRV::Extension::SPV_KHR_shader_clock))
+ report_fatal_error("OpReadClockKHR instruction requires the "
+ "following SPIR-V extension: SPV_KHR_shader_clock",
+ false);
+ Reqs.addExtension(SPIRV::Extension::SPV_KHR_shader_clock);
+ Reqs.addCapability(SPIRV::Capability::ShaderClockKHR);
+ break;
case SPIRV::OpFunctionPointerCallINTEL:
if (ST.canUseExtension(SPIRV::Extension::SPV_INTEL_function_pointers)) {
Reqs.addExtension(SPIRV::Extension::SPV_INTEL_function_pointers);
@@ -1143,6 +1151,14 @@ void addInstrRequirements(const MachineInstr &MI,
Reqs.addCapability(SPIRV::Capability::VariableLengthArrayINTEL);
}
break;
+ case SPIRV::OpAsmTargetINTEL:
+ case SPIRV::OpAsmINTEL:
+ case SPIRV::OpAsmCallINTEL:
+ if (ST.canUseExtension(SPIRV::Extension::SPV_INTEL_inline_assembly)) {
+ Reqs.addExtension(SPIRV::Extension::SPV_INTEL_inline_assembly);
+ Reqs.addCapability(SPIRV::Capability::AsmINTEL);
+ }
+ break;
default:
break;
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp
index d652b5d..c3842f0 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp
@@ -54,7 +54,8 @@ extern void processInstr(MachineInstr &MI, MachineIRBuilder &MIB,
} // namespace llvm
static bool isMetaInstrGET(unsigned Opcode) {
- return Opcode == SPIRV::GET_ID || Opcode == SPIRV::GET_fID ||
+ return Opcode == SPIRV::GET_ID || Opcode == SPIRV::GET_ID64 ||
+ Opcode == SPIRV::GET_fID || Opcode == SPIRV::GET_fID64 ||
Opcode == SPIRV::GET_pID32 || Opcode == SPIRV::GET_pID64 ||
Opcode == SPIRV::GET_vID || Opcode == SPIRV::GET_vfID ||
Opcode == SPIRV::GET_vpID32 || Opcode == SPIRV::GET_vpID64;
diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
index 9bff23d..85299a4 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
@@ -215,6 +215,8 @@ static SPIRVType *propagateSPIRVType(MachineInstr *MI, SPIRVGlobalRegistry *GR,
SpirvTy = GR->getOrCreateSPIRVType(Ty, MIB);
break;
}
+ case TargetOpcode::G_ANYEXT:
+ case TargetOpcode::G_SEXT:
case TargetOpcode::G_ZEXT: {
if (MI->getOperand(1).isReg()) {
if (MachineInstr *DefInstr =
@@ -457,12 +459,7 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
Ty = VectorType::get(ElemTy, NumElts, false);
}
insertAssignInstr(Reg, Ty, nullptr, GR, MIB, MRI);
- } else if (MI.getOpcode() == TargetOpcode::G_TRUNC ||
- MI.getOpcode() == TargetOpcode::G_ZEXT ||
- MI.getOpcode() == TargetOpcode::G_PTRTOINT ||
- MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE ||
- MI.getOpcode() == TargetOpcode::COPY ||
- MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST) {
+ } else if (MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE) {
propagateSPIRVType(&MI, GR, MRI, MIB);
}
@@ -474,6 +471,24 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
}
for (MachineInstr *MI : ToErase)
MI->eraseFromParent();
+
+ // Address the case when the IRTranslator introduces instructions with new
+ // registers that have no SPIRVType associated with them.
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_TRUNC:
+ case TargetOpcode::G_ANYEXT:
+ case TargetOpcode::G_SEXT:
+ case TargetOpcode::G_ZEXT:
+ case TargetOpcode::G_PTRTOINT:
+ case TargetOpcode::COPY:
+ case TargetOpcode::G_ADDRSPACE_CAST:
+ propagateSPIRVType(&MI, GR, MRI, MIB);
+ break;
+ }
+ }
+ }
}
// Defined in SPIRVLegalizerInfo.cpp.
@@ -519,6 +534,128 @@ static void processInstrsWithTypeFolding(MachineFunction &MF,
}
}
+static void
+insertInlineAsmProcess(MachineFunction &MF, SPIRVGlobalRegistry *GR,
+ const SPIRVSubtarget &ST, MachineIRBuilder MIRBuilder,
+ const SmallVector<MachineInstr *> &ToProcess) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ Register AsmTargetReg;
+ for (unsigned i = 0, Sz = ToProcess.size(); i + 1 < Sz; i += 2) {
+ MachineInstr *I1 = ToProcess[i], *I2 = ToProcess[i + 1];
+ assert(isSpvIntrinsic(*I1, Intrinsic::spv_inline_asm) && I2->isInlineAsm());
+ MIRBuilder.setInsertPt(*I1->getParent(), *I1);
+
+ if (!AsmTargetReg.isValid()) {
+ // define vendor specific assembly target or dialect
+ AsmTargetReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
+ MRI.setRegClass(AsmTargetReg, &SPIRV::IDRegClass);
+ auto AsmTargetMIB =
+ MIRBuilder.buildInstr(SPIRV::OpAsmTargetINTEL).addDef(AsmTargetReg);
+ addStringImm(ST.getTargetTripleAsStr(), AsmTargetMIB);
+ GR->add(AsmTargetMIB.getInstr(), &MF, AsmTargetReg);
+ }
+
+ // create types
+ const MDNode *IAMD = I1->getOperand(1).getMetadata();
+ FunctionType *FTy = cast<FunctionType>(getMDOperandAsType(IAMD, 0));
+ SmallVector<SPIRVType *, 4> ArgTypes;
+ for (const auto &ArgTy : FTy->params())
+ ArgTypes.push_back(GR->getOrCreateSPIRVType(ArgTy, MIRBuilder));
+ SPIRVType *RetType =
+ GR->getOrCreateSPIRVType(FTy->getReturnType(), MIRBuilder);
+ SPIRVType *FuncType = GR->getOrCreateOpTypeFunctionWithArgs(
+ FTy, RetType, ArgTypes, MIRBuilder);
+
+ // define vendor specific assembly instructions string
+ Register AsmReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
+ MRI.setRegClass(AsmReg, &SPIRV::IDRegClass);
+ auto AsmMIB = MIRBuilder.buildInstr(SPIRV::OpAsmINTEL)
+ .addDef(AsmReg)
+ .addUse(GR->getSPIRVTypeID(RetType))
+ .addUse(GR->getSPIRVTypeID(FuncType))
+ .addUse(AsmTargetReg);
+ // inline asm string:
+ addStringImm(I2->getOperand(InlineAsm::MIOp_AsmString).getSymbolName(),
+ AsmMIB);
+ // inline asm constraint string:
+ addStringImm(cast<MDString>(I1->getOperand(2).getMetadata()->getOperand(0))
+ ->getString(),
+ AsmMIB);
+ GR->add(AsmMIB.getInstr(), &MF, AsmReg);
+
+ // Build the call of the inline assembly instruction.
+ unsigned ExtraInfo = I2->getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
+ if (ExtraInfo & InlineAsm::Extra_HasSideEffects)
+ MIRBuilder.buildInstr(SPIRV::OpDecorate)
+ .addUse(AsmReg)
+ .addImm(static_cast<uint32_t>(SPIRV::Decoration::SideEffectsINTEL));
+ Register DefReg;
+ SmallVector<unsigned, 4> Ops;
+ unsigned StartOp = InlineAsm::MIOp_FirstOperand,
+ AsmDescOp = InlineAsm::MIOp_FirstOperand;
+ unsigned I2Sz = I2->getNumOperands();
+ for (unsigned Idx = StartOp; Idx != I2Sz; ++Idx) {
+ const MachineOperand &MO = I2->getOperand(Idx);
+ if (MO.isMetadata())
+ continue;
+ if (Idx == AsmDescOp && MO.isImm()) {
+ // compute the index of the next operand descriptor
+ const InlineAsm::Flag F(MO.getImm());
+ AsmDescOp += 1 + F.getNumOperandRegisters();
+ } else {
+ if (MO.isReg() && MO.isDef())
+ DefReg = MO.getReg();
+ else
+ Ops.push_back(Idx);
+ }
+ }
+ if (!DefReg.isValid()) {
+ DefReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
+ MRI.setRegClass(DefReg, &SPIRV::IDRegClass);
+ SPIRVType *VoidType = GR->getOrCreateSPIRVType(
+ Type::getVoidTy(MF.getFunction().getContext()), MIRBuilder);
+ GR->assignSPIRVTypeToVReg(VoidType, DefReg, MF);
+ }
+ auto AsmCall = MIRBuilder.buildInstr(SPIRV::OpAsmCallINTEL)
+ .addDef(DefReg)
+ .addUse(GR->getSPIRVTypeID(RetType))
+ .addUse(AsmReg);
+ unsigned IntrIdx = 2;
+ for (unsigned Idx : Ops) {
+ ++IntrIdx;
+ const MachineOperand &MO = I2->getOperand(Idx);
+ if (MO.isReg())
+ AsmCall.addUse(MO.getReg());
+ else
+ AsmCall.addUse(I1->getOperand(IntrIdx).getReg());
+ }
+ }
+ for (MachineInstr *MI : ToProcess)
+ MI->eraseFromParent();
+}
+
+static void insertInlineAsm(MachineFunction &MF, SPIRVGlobalRegistry *GR,
+ const SPIRVSubtarget &ST,
+ MachineIRBuilder MIRBuilder) {
+ SmallVector<MachineInstr *> ToProcess;
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if (isSpvIntrinsic(MI, Intrinsic::spv_inline_asm) ||
+ MI.getOpcode() == TargetOpcode::INLINEASM)
+ ToProcess.push_back(&MI);
+ }
+ }
+ if (ToProcess.size() == 0)
+ return;
+
+ if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_inline_assembly))
+ report_fatal_error("Inline assembly instructions require the "
+ "following SPIR-V extension: SPV_INTEL_inline_assembly",
+ false);
+
+ insertInlineAsmProcess(MF, GR, ST, MIRBuilder, ToProcess);
+}
+
static void insertSpirvDecorations(MachineFunction &MF, MachineIRBuilder MIB) {
SmallVector<MachineInstr *, 10> ToErase;
for (MachineBasicBlock &MBB : MF) {
@@ -673,6 +810,7 @@ bool SPIRVPreLegalizer::runOnMachineFunction(MachineFunction &MF) {
processInstrsWithTypeFolding(MF, GR, MIB);
removeImplicitFallthroughs(MF, MIB);
insertSpirvDecorations(MF, MIB);
+ insertInlineAsm(MF, GR, ST, MIB);
return true;
}
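insertInlineAsm above collects each spv_inline_asm intrinsic together with its following INLINEASM machine instruction and, provided SPV_INTEL_inline_assembly is enabled (see the SPIRVCommandLine.cpp change earlier in this patch), expands the pair into OpAsmTargetINTEL, OpAsmINTEL (decorated SideEffectsINTEL when the asm has side effects) and OpAsmCallINTEL. A minimal source-level example of the kind of input this path is meant to handle; how it reaches the backend depends on the front end and on the extension being enabled:

    // GNU-style inline asm; 'volatile' marks the asm as having side effects,
    // which the pass above translates into a SideEffectsINTEL decoration on
    // the emitted OpAsmINTEL.
    extern "C" void emit_nop() {
      asm volatile("nop");
    }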
diff --git a/llvm/lib/Target/SPIRV/SPIRVRegisterBanks.td b/llvm/lib/Target/SPIRV/SPIRVRegisterBanks.td
index dea2ef4..e81d969 100644
--- a/llvm/lib/Target/SPIRV/SPIRVRegisterBanks.td
+++ b/llvm/lib/Target/SPIRV/SPIRVRegisterBanks.td
@@ -10,4 +10,4 @@
// as InstructionSelector RegClass checking code relies on them
def TYPERegBank : RegisterBank<"TYPEBank", [TYPE]>;
-def IDRegBank : RegisterBank<"IDBank", [ID, fID, pID32, pID64, vID, vfID, vpID32, vpID64]>;
+def IDRegBank : RegisterBank<"IDBank", [ID, ID64, fID, fID64, pID32, pID64, vID, vfID, vpID32, vpID64]>;
diff --git a/llvm/lib/Target/SPIRV/SPIRVRegisterInfo.td b/llvm/lib/Target/SPIRV/SPIRVRegisterInfo.td
index 9231d22..17f6ba5 100644
--- a/llvm/lib/Target/SPIRV/SPIRVRegisterInfo.td
+++ b/llvm/lib/Target/SPIRV/SPIRVRegisterInfo.td
@@ -29,7 +29,9 @@ let Namespace = "SPIRV" in {
// Class for non-type registers
def ID0 : Register<"ID0">;
+ def ID640 : Register<"ID640">;
def fID0 : Register<"fID0">;
+ def fID640 : Register<"fID640">;
def pID320 : Register<"pID320">;
def pID640 : Register<"pID640">;
def vID0 : Register<"vID0">;
@@ -38,7 +40,9 @@ let Namespace = "SPIRV" in {
def vpID640 : Register<"vpID640">;
def ID : RegisterClass<"SPIRV", [i32], 32, (add ID0)>;
+ def ID64 : RegisterClass<"SPIRV", [i64], 32, (add ID640)>;
def fID : RegisterClass<"SPIRV", [f32], 32, (add fID0)>;
+ def fID64 : RegisterClass<"SPIRV", [f64], 32, (add fID640)>;
def pID32 : RegisterClass<"SPIRV", [p32], 32, (add pID320)>;
def pID64 : RegisterClass<"SPIRV", [p64], 32, (add pID640)>;
def vID : RegisterClass<"SPIRV", [v2i32], 32, (add vID0)>;
@@ -48,9 +52,9 @@ let Namespace = "SPIRV" in {
def ANYID : RegisterClass<
"SPIRV",
- [i32, f32, p32, p64, v2i32, v2f32, v2p32, v2p64],
+ [i32, i64, f32, f64, p32, p64, v2i32, v2f32, v2p32, v2p64],
32,
- (add ID0, fID0, pID320, pID640, vID0, vfID0, vpID320, vpID640)>;
+ (add ID0, ID640, fID0, fID640, pID320, pID640, vID0, vfID0, vpID320, vpID640)>;
// A few instructions like OpName can take ids from both type and non-type
// instructions, so we need a super-class to allow for both to count as valid
diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
index 7aa0c56..2747292 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
@@ -82,6 +82,7 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU,
GR = std::make_unique<SPIRVGlobalRegistry>(PointerSize);
CallLoweringInfo = std::make_unique<SPIRVCallLowering>(TLInfo, GR.get());
+ InlineAsmInfo = std::make_unique<SPIRVInlineAsmLowering>(TLInfo);
Legalizer = std::make_unique<SPIRVLegalizerInfo>(*this);
RegBankInfo = std::make_unique<SPIRVRegisterBankInfo>();
InstSelector.reset(
diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.h b/llvm/lib/Target/SPIRV/SPIRVSubtarget.h
index 3e40440..2112164 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.h
+++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.h
@@ -16,6 +16,7 @@
#include "SPIRVCallLowering.h"
#include "SPIRVFrameLowering.h"
#include "SPIRVISelLowering.h"
+#include "SPIRVInlineAsmLowering.h"
#include "SPIRVInstrInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
@@ -54,6 +55,7 @@ private:
std::unique_ptr<RegisterBankInfo> RegBankInfo;
std::unique_ptr<LegalizerInfo> Legalizer;
std::unique_ptr<InstructionSelector> InstSelector;
+ std::unique_ptr<InlineAsmLowering> InlineAsmInfo;
// TODO: Initialise the available extensions, extended instruction sets
// based on the environment settings.
@@ -81,6 +83,7 @@ public:
TargetTriple.getArch() == Triple::spirv64;
}
bool isVulkanEnv() const { return TargetTriple.getArch() == Triple::spirv; }
+ const std::string &getTargetTripleAsStr() const { return TargetTriple.str(); }
VersionTuple getSPIRVVersion() const { return SPIRVVersion; };
bool isAtLeastSPIRVVer(VersionTuple VerToCompareTo) const;
bool isAtLeastOpenCLVer(VersionTuple VerToCompareTo) const;
@@ -108,6 +111,9 @@ public:
InstructionSelector *getInstructionSelector() const override {
return InstSelector.get();
}
+ const InlineAsmLowering *getInlineAsmLowering() const override {
+ return InlineAsmInfo.get();
+ }
const SPIRVInstrInfo *getInstrInfo() const override { return &InstrInfo; }
const SPIRVFrameLowering *getFrameLowering() const override {
return &FrameLowering;
diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
index 31e19ad..98cbd9d 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
+++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
@@ -298,6 +298,7 @@ defm SPV_INTEL_optnone : ExtensionOperand<103>;
defm SPV_INTEL_function_pointers : ExtensionOperand<104>;
defm SPV_INTEL_variable_length_array : ExtensionOperand<105>;
defm SPV_INTEL_bfloat16_conversion : ExtensionOperand<106>;
+defm SPV_INTEL_inline_assembly : ExtensionOperand<107>;
//===----------------------------------------------------------------------===//
// Multiclass used to define Capabilities enum values and at the same time
@@ -413,6 +414,7 @@ defm ImageGatherBiasLodAMD : CapabilityOperand<5009, 0, 0, [], [Shader]>;
defm FragmentMaskAMD : CapabilityOperand<5010, 0, 0, [], [Shader]>;
defm StencilExportEXT : CapabilityOperand<5013, 0, 0, [], [Shader]>;
defm ImageReadWriteLodAMD : CapabilityOperand<5015, 0, 0, [], [Shader]>;
+defm ShaderClockKHR : CapabilityOperand<5055, 0, 0, [SPV_KHR_shader_clock], []>;
defm SampleMaskOverrideCoverageNV : CapabilityOperand<5249, 0, 0, [], [SampleRateShading]>;
defm GeometryShaderPassthroughNV : CapabilityOperand<5251, 0, 0, [], [Geometry]>;
defm ShaderViewportIndexLayerEXT : CapabilityOperand<5254, 0, 0, [], [MultiViewport]>;
@@ -457,6 +459,7 @@ defm BitInstructions : CapabilityOperand<6025, 0, 0, [SPV_KHR_bit_instructions],
defm ExpectAssumeKHR : CapabilityOperand<5629, 0, 0, [SPV_KHR_expect_assume], []>;
defm FunctionPointersINTEL : CapabilityOperand<5603, 0, 0, [SPV_INTEL_function_pointers], []>;
defm IndirectReferencesINTEL : CapabilityOperand<5604, 0, 0, [SPV_INTEL_function_pointers], []>;
+defm AsmINTEL : CapabilityOperand<5606, 0, 0, [SPV_INTEL_inline_assembly], []>;
defm GroupNonUniformRotateKHR : CapabilityOperand<6026, 0, 0, [SPV_KHR_subgroup_rotate], [GroupNonUniform]>;
defm AtomicFloat32AddEXT : CapabilityOperand<6033, 0, 0, [SPV_EXT_shader_atomic_float_add], []>;
defm AtomicFloat64AddEXT : CapabilityOperand<6034, 0, 0, [SPV_EXT_shader_atomic_float_add], []>;
@@ -1200,6 +1203,8 @@ defm UserSemantic : DecorationOperand<5635, 0, 0, [], []>;
defm RestrictPointerEXT : DecorationOperand<5355, 0, 0, [], [PhysicalStorageBufferAddressesEXT]>;
defm AliasedPointerEXT : DecorationOperand<5356, 0, 0, [], [PhysicalStorageBufferAddressesEXT]>;
defm ReferencedIndirectlyINTEL : DecorationOperand<5602, 0, 0, [], [IndirectReferencesINTEL]>;
+defm ClobberINTEL : DecorationOperand<5607, 0, 0, [SPV_INTEL_inline_assembly], [AsmINTEL]>;
+defm SideEffectsINTEL : DecorationOperand<5608, 0, 0, [SPV_INTEL_inline_assembly], [AsmINTEL]>;
defm ArgumentAttributeINTEL : DecorationOperand<6409, 0, 0, [], [FunctionPointersINTEL]>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/VE/VVPNodes.def b/llvm/lib/Target/VE/VVPNodes.def
index a605886..da1e483 100644
--- a/llvm/lib/Target/VE/VVPNodes.def
+++ b/llvm/lib/Target/VE/VVPNodes.def
@@ -99,8 +99,8 @@ ADD_BINARY_VVP_OP_COMPACT(MUL)
ADD_BINARY_VVP_OP_COMPACT(UDIV)
ADD_BINARY_VVP_OP_COMPACT(SDIV)
-ADD_BINARY_VVP_OP(VVP_SRA,VP_ASHR,SRA) REGISTER_PACKED(VVP_SRA)
-ADD_BINARY_VVP_OP(VVP_SRL,VP_LSHR,SRL) REGISTER_PACKED(VVP_SRL)
+ADD_BINARY_VVP_OP(VVP_SRA,VP_SRA,SRA) REGISTER_PACKED(VVP_SRA)
+ADD_BINARY_VVP_OP(VVP_SRL,VP_SRL,SRL) REGISTER_PACKED(VVP_SRL)
ADD_BINARY_VVP_OP_COMPACT(SHL) REGISTER_PACKED(VVP_SHL)
ADD_BINARY_VVP_OP_COMPACT(AND) REGISTER_PACKED(VVP_AND)
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
index d4e9fb0..3450217 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
@@ -345,6 +345,8 @@ inline bool isArgument(unsigned Opc) {
case WebAssembly::ARGUMENT_v4i32_S:
case WebAssembly::ARGUMENT_v2i64:
case WebAssembly::ARGUMENT_v2i64_S:
+ case WebAssembly::ARGUMENT_v8f16:
+ case WebAssembly::ARGUMENT_v8f16_S:
case WebAssembly::ARGUMENT_v4f32:
case WebAssembly::ARGUMENT_v4f32_S:
case WebAssembly::ARGUMENT_v2f64:
diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp
index fac2e0d..867953b 100644
--- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp
+++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp
@@ -50,6 +50,7 @@ wasm::ValType WebAssembly::toValType(MVT Type) {
case MVT::v8i16:
case MVT::v4i32:
case MVT::v2i64:
+ case MVT::v8f16:
case MVT::v4f32:
case MVT::v2f64:
return wasm::ValType::V128;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index 3524abb..44355853 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -62,7 +62,7 @@ MVT WebAssemblyAsmPrinter::getRegType(unsigned RegNo) const {
const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
const TargetRegisterClass *TRC = MRI->getRegClass(RegNo);
for (MVT T : {MVT::i32, MVT::i64, MVT::f32, MVT::f64, MVT::v16i8, MVT::v8i16,
- MVT::v4i32, MVT::v2i64, MVT::v4f32, MVT::v2f64})
+ MVT::v4i32, MVT::v2i64, MVT::v4f32, MVT::v2f64, MVT::v8f16})
if (TRI->isTypeLegalForClass(*TRC, T))
return T;
LLVM_DEBUG(errs() << "Unknown type for register number: " << RegNo);
@@ -662,6 +662,8 @@ void WebAssemblyAsmPrinter::emitInstruction(const MachineInstr *MI) {
case WebAssembly::ARGUMENT_v4f32_S:
case WebAssembly::ARGUMENT_v2f64:
case WebAssembly::ARGUMENT_v2f64_S:
+ case WebAssembly::ARGUMENT_v8f16:
+ case WebAssembly::ARGUMENT_v8f16_S:
// These represent values which are live into the function entry, so there's
// no instruction to emit.
break;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index 1c62290..26e1394 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -885,18 +885,6 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) {
Table->setNoStrip();
MIB.addImm(0);
}
- // See if we must truncate the function pointer.
- // CALL_INDIRECT takes an i32, but in wasm64 we represent function pointers
- // as 64-bit for uniformity with other pointer types.
- // See also: WebAssemblyISelLowering.cpp: LowerCallResults
- if (Subtarget->hasAddr64()) {
- auto Wrap = BuildMI(*FuncInfo.MBB, std::prev(FuncInfo.InsertPt), MIMD,
- TII.get(WebAssembly::I32_WRAP_I64));
- Register Reg32 = createResultReg(&WebAssembly::I32RegClass);
- Wrap.addReg(Reg32, RegState::Define);
- Wrap.addReg(CalleeReg);
- CalleeReg = Reg32;
- }
}
for (unsigned ArgReg : Args)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 527bb4c..518b693 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -70,6 +70,9 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
addRegisterClass(MVT::v2i64, &WebAssembly::V128RegClass);
addRegisterClass(MVT::v2f64, &WebAssembly::V128RegClass);
}
+ if (Subtarget->hasHalfPrecision()) {
+ addRegisterClass(MVT::v8f16, &WebAssembly::V128RegClass);
+ }
if (Subtarget->hasReferenceTypes()) {
addRegisterClass(MVT::externref, &WebAssembly::EXTERNREFRegClass);
addRegisterClass(MVT::funcref, &WebAssembly::FUNCREFRegClass);
@@ -576,20 +579,6 @@ LowerCallResults(MachineInstr &CallResults, DebugLoc DL, MachineBasicBlock *BB,
const MCInstrDesc &MCID = TII.get(CallOp);
MachineInstrBuilder MIB(MF, MF.CreateMachineInstr(MCID, DL));
- // See if we must truncate the function pointer.
- // CALL_INDIRECT takes an i32, but in wasm64 we represent function pointers
- // as 64-bit for uniformity with other pointer types.
- // See also: WebAssemblyFastISel::selectCall
- if (IsIndirect && MF.getSubtarget<WebAssemblySubtarget>().hasAddr64()) {
- Register Reg32 =
- MF.getRegInfo().createVirtualRegister(&WebAssembly::I32RegClass);
- auto &FnPtr = CallParams.getOperand(0);
- BuildMI(*BB, CallResults.getIterator(), DL,
- TII.get(WebAssembly::I32_WRAP_I64), Reg32)
- .addReg(FnPtr.getReg());
- FnPtr.setReg(Reg32);
- }
-
// Move the function pointer to the end of the arguments for indirect calls
if (IsIndirect) {
auto FnPtr = CallParams.getOperand(0);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index af95dfa..558e3d8 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -38,6 +38,13 @@ multiclass RELAXED_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
asmstr_s, simdop, HasRelaxedSIMD>;
}
+multiclass HALF_PRECISION_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
+ list<dag> pattern_r, string asmstr_r = "",
+ string asmstr_s = "", bits<32> simdop = -1> {
+ defm "" : ABSTRACT_SIMD_I<oops_r, iops_r, oops_s, iops_s, pattern_r, asmstr_r,
+ asmstr_s, simdop, HasHalfPrecision>;
+}
+
defm "" : ARGUMENT<V128, v16i8>;
defm "" : ARGUMENT<V128, v8i16>;
@@ -45,6 +52,7 @@ defm "" : ARGUMENT<V128, v4i32>;
defm "" : ARGUMENT<V128, v2i64>;
defm "" : ARGUMENT<V128, v4f32>;
defm "" : ARGUMENT<V128, v2f64>;
+defm "" : ARGUMENT<V128, v8f16>;
// Constrained immediate argument types. Allow any value from the minimum signed
// value to the maximum unsigned value for the lane size.
@@ -591,6 +599,14 @@ defm "" : Splat<I64x2, 18>;
defm "" : Splat<F32x4, 19>;
defm "" : Splat<F64x2, 20>;
+// Half values are not fully supported so an intrinsic is used instead of a
+// regular Splat pattern as above.
+defm SPLAT_F16x8 :
+ HALF_PRECISION_I<(outs V128:$dst), (ins F32:$x),
+ (outs), (ins),
+ [(set (v8f16 V128:$dst), (int_wasm_splat_f16x8 F32:$x))],
+ "f16x8.splat\t$dst, $x", "f16x8.splat", 0x120>;
+
// scalar_to_vector leaves high lanes undefined, so can be a splat
foreach vec = AllVecs in
def : Pat<(vec.vt (scalar_to_vector (vec.lane_vt vec.lane_rc:$x))),
@@ -644,6 +660,14 @@ def : Pat<
(and (vector_extract (v8i16 V128:$vec), (i32 LaneIdx8:$idx)), (i32 0xffff)),
(EXTRACT_LANE_I16x8_u $vec, imm:$idx)>;
+defm EXTRACT_LANE_F16x8 :
+ HALF_PRECISION_I<(outs F32:$dst), (ins V128:$vec, vec_i8imm_op:$idx),
+ (outs), (ins vec_i8imm_op:$idx),
+ [(set (f32 F32:$dst), (int_wasm_extract_lane_f16x8
+ (v8f16 V128:$vec), (i32 LaneIdx16:$idx)))],
+ "f16x8.extract_lane\t$dst, $vec, $idx",
+ "f16x8.extract_lane\t$idx", 0x121>;
+
// Replace lane value: replace_lane
multiclass ReplaceLane<Vec vec, bits<32> simdop> {
defm REPLACE_LANE_#vec :
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
index ba2936b..4e2faa6 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
@@ -63,7 +63,8 @@ def I32 : WebAssemblyRegClass<[i32], 32, (add FP32, SP32, I32_0)>;
def I64 : WebAssemblyRegClass<[i64], 64, (add FP64, SP64, I64_0)>;
def F32 : WebAssemblyRegClass<[f32], 32, (add F32_0)>;
def F64 : WebAssemblyRegClass<[f64], 64, (add F64_0)>;
-def V128 : WebAssemblyRegClass<[v4f32, v2f64, v2i64, v4i32, v16i8, v8i16], 128,
- (add V128_0)>;
+def V128 : WebAssemblyRegClass<[v8f16, v4f32, v2f64, v2i64, v4i32, v16i8,
+ v8i16],
+ 128, (add V128_0)>;
def FUNCREF : WebAssemblyRegClass<[funcref], 0, (add FUNCREF_0)>;
def EXTERNREF : WebAssemblyRegClass<[externref], 0, (add EXTERNREF_0)>;
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 54642ec..7e8133e 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -124,24 +124,15 @@ def FeatureEVEX512 : SubtargetFeature<"evex512", "HasEVEX512", "true",
def FeatureAVX512 : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512",
"Enable AVX-512 instructions",
[FeatureAVX2, FeatureFMA, FeatureF16C]>;
-def FeatureERI : SubtargetFeature<"avx512er", "HasERI", "true",
- "Enable AVX-512 Exponential and Reciprocal Instructions",
- [FeatureAVX512]>;
def FeatureCDI : SubtargetFeature<"avx512cd", "HasCDI", "true",
"Enable AVX-512 Conflict Detection Instructions",
[FeatureAVX512]>;
def FeatureVPOPCNTDQ : SubtargetFeature<"avx512vpopcntdq", "HasVPOPCNTDQ",
"true", "Enable AVX-512 Population Count Instructions",
[FeatureAVX512]>;
-def FeaturePFI : SubtargetFeature<"avx512pf", "HasPFI", "true",
- "Enable AVX-512 PreFetch Instructions",
- [FeatureAVX512]>;
def FeaturePREFETCHI : SubtargetFeature<"prefetchi", "HasPREFETCHI",
"true",
"Prefetch instruction with T0 or T1 Hint">;
-def FeaturePREFETCHWT1 : SubtargetFeature<"prefetchwt1", "HasPREFETCHWT1",
- "true",
- "Prefetch with Intent to Write and T1 Hint">;
def FeatureDQI : SubtargetFeature<"avx512dq", "HasDQI", "true",
"Enable AVX-512 Doubleword and Quadword Instructions",
[FeatureAVX512]>;
@@ -1312,10 +1303,7 @@ def ProcessorFeatures {
FeatureFSGSBase,
FeatureAVX512,
FeatureEVEX512,
- FeatureERI,
FeatureCDI,
- FeaturePFI,
- FeaturePREFETCHWT1,
FeatureADX,
FeatureRDSEED,
FeatureMOVBE,
diff --git a/llvm/lib/Target/X86/X86FixupBWInsts.cpp b/llvm/lib/Target/X86/X86FixupBWInsts.cpp
index db1d21b..a0c91d4 100644
--- a/llvm/lib/Target/X86/X86FixupBWInsts.cpp
+++ b/llvm/lib/Target/X86/X86FixupBWInsts.cpp
@@ -53,7 +53,6 @@
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSizeOpts.h"
#include "llvm/CodeGen/Passes.h"
@@ -113,8 +112,6 @@ public:
FixupBWInstPass() : MachineFunctionPass(ID) { }
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineLoopInfo>(); // Machine loop info is used to
- // guide some heuristics.
AU.addRequired<ProfileSummaryInfoWrapperPass>();
AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
MachineFunctionPass::getAnalysisUsage(AU);
@@ -141,9 +138,6 @@ private:
/// Local member for function's OptForSize attribute.
bool OptForSize = false;
- /// Machine loop info used for guiding some heruistics.
- MachineLoopInfo *MLI = nullptr;
-
/// Register Liveness information after the current instruction.
LiveRegUnits LiveUnits;
@@ -164,7 +158,6 @@ bool FixupBWInstPass::runOnMachineFunction(MachineFunction &MF) {
this->MF = &MF;
TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
TRI = MF.getRegInfo().getTargetRegisterInfo();
- MLI = &getAnalysis<MachineLoopInfo>();
PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
MBFI = (PSI && PSI->hasProfileSummary()) ?
&getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
diff --git a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
index ea5ef5b..80ff98b 100644
--- a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
+++ b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
@@ -73,7 +73,7 @@ using CondRegArray = std::array<unsigned, X86::LAST_VALID_COND + 1>;
class X86FlagsCopyLoweringPass : public MachineFunctionPass {
public:
- X86FlagsCopyLoweringPass() : MachineFunctionPass(ID) { }
+ X86FlagsCopyLoweringPass() : MachineFunctionPass(ID) {}
StringRef getPassName() const override { return "X86 EFLAGS copy lowering"; }
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -102,32 +102,14 @@ private:
void insertTest(MachineBasicBlock &MBB, MachineBasicBlock::iterator Pos,
const DebugLoc &Loc, unsigned Reg);
- void rewriteArithmetic(MachineBasicBlock &TestMBB,
- MachineBasicBlock::iterator TestPos,
- const DebugLoc &TestLoc, MachineInstr &MI,
- MachineOperand &FlagUse, CondRegArray &CondRegs);
- void rewriteCMov(MachineBasicBlock &TestMBB,
- MachineBasicBlock::iterator TestPos, const DebugLoc &TestLoc,
- MachineInstr &CMovI, MachineOperand &FlagUse,
- CondRegArray &CondRegs);
- void rewriteFCMov(MachineBasicBlock &TestMBB,
- MachineBasicBlock::iterator TestPos,
- const DebugLoc &TestLoc, MachineInstr &CMovI,
- MachineOperand &FlagUse, CondRegArray &CondRegs);
- void rewriteCondJmp(MachineBasicBlock &TestMBB,
- MachineBasicBlock::iterator TestPos,
- const DebugLoc &TestLoc, MachineInstr &JmpI,
- CondRegArray &CondRegs);
- void rewriteCopy(MachineInstr &MI, MachineOperand &FlagUse,
- MachineInstr &CopyDefI);
- void rewriteSetCC(MachineBasicBlock &TestMBB,
- MachineBasicBlock::iterator TestPos,
- const DebugLoc &TestLoc, MachineInstr &SetCCI,
- MachineOperand &FlagUse, CondRegArray &CondRegs);
- void rewriteCCMP(MachineBasicBlock &TestMBB,
- MachineBasicBlock::iterator TestPos, const DebugLoc &TestLoc,
- MachineInstr &CMovI, MachineOperand &FlagUse,
- CondRegArray &CondRegs);
+ void rewriteSetCC(MachineBasicBlock &MBB, MachineBasicBlock::iterator Pos,
+ const DebugLoc &Loc, MachineInstr &MI,
+ CondRegArray &CondRegs);
+ void rewriteArithmetic(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Pos, const DebugLoc &Loc,
+ MachineInstr &MI, CondRegArray &CondRegs);
+ void rewriteMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator Pos,
+ const DebugLoc &Loc, MachineInstr &MI, CondRegArray &CondRegs);
};
} // end anonymous namespace
@@ -148,85 +130,9 @@ void X86FlagsCopyLoweringPass::getAnalysisUsage(AnalysisUsage &AU) const {
MachineFunctionPass::getAnalysisUsage(AU);
}
-namespace {
-/// An enumeration of the arithmetic instruction mnemonics which have
-/// interesting flag semantics.
-///
-/// We can map instruction opcodes into these mnemonics to make it easy to
-/// dispatch with specific functionality.
-enum class FlagArithMnemonic {
- ADC,
- RCL,
- RCR,
- SBB,
- SETB,
-};
-} // namespace
-
-static FlagArithMnemonic getMnemonicFromOpcode(unsigned Opcode) {
- switch (Opcode) {
- default:
- report_fatal_error("No support for lowering a copy into EFLAGS when used "
- "by this instruction!");
-
-#define CASE_ND(OP) \
- case X86::OP: \
- case X86::OP##_ND:
-
-#define LLVM_EXPAND_INSTR_SIZES(MNEMONIC, SUFFIX) \
- CASE_ND(MNEMONIC##8##SUFFIX) \
- CASE_ND(MNEMONIC##16##SUFFIX) \
- CASE_ND(MNEMONIC##32##SUFFIX) \
- CASE_ND(MNEMONIC##64##SUFFIX)
-
-#define LLVM_EXPAND_ADC_SBB_INSTR(MNEMONIC) \
- LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr) \
- LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rm) \
- LLVM_EXPAND_INSTR_SIZES(MNEMONIC, mr) \
- CASE_ND(MNEMONIC##8ri) \
- CASE_ND(MNEMONIC##16ri8) \
- CASE_ND(MNEMONIC##32ri8) \
- CASE_ND(MNEMONIC##64ri8) \
- CASE_ND(MNEMONIC##16ri) \
- CASE_ND(MNEMONIC##32ri) \
- CASE_ND(MNEMONIC##64ri32) \
- CASE_ND(MNEMONIC##8mi) \
- CASE_ND(MNEMONIC##16mi8) \
- CASE_ND(MNEMONIC##32mi8) \
- CASE_ND(MNEMONIC##64mi8) \
- CASE_ND(MNEMONIC##16mi) \
- CASE_ND(MNEMONIC##32mi) \
- CASE_ND(MNEMONIC##64mi32) \
- case X86::MNEMONIC##8i8: \
- case X86::MNEMONIC##16i16: \
- case X86::MNEMONIC##32i32: \
- case X86::MNEMONIC##64i32:
-
- LLVM_EXPAND_ADC_SBB_INSTR(ADC)
- return FlagArithMnemonic::ADC;
-
- LLVM_EXPAND_ADC_SBB_INSTR(SBB)
- return FlagArithMnemonic::SBB;
-
-#undef LLVM_EXPAND_ADC_SBB_INSTR
-
- LLVM_EXPAND_INSTR_SIZES(RCL, rCL)
- LLVM_EXPAND_INSTR_SIZES(RCL, r1)
- LLVM_EXPAND_INSTR_SIZES(RCL, ri)
- return FlagArithMnemonic::RCL;
-
- LLVM_EXPAND_INSTR_SIZES(RCR, rCL)
- LLVM_EXPAND_INSTR_SIZES(RCR, r1)
- LLVM_EXPAND_INSTR_SIZES(RCR, ri)
- return FlagArithMnemonic::RCR;
-
-#undef LLVM_EXPAND_INSTR_SIZES
-#undef CASE_ND
-
- case X86::SETB_C32r:
- case X86::SETB_C64r:
- return FlagArithMnemonic::SETB;
- }
+static bool isArithmeticOp(unsigned Opc) {
+ return X86::isADC(Opc) || X86::isSBB(Opc) || X86::isRCL(Opc) ||
+ X86::isRCR(Opc) || (Opc == X86::SETB_C32r || Opc == X86::SETB_C64r);
}
static MachineBasicBlock &splitBlock(MachineBasicBlock &MBB,
@@ -329,28 +235,6 @@ static MachineBasicBlock &splitBlock(MachineBasicBlock &MBB,
return NewMBB;
}
-static X86::CondCode getCondFromFCMOV(unsigned Opcode) {
- switch (Opcode) {
- default: return X86::COND_INVALID;
- case X86::CMOVBE_Fp32: case X86::CMOVBE_Fp64: case X86::CMOVBE_Fp80:
- return X86::COND_BE;
- case X86::CMOVB_Fp32: case X86::CMOVB_Fp64: case X86::CMOVB_Fp80:
- return X86::COND_B;
- case X86::CMOVE_Fp32: case X86::CMOVE_Fp64: case X86::CMOVE_Fp80:
- return X86::COND_E;
- case X86::CMOVNBE_Fp32: case X86::CMOVNBE_Fp64: case X86::CMOVNBE_Fp80:
- return X86::COND_A;
- case X86::CMOVNB_Fp32: case X86::CMOVNB_Fp64: case X86::CMOVNB_Fp80:
- return X86::COND_AE;
- case X86::CMOVNE_Fp32: case X86::CMOVNE_Fp64: case X86::CMOVNE_Fp80:
- return X86::COND_NE;
- case X86::CMOVNP_Fp32: case X86::CMOVNP_Fp64: case X86::CMOVNP_Fp80:
- return X86::COND_NP;
- case X86::CMOVP_Fp32: case X86::CMOVP_Fp64: case X86::CMOVP_Fp80:
- return X86::COND_P;
- }
-}
-
bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
<< " **********\n");
@@ -362,7 +246,7 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
MDT = &getAnalysis<MachineDominatorTree>();
PromoteRC = &X86::GR8RegClass;
- if (MF.begin() == MF.end())
+ if (MF.empty())
// Nothing to do for a degenerate empty function...
return false;
@@ -569,20 +453,12 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
MachineOperand *FlagUse =
MI.findRegisterUseOperand(X86::EFLAGS, /*TRI=*/nullptr);
- if (!FlagUse) {
- if (MI.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr)) {
- // If EFLAGS are defined, it's as-if they were killed. We can stop
- // scanning here.
- //
- // NB!!! Many instructions only modify some flags. LLVM currently
- // models this as clobbering all flags, but if that ever changes
- // this will need to be carefully updated to handle that more
- // complex logic.
- FlagsKilled = true;
- break;
- }
+ FlagsKilled = MI.modifiesRegister(X86::EFLAGS, TRI);
+
+ if (!FlagUse && FlagsKilled)
+ break;
+ else if (!FlagUse)
continue;
- }
LLVM_DEBUG(dbgs() << " Rewriting use: "; MI.dump());
@@ -604,40 +480,23 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
JmpIs.push_back(&*JmpIt);
++JmpIt;
} while (JmpIt != UseMBB.instr_end() &&
- X86::getCondFromBranch(*JmpIt) !=
- X86::COND_INVALID);
+ X86::getCondFromBranch(*JmpIt) != X86::COND_INVALID);
break;
}
// Otherwise we can just rewrite in-place.
- if (X86::getCondFromCMov(MI) != X86::COND_INVALID ||
- X86::getCondFromCFCMov(MI) != X86::COND_INVALID) {
- rewriteCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
- } else if (getCondFromFCMOV(MI.getOpcode()) != X86::COND_INVALID) {
- rewriteFCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
- } else if (X86::getCondFromSETCC(MI) != X86::COND_INVALID) {
- rewriteSetCC(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
- } else if (X86::getCondFromCCMP(MI) != X86::COND_INVALID) {
- rewriteCCMP(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
- FlagsKilled = true;
- } else if (MI.getOpcode() == TargetOpcode::COPY) {
- rewriteCopy(MI, *FlagUse, CopyDefI);
+ unsigned Opc = MI.getOpcode();
+ if (Opc == TargetOpcode::COPY) {
+ // Just replace this copy with the original copy def.
+ MRI->replaceRegWith(MI.getOperand(0).getReg(),
+ CopyDefI.getOperand(0).getReg());
+ MI.eraseFromParent();
+ } else if (X86::isSETCC(Opc)) {
+ rewriteSetCC(*TestMBB, TestPos, TestLoc, MI, CondRegs);
+ } else if (isArithmeticOp(Opc)) {
+ rewriteArithmetic(*TestMBB, TestPos, TestLoc, MI, CondRegs);
} else {
- // We assume all other instructions that use flags also def them.
- assert(MI.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr) &&
- "Expected a def of EFLAGS for this instruction!");
-
- // NB!!! Several arithmetic instructions only *partially* update
- // flags. Theoretically, we could generate MI code sequences that
- // would rely on this fact and observe different flags independently.
- // But currently LLVM models all of these instructions as clobbering
- // all the flags in an undef way. We rely on that to simplify the
- // logic.
- FlagsKilled = true;
-
- // Generically handle remaining uses as arithmetic instructions.
- rewriteArithmetic(*TestMBB, TestPos, TestLoc, MI, *FlagUse,
- CondRegs);
+ rewriteMI(*TestMBB, TestPos, TestLoc, MI, CondRegs);
}
// If this was the last use of the flags, we're done.
@@ -702,7 +561,7 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
else
LastJmpMBB = JmpI->getParent();
- rewriteCondJmp(*TestMBB, TestPos, TestLoc, *JmpI, CondRegs);
+ rewriteMI(*TestMBB, TestPos, TestLoc, *JmpI, CondRegs);
}
// FIXME: Mark the last use of EFLAGS before the copy's def as a kill if
@@ -753,8 +612,8 @@ Register X86FlagsCopyLoweringPass::promoteCondToReg(
MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
const DebugLoc &TestLoc, X86::CondCode Cond) {
Register Reg = MRI->createVirtualRegister(PromoteRC);
- auto SetI = BuildMI(TestMBB, TestPos, TestLoc,
- TII->get(X86::SETCCr), Reg).addImm(Cond);
+ auto SetI = BuildMI(TestMBB, TestPos, TestLoc, TII->get(X86::SETCCr), Reg)
+ .addImm(Cond);
(void)SetI;
LLVM_DEBUG(dbgs() << " save cond: "; SetI->dump());
++NumSetCCsInserted;
@@ -785,43 +644,66 @@ void X86FlagsCopyLoweringPass::insertTest(MachineBasicBlock &MBB,
++NumTestsInserted;
}
-void X86FlagsCopyLoweringPass::rewriteArithmetic(
- MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
- const DebugLoc &TestLoc, MachineInstr &MI, MachineOperand &FlagUse,
- CondRegArray &CondRegs) {
- // Arithmetic is either reading CF or OF. Figure out which condition we need
- // to preserve in a register.
- X86::CondCode Cond = X86::COND_INVALID;
+void X86FlagsCopyLoweringPass::rewriteSetCC(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Pos,
+ const DebugLoc &Loc,
+ MachineInstr &MI,
+ CondRegArray &CondRegs) {
+ X86::CondCode Cond = X86::getCondFromSETCC(MI);
+ // Note that we can't usefully rewrite this to the inverse without complex
+ // analysis of the users of the setCC. Largely we rely on duplicates that
+ // could have been avoided having already been avoided by this point.
+ unsigned &CondReg = CondRegs[Cond];
+ if (!CondReg)
+ CondReg = promoteCondToReg(MBB, Pos, Loc, Cond);
- // The addend to use to reset CF or OF when added to the flag value.
- int Addend = 0;
-
- switch (getMnemonicFromOpcode(MI.getOpcode())) {
- case FlagArithMnemonic::ADC:
- case FlagArithMnemonic::RCL:
- case FlagArithMnemonic::RCR:
- case FlagArithMnemonic::SBB:
- case FlagArithMnemonic::SETB:
- Cond = X86::COND_B; // CF == 1
- // Set up an addend that when one is added will need a carry due to not
- // having a higher bit available.
- Addend = 255;
- break;
+ // Rewriting a register def is trivial: we just replace the register and
+ // remove the setcc.
+ if (!MI.mayStore()) {
+ assert(MI.getOperand(0).isReg() &&
+ "Cannot have a non-register defined operand to SETcc!");
+ Register OldReg = MI.getOperand(0).getReg();
+ // Drop Kill flags on the old register before replacing. CondReg may have
+ // a longer live range.
+ MRI->clearKillFlags(OldReg);
+ MRI->replaceRegWith(OldReg, CondReg);
+ MI.eraseFromParent();
+ return;
}
+ // Otherwise, we need to emit a store.
+ auto MIB = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
+ TII->get(X86::MOV8mr));
+ // Copy the address operands.
+ for (int i = 0; i < X86::AddrNumOperands; ++i)
+ MIB.add(MI.getOperand(i));
+
+ MIB.addReg(CondReg);
+ MIB.setMemRefs(MI.memoperands());
+ MI.eraseFromParent();
+}
+
+void X86FlagsCopyLoweringPass::rewriteArithmetic(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator Pos,
+ const DebugLoc &Loc, MachineInstr &MI, CondRegArray &CondRegs) {
+ // Arithmetic is either reading CF or OF.
+ X86::CondCode Cond = X86::COND_B; // CF == 1
+ // The addend to use to reset CF or OF when added to the flag value.
+ // Set up an addend that when one is added will need a carry due to not
+ // having a higher bit available.
+ int Addend = 255;
+
// Now get a register that contains the value of the flag input to the
// arithmetic. We require exactly this flag to simplify the arithmetic
// required to materialize it back into the flag.
unsigned &CondReg = CondRegs[Cond];
if (!CondReg)
- CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond);
-
- MachineBasicBlock &MBB = *MI.getParent();
+ CondReg = promoteCondToReg(MBB, Pos, Loc, Cond);
// Insert an instruction that will set the flag back to the desired value.
Register TmpReg = MRI->createVirtualRegister(PromoteRC);
auto AddI =
- BuildMI(MBB, MI.getIterator(), MI.getDebugLoc(),
+ BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
TII->get(Subtarget->hasNDD() ? X86::ADD8ri_ND : X86::ADD8ri))
.addDef(TmpReg, RegState::Dead)
.addReg(CondReg)
@@ -829,177 +711,81 @@ void X86FlagsCopyLoweringPass::rewriteArithmetic(
(void)AddI;
LLVM_DEBUG(dbgs() << " add cond: "; AddI->dump());
++NumAddsInserted;
- FlagUse.setIsKill(true);
+ MI.findRegisterUseOperand(X86::EFLAGS, /*TRI=*/nullptr)->setIsKill(true);
}
-void X86FlagsCopyLoweringPass::rewriteCMov(MachineBasicBlock &TestMBB,
- MachineBasicBlock::iterator TestPos,
- const DebugLoc &TestLoc,
- MachineInstr &CMovI,
- MachineOperand &FlagUse,
- CondRegArray &CondRegs) {
- // First get the register containing this specific condition.
- X86::CondCode Cond = X86::getCondFromCMov(CMovI) == X86::COND_INVALID
- ? X86::getCondFromCFCMov(CMovI)
- : X86::getCondFromCMov(CMovI);
- unsigned CondReg;
- bool Inverted;
- std::tie(CondReg, Inverted) =
- getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs);
-
- MachineBasicBlock &MBB = *CMovI.getParent();
+static X86::CondCode getImplicitCondFromMI(unsigned Opc) {
+#define FROM_TO(A, B) \
+ case X86::CMOV##A##_Fp32: \
+ case X86::CMOV##A##_Fp64: \
+ case X86::CMOV##A##_Fp80: \
+ return X86::COND_##B;
- // Insert a direct test of the saved register.
- insertTest(MBB, CMovI.getIterator(), CMovI.getDebugLoc(), CondReg);
-
- // Rewrite the CMov to use the !ZF flag from the test, and then kill its use
- // of the flags afterward.
- CMovI.getOperand(CMovI.getDesc().getNumOperands() - 1)
- .setImm(Inverted ? X86::COND_E : X86::COND_NE);
- FlagUse.setIsKill(true);
- LLVM_DEBUG(dbgs() << " fixed cmov: "; CMovI.dump());
+ switch (Opc) {
+ default:
+ return X86::COND_INVALID;
+ FROM_TO(B, B)
+ FROM_TO(E, E)
+ FROM_TO(P, P)
+ FROM_TO(BE, BE)
+ FROM_TO(NB, AE)
+ FROM_TO(NE, NE)
+ FROM_TO(NP, NP)
+ FROM_TO(NBE, A)
+ }
+#undef FROM_TO
}
-void X86FlagsCopyLoweringPass::rewriteFCMov(MachineBasicBlock &TestMBB,
- MachineBasicBlock::iterator TestPos,
- const DebugLoc &TestLoc,
- MachineInstr &CMovI,
- MachineOperand &FlagUse,
- CondRegArray &CondRegs) {
- // First get the register containing this specific condition.
- X86::CondCode Cond = getCondFromFCMOV(CMovI.getOpcode());
- unsigned CondReg;
- bool Inverted;
- std::tie(CondReg, Inverted) =
- getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs);
-
- MachineBasicBlock &MBB = *CMovI.getParent();
-
- // Insert a direct test of the saved register.
- insertTest(MBB, CMovI.getIterator(), CMovI.getDebugLoc(), CondReg);
-
- auto getFCMOVOpcode = [](unsigned Opcode, bool Inverted) {
- switch (Opcode) {
- default: llvm_unreachable("Unexpected opcode!");
- case X86::CMOVBE_Fp32: case X86::CMOVNBE_Fp32:
- case X86::CMOVB_Fp32: case X86::CMOVNB_Fp32:
- case X86::CMOVE_Fp32: case X86::CMOVNE_Fp32:
- case X86::CMOVP_Fp32: case X86::CMOVNP_Fp32:
- return Inverted ? X86::CMOVE_Fp32 : X86::CMOVNE_Fp32;
- case X86::CMOVBE_Fp64: case X86::CMOVNBE_Fp64:
- case X86::CMOVB_Fp64: case X86::CMOVNB_Fp64:
- case X86::CMOVE_Fp64: case X86::CMOVNE_Fp64:
- case X86::CMOVP_Fp64: case X86::CMOVNP_Fp64:
- return Inverted ? X86::CMOVE_Fp64 : X86::CMOVNE_Fp64;
- case X86::CMOVBE_Fp80: case X86::CMOVNBE_Fp80:
- case X86::CMOVB_Fp80: case X86::CMOVNB_Fp80:
- case X86::CMOVE_Fp80: case X86::CMOVNE_Fp80:
- case X86::CMOVP_Fp80: case X86::CMOVNP_Fp80:
- return Inverted ? X86::CMOVE_Fp80 : X86::CMOVNE_Fp80;
- }
- };
-
- // Rewrite the CMov to use the !ZF flag from the test.
- CMovI.setDesc(TII->get(getFCMOVOpcode(CMovI.getOpcode(), Inverted)));
- FlagUse.setIsKill(true);
- LLVM_DEBUG(dbgs() << " fixed fcmov: "; CMovI.dump());
+static unsigned getOpcodeWithCC(unsigned Opc, X86::CondCode CC) {
+ assert((CC == X86::COND_E || CC == X86::COND_NE) && "Unexpected CC");
+#define CASE(A) \
+ case X86::CMOVB_##A: \
+ case X86::CMOVE_##A: \
+ case X86::CMOVP_##A: \
+ case X86::CMOVBE_##A: \
+ case X86::CMOVNB_##A: \
+ case X86::CMOVNE_##A: \
+ case X86::CMOVNP_##A: \
+ case X86::CMOVNBE_##A: \
+ return (CC == X86::COND_E) ? X86::CMOVE_##A : X86::CMOVNE_##A;
+ switch (Opc) {
+ default:
+ llvm_unreachable("Unexpected opcode");
+ CASE(Fp32)
+ CASE(Fp64)
+ CASE(Fp80)
+ }
+#undef CASE
}
-void X86FlagsCopyLoweringPass::rewriteCondJmp(
- MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
- const DebugLoc &TestLoc, MachineInstr &JmpI, CondRegArray &CondRegs) {
+void X86FlagsCopyLoweringPass::rewriteMI(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Pos,
+ const DebugLoc &Loc, MachineInstr &MI,
+ CondRegArray &CondRegs) {
// First get the register containing this specific condition.
- X86::CondCode Cond = X86::getCondFromBranch(JmpI);
+ bool IsImplicitCC = false;
+ X86::CondCode CC = X86::getCondFromMI(MI);
+ if (CC == X86::COND_INVALID) {
+ CC = getImplicitCondFromMI(MI.getOpcode());
+ IsImplicitCC = true;
+ }
+ assert(CC != X86::COND_INVALID && "Unknown EFLAG user!");
unsigned CondReg;
bool Inverted;
std::tie(CondReg, Inverted) =
- getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs);
-
- MachineBasicBlock &JmpMBB = *JmpI.getParent();
+ getCondOrInverseInReg(MBB, Pos, Loc, CC, CondRegs);
// Insert a direct test of the saved register.
- insertTest(JmpMBB, JmpI.getIterator(), JmpI.getDebugLoc(), CondReg);
-
- // Rewrite the jump to use the !ZF flag from the test, and kill its use of
- // flags afterward.
- JmpI.getOperand(1).setImm(Inverted ? X86::COND_E : X86::COND_NE);
- JmpI.findRegisterUseOperand(X86::EFLAGS, /*TRI=*/nullptr)->setIsKill(true);
- LLVM_DEBUG(dbgs() << " fixed jCC: "; JmpI.dump());
-}
-
-void X86FlagsCopyLoweringPass::rewriteCopy(MachineInstr &MI,
- MachineOperand &FlagUse,
- MachineInstr &CopyDefI) {
- // Just replace this copy with the original copy def.
- MRI->replaceRegWith(MI.getOperand(0).getReg(),
- CopyDefI.getOperand(0).getReg());
- MI.eraseFromParent();
-}
-
-void X86FlagsCopyLoweringPass::rewriteSetCC(MachineBasicBlock &TestMBB,
- MachineBasicBlock::iterator TestPos,
- const DebugLoc &TestLoc,
- MachineInstr &SetCCI,
- MachineOperand &FlagUse,
- CondRegArray &CondRegs) {
- X86::CondCode Cond = X86::getCondFromSETCC(SetCCI);
- // Note that we can't usefully rewrite this to the inverse without complex
- // analysis of the users of the setCC. Largely we rely on duplicates which
- // could have been avoided already being avoided here.
- unsigned &CondReg = CondRegs[Cond];
- if (!CondReg)
- CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond);
-
- // Rewriting a register def is trivial: we just replace the register and
- // remove the setcc.
- if (!SetCCI.mayStore()) {
- assert(SetCCI.getOperand(0).isReg() &&
- "Cannot have a non-register defined operand to SETcc!");
- Register OldReg = SetCCI.getOperand(0).getReg();
- // Drop Kill flags on the old register before replacing. CondReg may have
- // a longer live range.
- MRI->clearKillFlags(OldReg);
- MRI->replaceRegWith(OldReg, CondReg);
- SetCCI.eraseFromParent();
- return;
- }
+ insertTest(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), CondReg);
- // Otherwise, we need to emit a store.
- auto MIB = BuildMI(*SetCCI.getParent(), SetCCI.getIterator(),
- SetCCI.getDebugLoc(), TII->get(X86::MOV8mr));
- // Copy the address operands.
- for (int i = 0; i < X86::AddrNumOperands; ++i)
- MIB.add(SetCCI.getOperand(i));
-
- MIB.addReg(CondReg);
-
- MIB.setMemRefs(SetCCI.memoperands());
-
- SetCCI.eraseFromParent();
-}
-
-void X86FlagsCopyLoweringPass::rewriteCCMP(MachineBasicBlock &TestMBB,
- MachineBasicBlock::iterator TestPos,
- const DebugLoc &TestLoc,
- MachineInstr &CCMPI,
- MachineOperand &FlagUse,
- CondRegArray &CondRegs) {
- // First get the register containing this specific condition.
- X86::CondCode Cond = X86::getCondFromCCMP(CCMPI);
- unsigned CondReg;
- bool Inverted;
- std::tie(CondReg, Inverted) =
- getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs);
-
- MachineBasicBlock &MBB = *CCMPI.getParent();
+ // Rewrite the instruction to use the !ZF flag from the test, and then kill
+ // its use of the flags afterward.
+ X86::CondCode NewCC = Inverted ? X86::COND_E : X86::COND_NE;
+ if (IsImplicitCC)
+ MI.setDesc(TII->get(getOpcodeWithCC(MI.getOpcode(), NewCC)));
+ else
+ MI.getOperand(MI.getDesc().getNumOperands() - 1).setImm(NewCC);
- // Insert a direct test of the saved register.
- insertTest(MBB, CCMPI.getIterator(), CCMPI.getDebugLoc(), CondReg);
-
- // Rewrite the CCMP/CTEST to use the !ZF flag from the test, and then kill its
- // use of the flags afterward.
- CCMPI.getOperand(CCMPI.getDesc().getNumOperands() - 1)
- .setImm(Inverted ? X86::COND_E : X86::COND_NE);
- FlagUse.setIsKill(true);
- LLVM_DEBUG(dbgs() << " fixed ccmp/ctest: "; CCMPI.dump());
+ MI.findRegisterUseOperand(X86::EFLAGS, /*TRI=*/nullptr)->setIsKill(true);
+ LLVM_DEBUG(dbgs() << " fixed instruction: "; MI.dump());
}
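
The Addend = 255 trick in the rewritten rewriteArithmetic re-creates CF from a saved condition byte: the byte is 1 exactly when CF was set, and adding 255 to 1 overflows eight bits, so the inserted ADD8ri sets CF again. A minimal stand-alone sketch of that identity (plain C++, not part of the patch):

#include <cassert>
#include <cstdint>

// Given a byte that is 1 when CF was set and 0 otherwise, adding 255
// produces an 8-bit carry exactly when the byte is 1 -- the same effect
// the inserted ADD8ri has on EFLAGS.
static bool carryFromSavedFlag(uint8_t SavedCF) {
  unsigned Sum = unsigned(SavedCF) + 255u;
  return Sum > 0xFFu; // carry out of bit 7
}

int main() {
  assert(carryFromSavedFlag(1));  // CF was set -> carry regenerated
  assert(!carryFromSavedFlag(0)); // CF was clear -> no carry
}
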
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5d08464..ca32cfe 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1108,13 +1108,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
}
- setOperationAction(ISD::ABDU, MVT::v16i8, Custom);
- setOperationAction(ISD::ABDS, MVT::v16i8, Custom);
- setOperationAction(ISD::ABDU, MVT::v8i16, Custom);
- setOperationAction(ISD::ABDS, MVT::v8i16, Custom);
- setOperationAction(ISD::ABDU, MVT::v4i32, Custom);
- setOperationAction(ISD::ABDS, MVT::v4i32, Custom);
-
setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
@@ -1132,9 +1125,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
- setOperationAction(ISD::SETCC, VT, Custom);
- setOperationAction(ISD::CTPOP, VT, Custom);
- setOperationAction(ISD::ABS, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::CTPOP, VT, Custom);
+ setOperationAction(ISD::ABS, VT, Custom);
+ setOperationAction(ISD::ABDS, VT, Custom);
+ setOperationAction(ISD::ABDU, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
@@ -1336,11 +1331,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
- for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
- setOperationAction(ISD::ABDS, VT, Custom);
- setOperationAction(ISD::ABDU, VT, Custom);
- }
-
setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
setOperationAction(ISD::SADDSAT, MVT::v2i64, Custom);
setOperationAction(ISD::SSUBSAT, MVT::v2i64, Custom);
@@ -2032,6 +2022,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
}
+
+ setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
+ setOperationAction(ISD::FABS, MVT::v32f16, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::v32f16, Custom);
}// useAVX512Regs
if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
@@ -2108,9 +2102,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
setOperationAction(ISD::CTPOP, VT, Legal);
}
- setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
- setOperationAction(ISD::FABS, MVT::v32f16, Custom);
- setOperationAction(ISD::FCOPYSIGN, MVT::v32f16, Custom);
}
// This block control legalization of v32i1/v64i1 which are available with
@@ -3292,7 +3283,7 @@ bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
if (VT != MVT::i32 && VT != MVT::i64)
return false;
- return !isa<ConstantSDNode>(Y);
+ return !isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque();
}
bool X86TargetLowering::hasAndNot(SDValue Y) const {
@@ -20130,12 +20121,11 @@ SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
return Res;
}
-static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
+static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
MVT InVT = In.getSimpleValueType();
- SDLoc dl(Op);
unsigned Opc = Op.getOpcode();
assert(VT.isVector() && InVT.isVector() && "Expected vector type");
@@ -20206,14 +20196,13 @@ static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
}
-static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
MVT InVT = In.getSimpleValueType();
assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
- SDLoc DL(Op);
unsigned NumElts = VT.getVectorNumElements();
// For all vectors, but vXi8 we can just emit a sign_extend and a shift. This
@@ -20268,12 +20257,13 @@ static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = Op.getOperand(0);
MVT SVT = In.getSimpleValueType();
+ SDLoc DL(Op);
if (SVT.getVectorElementType() == MVT::i1)
- return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
+ return LowerZERO_EXTEND_Mask(Op, DL, Subtarget, DAG);
assert(Subtarget.hasAVX() && "Expected AVX support");
- return LowerAVXExtend(Op, DAG, Subtarget);
+ return LowerAVXExtend(Op, DL, DAG, Subtarget);
}
/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
@@ -24320,7 +24310,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags());
}
-static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
+static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op->getSimpleValueType(0);
@@ -24328,8 +24318,6 @@ static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
MVT InVT = In.getSimpleValueType();
assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
MVT VTElt = VT.getVectorElementType();
- SDLoc dl(Op);
-
unsigned NumElts = VT.getVectorNumElements();
// Extend VT if the scalar type is i8/i16 and BWI is not supported.
@@ -24381,12 +24369,13 @@ static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = Op->getOperand(0);
MVT InVT = In.getSimpleValueType();
+ SDLoc DL(Op);
if (InVT.getVectorElementType() == MVT::i1)
- return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
+ return LowerSIGN_EXTEND_Mask(Op, DL, Subtarget, DAG);
assert(Subtarget.hasAVX() && "Expected AVX support");
- return LowerAVXExtend(Op, DAG, Subtarget);
+ return LowerAVXExtend(Op, DL, DAG, Subtarget);
}
// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
@@ -24524,7 +24513,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SDLoc dl(Op);
if (InVT.getVectorElementType() == MVT::i1)
- return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
+ return LowerSIGN_EXTEND_Mask(Op, dl, Subtarget, DAG);
assert(VT.isVector() && InVT.isVector() && "Expected vector type");
assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
@@ -28421,18 +28410,6 @@ static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
}
}
- // TODO: Move to TargetLowering expandABD().
- if (!Subtarget.hasSSE41() &&
- ((IsSigned && VT == MVT::v16i8) || VT == MVT::v4i32)) {
- SDValue LHS = DAG.getFreeze(Op.getOperand(0));
- SDValue RHS = DAG.getFreeze(Op.getOperand(1));
- ISD::CondCode CC = IsSigned ? ISD::CondCode::SETGT : ISD::CondCode::SETUGT;
- SDValue Cmp = DAG.getSetCC(dl, VT, LHS, RHS, CC);
- SDValue Diff0 = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
- SDValue Diff1 = DAG.getNode(ISD::SUB, dl, VT, RHS, LHS);
- return getBitSelect(dl, VT, Diff0, Diff1, Cmp, DAG);
- }
-
// Default to expand.
return SDValue();
}
@@ -33849,18 +33826,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(ADDSUB)
NODE_NAME_CASE(RCP14)
NODE_NAME_CASE(RCP14S)
- NODE_NAME_CASE(RCP28)
- NODE_NAME_CASE(RCP28_SAE)
- NODE_NAME_CASE(RCP28S)
- NODE_NAME_CASE(RCP28S_SAE)
- NODE_NAME_CASE(EXP2)
- NODE_NAME_CASE(EXP2_SAE)
NODE_NAME_CASE(RSQRT14)
NODE_NAME_CASE(RSQRT14S)
- NODE_NAME_CASE(RSQRT28)
- NODE_NAME_CASE(RSQRT28_SAE)
- NODE_NAME_CASE(RSQRT28S)
- NODE_NAME_CASE(RSQRT28S_SAE)
NODE_NAME_CASE(FADD_RND)
NODE_NAME_CASE(FADDS)
NODE_NAME_CASE(FADDS_RND)
@@ -42963,7 +42930,6 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
bool PoisonOnly, unsigned Depth) const {
unsigned NumElts = DemandedElts.getBitWidth();
- // TODO: Add more target shuffles.
switch (Op.getOpcode()) {
case X86ISD::PSHUFD:
case X86ISD::VPERMILPI: {
@@ -42999,8 +42965,12 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
- // TODO: Add more target shuffles.
switch (Op.getOpcode()) {
+ // SSE vector shifts handle out of bounds shift amounts.
+ case X86ISD::VSHLI:
+ case X86ISD::VSRLI:
+ case X86ISD::VSRAI:
+ return false;
case X86ISD::PSHUFD:
case X86ISD::VPERMILPI:
case X86ISD::UNPCKH:
@@ -43443,7 +43413,11 @@ static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
// the chain.
static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+ const X86Subtarget &Subtarget,
+ unsigned Depth = 0) {
+ if (Depth >= SelectionDAG::MaxRecursionDepth)
+ return SDValue(); // Limit search depth.
+
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned Opc = V.getOpcode();
switch (Opc) {
@@ -43455,14 +43429,22 @@ static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
return DAG.getBitcast(VT, Src);
break;
}
+ case ISD::Constant: {
+ auto *C = cast<ConstantSDNode>(V);
+ if (C->isZero())
+ return DAG.getConstant(0, DL, VT);
+ if (C->isAllOnes())
+ return DAG.getAllOnesConstant(DL, VT);
+ break;
+ }
case ISD::TRUNCATE: {
// If we find a suitable source, a truncated scalar becomes a subvector.
SDValue Src = V.getOperand(0);
EVT NewSrcVT =
EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
if (TLI.isTypeLegal(NewSrcVT))
- if (SDValue N0 =
- combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
+ if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
+ Subtarget, Depth + 1))
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
DAG.getIntPtrConstant(0, DL));
break;
@@ -43474,20 +43456,22 @@ static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
Src.getScalarValueSizeInBits());
if (TLI.isTypeLegal(NewSrcVT))
- if (SDValue N0 =
- combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
+ if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
+ Subtarget, Depth + 1))
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
: DAG.getConstant(0, DL, VT),
N0, DAG.getIntPtrConstant(0, DL));
break;
}
- case ISD::OR: {
- // If we find suitable sources, we can just move an OR to the vector domain.
- SDValue Src0 = V.getOperand(0);
- SDValue Src1 = V.getOperand(1);
- if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
- if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
+ case ISD::OR:
+ case ISD::XOR: {
+ // If we find suitable sources, we can just move the op to the vector
+ // domain.
+ if (SDValue N0 = combineBitcastToBoolVector(VT, V.getOperand(0), DL, DAG,
+ Subtarget, Depth + 1))
+ if (SDValue N1 = combineBitcastToBoolVector(VT, V.getOperand(1), DL, DAG,
+ Subtarget, Depth + 1))
return DAG.getNode(Opc, DL, VT, N0, N1);
break;
}
@@ -43499,13 +43483,20 @@ static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
break;
if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
- if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
+ if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget,
+ Depth + 1))
return DAG.getNode(
X86ISD::KSHIFTL, DL, VT, N0,
DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
break;
}
}
+
+ // Does the inner bitcast already exist?
+ if (Depth > 0)
+ if (SDNode *Alt = DAG.getNodeIfExists(ISD::BITCAST, DAG.getVTList(VT), {V}))
+ return SDValue(Alt, 0);
+
return SDValue();
}
@@ -43694,14 +43685,14 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
return combinevXi1ConstantToInteger(N0, DAG);
}
- if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
- VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
- isa<ConstantSDNode>(N0)) {
- auto *C = cast<ConstantSDNode>(N0);
- if (C->isAllOnes())
- return DAG.getConstant(1, SDLoc(N0), VT);
- if (C->isZero())
- return DAG.getConstant(0, SDLoc(N0), VT);
+ if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() && VT.isVector() &&
+ VT.getVectorElementType() == MVT::i1) {
+ if (auto *C = dyn_cast<ConstantSDNode>(N0)) {
+ if (C->isAllOnes())
+ return DAG.getConstant(1, SDLoc(N0), VT);
+ if (C->isZero())
+ return DAG.getConstant(0, SDLoc(N0), VT);
+ }
}
// Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
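
The block deleted from LowerABD above expanded absolute-difference as a bit-select between the two possible subtractions, keyed on a greater-than compare; its TODO said this belonged in the generic TargetLowering expandABD() path, so presumably that path covers it now. A scalar sketch of the identity the removed vector code used (illustrative only, element width chosen arbitrarily):

#include <cassert>
#include <cstdint>

// abd(a, b) == (a > b) ? a - b : b - a, computed branchlessly the way the
// vector expansion did: select between the two differences with a mask.
static uint32_t abdu32(uint32_t A, uint32_t B) {
  uint32_t Mask = A > B ? 0xFFFFFFFFu : 0u; // the vector code used a SETCC mask
  uint32_t Diff0 = A - B;                   // correct when A > B
  uint32_t Diff1 = B - A;                   // correct when A <= B
  return (Diff0 & Mask) | (Diff1 & ~Mask);
}

int main() {
  assert(abdu32(7, 3) == 4);
  assert(abdu32(3, 7) == 4);
  assert(abdu32(5, 5) == 0);
}
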
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index ade54f7..14b9eb7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -699,18 +699,6 @@ namespace llvm {
// Test if in transactional execution.
XTEST,
- // ERI instructions.
- RSQRT28,
- RSQRT28_SAE,
- RSQRT28S,
- RSQRT28S_SAE,
- RCP28,
- RCP28_SAE,
- RCP28S,
- RCP28S_SAE,
- EXP2,
- EXP2_SAE,
-
// Conversions between float and half-float.
CVTPS2PH,
CVTPS2PH_SAE,
diff --git a/llvm/lib/Target/X86/X86Instr3DNow.td b/llvm/lib/Target/X86/X86Instr3DNow.td
index 3be03ab..03612de 100644
--- a/llvm/lib/Target/X86/X86Instr3DNow.td
+++ b/llvm/lib/Target/X86/X86Instr3DNow.td
@@ -90,8 +90,7 @@ def PREFETCHW : I<0x0D, MRM1m, (outs), (ins i8mem:$addr), "prefetchw\t$addr",
TB, Requires<[HasPrefetchW]>;
def PREFETCHWT1 : I<0x0D, MRM2m, (outs), (ins i8mem:$addr), "prefetchwt1\t$addr",
- [(prefetch addr:$addr, (i32 1), (i32 PrefetchWT1Level), (i32 1))]>,
- TB, Requires<[HasPREFETCHWT1]>;
+ []>, TB;
}
// "3DNowA" instructions
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 0723328..da690ae 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -9265,6 +9265,37 @@ multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
}
}
+multiclass avx512_fp28_s_ass<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86FoldableSchedWrite sched> {
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR], hasSideEffects = 0 in {
+ defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (null_frag)>, Sched<[sched]>, SIMD_EXC;
+ defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "{sae}, $src2, $src1", "$src1, $src2, {sae}",
+ (null_frag)>, EVEX_B, Sched<[sched]>;
+ let mayLoad = 1 in
+ defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (null_frag)>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
+ }
+}
+
+multiclass avx512_eri_s_ass<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched> {
+ defm SSZ : avx512_fp28_s_ass<opc, OpcodeStr#"ss", f32x_info, sched>,
+ EVEX_CD8<32, CD8VT1>, VEX_LIG, T8, PD, EVEX, VVVV;
+ defm SDZ : avx512_fp28_s_ass<opc, OpcodeStr#"sd", f64x_info, sched>,
+ EVEX_CD8<64, CD8VT1>, VEX_LIG, REX_W, T8, PD, EVEX, VVVV;
+}
+
+defm VRCP28 : avx512_eri_s_ass<0xCB, "vrcp28", SchedWriteFRcp.Scl>;
+defm VRSQRT28 : avx512_eri_s_ass<0xCD, "vrsqrt28", SchedWriteFRsqrt.Scl>;
+
multiclass avx512_eri_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDNode OpNodeSAE, X86FoldableSchedWrite sched> {
defm SSZ : avx512_fp28_s<opc, OpcodeStr#"ss", f32x_info, OpNode, OpNodeSAE,
@@ -9280,13 +9311,6 @@ multiclass avx512_vgetexpsh<bits<8> opc, string OpcodeStr, SDNode OpNode,
EVEX_CD8<16, CD8VT1>, T_MAP6, PD, EVEX, VVVV;
}
-let Predicates = [HasERI] in {
- defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s, X86rcp28SAEs,
- SchedWriteFRcp.Scl>;
- defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s, X86rsqrt28SAEs,
- SchedWriteFRsqrt.Scl>;
-}
-
defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexps, X86fgetexpSAEs,
SchedWriteFRnd.Scl>,
avx512_vgetexpsh<0x43, "vgetexp", X86fgetexps, X86fgetexpSAEs,
@@ -9325,6 +9349,49 @@ multiclass avx512_fp28_p_sae<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
EVEX_B, Sched<[sched]>;
}
+multiclass avx512_fp28_p_ass<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86FoldableSchedWrite sched> {
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1,
+ hasSideEffects = 0 in {
+ defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src), OpcodeStr, "$src", "$src",
+ (null_frag)>, Sched<[sched]>;
+ let mayLoad = 1 in
+ defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.MemOp:$src), OpcodeStr, "$src", "$src",
+ (null_frag)>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ let mayLoad = 1 in
+ defm mb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.ScalarMemOp:$src), OpcodeStr,
+ "${src}"#_.BroadcastStr, "${src}"#_.BroadcastStr,
+ (null_frag)>,
+ EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+}
+multiclass avx512_fp28_p_sae_ass<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86FoldableSchedWrite sched> {
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR], hasSideEffects = 0 in
+ defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src), OpcodeStr,
+ "{sae}, $src", "$src, {sae}",
+ (null_frag)>, Sched<[sched]>, EVEX_B;
+}
+
+multiclass avx512_eri_ass<bits<8> opc, string OpcodeStr,
+ X86SchedWriteWidths sched> {
+ defm PSZ : avx512_fp28_p_ass<opc, OpcodeStr#"ps", v16f32_info, sched.ZMM>,
+ avx512_fp28_p_sae_ass<opc, OpcodeStr#"ps", v16f32_info, sched.ZMM>,
+ T8, PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_fp28_p_ass<opc, OpcodeStr#"pd", v8f64_info, sched.ZMM>,
+ avx512_fp28_p_sae_ass<opc, OpcodeStr#"pd", v8f64_info, sched.ZMM>,
+ T8, PD, EVEX_V512, REX_W, EVEX_CD8<64, CD8VF>;
+}
+
+defm VRSQRT28 : avx512_eri_ass<0xCC, "vrsqrt28", SchedWriteFRsqrt>, EVEX;
+defm VRCP28 : avx512_eri_ass<0xCA, "vrcp28", SchedWriteFRcp>, EVEX;
+defm VEXP2 : avx512_eri_ass<0xC8, "vexp2", SchedWriteFAdd>, EVEX;
+
multiclass avx512_eri<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDNode OpNodeSAE, X86SchedWriteWidths sched> {
defm PSZ : avx512_fp28_p<opc, OpcodeStr#"ps", v16f32_info, OpNode, sched.ZMM>,
@@ -9367,14 +9434,6 @@ multiclass avx512_vgetexp_fp16<bits<8> opc, string OpcodeStr, SDNode OpNode,
EVEX_V256, T_MAP6, PD, EVEX_CD8<16, CD8VF>;
}
}
-let Predicates = [HasERI] in {
- defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28, X86rsqrt28SAE,
- SchedWriteFRsqrt>, EVEX;
- defm VRCP28 : avx512_eri<0xCA, "vrcp28", X86rcp28, X86rcp28SAE,
- SchedWriteFRcp>, EVEX;
- defm VEXP2 : avx512_eri<0xC8, "vexp2", X86exp2, X86exp2SAE,
- SchedWriteFAdd>, EVEX;
-}
defm VGETEXP : avx512_eri<0x42, "vgetexp", X86fgetexp, X86fgetexpSAE,
SchedWriteFRnd>,
avx512_vgetexp_fp16<0x42, "vgetexp", X86fgetexp, X86fgetexpSAE,
@@ -10308,7 +10367,7 @@ defm VPSCATTER : avx512_scatter_q_pd<0xA0, 0xA1, avx512vl_i64_info, "vpscatter",
// prefetch
multiclass avx512_gather_scatter_prefetch<bits<8> opc, Format F, string OpcodeStr,
RegisterClass KRC, X86MemOperand memop> {
- let Predicates = [HasPFI], mayLoad = 1, mayStore = 1 in
+ let mayLoad = 1, mayStore = 1 in
def m : AVX5128I<opc, F, (outs), (ins KRC:$mask, memop:$src),
!strconcat(OpcodeStr, "\t{$src {${mask}}|{${mask}}, $src}"), []>,
EVEX, EVEX_K, Sched<[WriteLoad]>;
diff --git a/llvm/lib/Target/X86/X86InstrFragments.td b/llvm/lib/Target/X86/X86InstrFragments.td
index f14c720..142e186 100644
--- a/llvm/lib/Target/X86/X86InstrFragments.td
+++ b/llvm/lib/Target/X86/X86InstrFragments.td
@@ -607,14 +607,8 @@ def X86any_fcmp : PatFrags<(ops node:$lhs, node:$rhs),
[(X86strict_fcmp node:$lhs, node:$rhs),
(X86fcmp node:$lhs, node:$rhs)]>;
-// PREFETCHWT1 is supported we want to use it for everything but T0.
def PrefetchWLevel : PatFrag<(ops), (i32 timm), [{
- return N->getSExtValue() == 3 || !Subtarget->hasPREFETCHWT1();
-}]>;
-
-// Use PREFETCHWT1 for NTA, T2, T1.
-def PrefetchWT1Level : TImmLeaf<i32, [{
- return Imm < 3;
+ return N->getSExtValue() <= 3;
}]>;
def X86lock_add_nocf : PatFrag<(ops node:$lhs, node:$rhs),
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index f86e15b..dff33a4 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -600,19 +600,8 @@ def X86Vpdpbusds : SDNode<"X86ISD::VPDPBUSDS", SDTVnni>;
def X86Vpdpwssd : SDNode<"X86ISD::VPDPWSSD", SDTVnni>;
def X86Vpdpwssds : SDNode<"X86ISD::VPDPWSSDS", SDTVnni>;
-def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", SDTFPUnaryOp>;
-def X86rsqrt28SAE: SDNode<"X86ISD::RSQRT28_SAE", SDTFPUnaryOp>;
-def X86rcp28 : SDNode<"X86ISD::RCP28", SDTFPUnaryOp>;
-def X86rcp28SAE : SDNode<"X86ISD::RCP28_SAE", SDTFPUnaryOp>;
-def X86exp2 : SDNode<"X86ISD::EXP2", SDTFPUnaryOp>;
-def X86exp2SAE : SDNode<"X86ISD::EXP2_SAE", SDTFPUnaryOp>;
-
def X86rsqrt14s : SDNode<"X86ISD::RSQRT14S", SDTFPBinOp>;
def X86rcp14s : SDNode<"X86ISD::RCP14S", SDTFPBinOp>;
-def X86rsqrt28s : SDNode<"X86ISD::RSQRT28S", SDTFPBinOp>;
-def X86rsqrt28SAEs : SDNode<"X86ISD::RSQRT28S_SAE", SDTFPBinOp>;
-def X86rcp28s : SDNode<"X86ISD::RCP28S", SDTFPBinOp>;
-def X86rcp28SAEs : SDNode<"X86ISD::RCP28S_SAE", SDTFPBinOp>;
def X86Ranges : SDNode<"X86ISD::VRANGES", SDTFPBinOpImm>;
def X86RndScales : SDNode<"X86ISD::VRNDSCALES", SDTFPBinOpImm>;
def X86Reduces : SDNode<"X86ISD::VREDUCES", SDTFPBinOpImm>;
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index 9f2709d..419ff9e 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -79,8 +79,6 @@ def UseAVX2 : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">;
def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">;
def HasCDI : Predicate<"Subtarget->hasCDI()">;
def HasVPOPCNTDQ : Predicate<"Subtarget->hasVPOPCNTDQ()">;
-def HasPFI : Predicate<"Subtarget->hasPFI()">;
-def HasERI : Predicate<"Subtarget->hasERI()">;
def HasDQI : Predicate<"Subtarget->hasDQI()">;
def NoDQI : Predicate<"!Subtarget->hasDQI()">;
def HasBWI : Predicate<"Subtarget->hasBWI()">;
@@ -147,7 +145,6 @@ def NoSSEPrefetch : Predicate<"!Subtarget->hasSSEPrefetch()">;
def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">;
def HasPREFETCHI : Predicate<"Subtarget->hasPREFETCHI()">;
def HasPrefetchW : Predicate<"Subtarget->hasPrefetchW()">;
-def HasPREFETCHWT1 : Predicate<"Subtarget->hasPREFETCHWT1()">;
def HasLAHFSAHF : Predicate<"Subtarget->hasLAHFSAHF()">;
def HasLAHFSAHF64 : Predicate<"Subtarget->hasLAHFSAHF64()">;
def HasMWAITX : Predicate<"Subtarget->hasMWAITX()">;
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 3bb2f07..e3961e0 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -108,15 +108,6 @@ static const IntrinsicData IntrinsicsWithChain[] = {
X86_INTRINSIC_DATA(avx512_gather3siv8_sf, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_gather3siv8_si, GATHER, 0, 0),
- X86_INTRINSIC_DATA(avx512_gatherpf_dpd_512, PREFETCH,
- X86::VGATHERPF0DPDm, X86::VGATHERPF1DPDm),
- X86_INTRINSIC_DATA(avx512_gatherpf_dps_512, PREFETCH,
- X86::VGATHERPF0DPSm, X86::VGATHERPF1DPSm),
- X86_INTRINSIC_DATA(avx512_gatherpf_qpd_512, PREFETCH,
- X86::VGATHERPF0QPDm, X86::VGATHERPF1QPDm),
- X86_INTRINSIC_DATA(avx512_gatherpf_qps_512, PREFETCH,
- X86::VGATHERPF0QPSm, X86::VGATHERPF1QPSm),
-
X86_INTRINSIC_DATA(avx512_mask_gather_dpd_512, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_mask_gather_dpi_512, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_mask_gather_dpq_512, GATHER, 0, 0),
@@ -292,14 +283,6 @@ static const IntrinsicData IntrinsicsWithChain[] = {
X86_INTRINSIC_DATA(avx512_scatterdiv4_si, SCATTER, 0, 0),
X86_INTRINSIC_DATA(avx512_scatterdiv8_sf, SCATTER, 0, 0),
X86_INTRINSIC_DATA(avx512_scatterdiv8_si, SCATTER, 0, 0),
- X86_INTRINSIC_DATA(avx512_scatterpf_dpd_512, PREFETCH, X86::VSCATTERPF0DPDm,
- X86::VSCATTERPF1DPDm),
- X86_INTRINSIC_DATA(avx512_scatterpf_dps_512, PREFETCH, X86::VSCATTERPF0DPSm,
- X86::VSCATTERPF1DPSm),
- X86_INTRINSIC_DATA(avx512_scatterpf_qpd_512, PREFETCH, X86::VSCATTERPF0QPDm,
- X86::VSCATTERPF1QPDm),
- X86_INTRINSIC_DATA(avx512_scatterpf_qps_512, PREFETCH, X86::VSCATTERPF0QPSm,
- X86::VSCATTERPF1QPSm),
X86_INTRINSIC_DATA(avx512_scattersiv2_df, SCATTER, 0, 0),
X86_INTRINSIC_DATA(avx512_scattersiv2_di, SCATTER, 0, 0),
X86_INTRINSIC_DATA(avx512_scattersiv4_df, SCATTER, 0, 0),
@@ -454,8 +437,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_dbpsadbw_512, INTR_TYPE_3OP_IMM8, X86ISD::DBPSADBW, 0),
X86_INTRINSIC_DATA(avx512_div_pd_512, INTR_TYPE_2OP, ISD::FDIV, X86ISD::FDIV_RND),
X86_INTRINSIC_DATA(avx512_div_ps_512, INTR_TYPE_2OP, ISD::FDIV, X86ISD::FDIV_RND),
- X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_SAE, X86ISD::EXP2, X86ISD::EXP2_SAE),
- X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_SAE, X86ISD::EXP2, X86ISD::EXP2_SAE),
X86_INTRINSIC_DATA(avx512_fpclass_pd_128, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0),
X86_INTRINSIC_DATA(avx512_fpclass_pd_256, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0),
X86_INTRINSIC_DATA(avx512_fpclass_pd_512, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0),
@@ -908,10 +889,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0),
X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0),
- X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_SAE, X86ISD::RCP28, X86ISD::RCP28_SAE),
- X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_SAE, X86ISD::RCP28, X86ISD::RCP28_SAE),
- X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_SAE, X86ISD::RCP28S, X86ISD::RCP28S_SAE),
- X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_SAE, X86ISD::RCP28S, X86ISD::RCP28S_SAE),
X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
@@ -920,10 +897,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0),
X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_SAE,X86ISD::RSQRT28, X86ISD::RSQRT28_SAE),
- X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_SAE,X86ISD::RSQRT28, X86ISD::RSQRT28_SAE),
- X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_SAE,X86ISD::RSQRT28S, X86ISD::RSQRT28S_SAE),
- X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_SAE,X86ISD::RSQRT28S, X86ISD::RSQRT28S_SAE),
X86_INTRINSIC_DATA(avx512_sitofp_round, INTR_TYPE_1OP, ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND),
X86_INTRINSIC_DATA(avx512_sqrt_pd_512, INTR_TYPE_1OP, ISD::FSQRT, X86ISD::FSQRT_RND),
X86_INTRINSIC_DATA(avx512_sqrt_ps_512, INTR_TYPE_1OP, ISD::FSQRT, X86ISD::FSQRT_RND),
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index 4d55a08..4532db1 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -213,17 +213,15 @@ public:
bool hasAnyFMA() const { return hasFMA() || hasFMA4(); }
bool hasPrefetchW() const {
// The PREFETCHW instruction was added with 3DNow but later CPUs gave it
- // its own CPUID bit as part of deprecating 3DNow. Intel eventually added
- // it and KNL has another that prefetches to L2 cache. We assume the
+ // its own CPUID bit as part of deprecating 3DNow. We assume the
// L1 version exists if the L2 version does.
- return hasThreeDNow() || hasPRFCHW() || hasPREFETCHWT1();
+ return hasThreeDNow() || hasPRFCHW();
}
bool hasSSEPrefetch() const {
// We implicitly enable these when we have a write prefix supporting cache
// level OR if we have prfchw, but don't already have a read prefetch from
// 3dnow.
- return hasSSE1() || (hasPRFCHW() && !hasThreeDNow()) || hasPREFETCHWT1() ||
- hasPREFETCHI();
+ return hasSSE1() || (hasPRFCHW() && !hasThreeDNow()) || hasPREFETCHI();
}
bool canUseLAHFSAHF() const { return hasLAHFSAHF64() || !is64Bit(); }
// These are generic getters that OR together all of the thunk types
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index c5156c6..68155ac 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -1005,8 +1005,6 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
CPU = "cascadelake";
} else if (testFeature(X86::FEATURE_AVX512VL)) {
CPU = "skylake-avx512";
- } else if (testFeature(X86::FEATURE_AVX512ER)) {
- CPU = "knl";
} else if (testFeature(X86::FEATURE_CLFLUSHOPT)) {
if (testFeature(X86::FEATURE_SHA))
CPU = "goldmont";
@@ -1300,10 +1298,6 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,
setFeature(X86::FEATURE_AVX512IFMA);
if (HasLeaf7 && ((EBX >> 23) & 1))
setFeature(X86::FEATURE_CLFLUSHOPT);
- if (HasLeaf7 && ((EBX >> 26) & 1) && HasAVX512Save)
- setFeature(X86::FEATURE_AVX512PF);
- if (HasLeaf7 && ((EBX >> 27) & 1) && HasAVX512Save)
- setFeature(X86::FEATURE_AVX512ER);
if (HasLeaf7 && ((EBX >> 28) & 1) && HasAVX512Save)
setFeature(X86::FEATURE_AVX512CD);
if (HasLeaf7 && ((EBX >> 29) & 1))
@@ -1810,14 +1804,11 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
Features["avx512ifma"] = HasLeaf7 && ((EBX >> 21) & 1) && HasAVX512Save;
Features["clflushopt"] = HasLeaf7 && ((EBX >> 23) & 1);
Features["clwb"] = HasLeaf7 && ((EBX >> 24) & 1);
- Features["avx512pf"] = HasLeaf7 && ((EBX >> 26) & 1) && HasAVX512Save;
- Features["avx512er"] = HasLeaf7 && ((EBX >> 27) & 1) && HasAVX512Save;
Features["avx512cd"] = HasLeaf7 && ((EBX >> 28) & 1) && HasAVX512Save;
Features["sha"] = HasLeaf7 && ((EBX >> 29) & 1);
Features["avx512bw"] = HasLeaf7 && ((EBX >> 30) & 1) && HasAVX512Save;
Features["avx512vl"] = HasLeaf7 && ((EBX >> 31) & 1) && HasAVX512Save;
- Features["prefetchwt1"] = HasLeaf7 && ((ECX >> 0) & 1);
Features["avx512vbmi"] = HasLeaf7 && ((ECX >> 1) & 1) && HasAVX512Save;
Features["pku"] = HasLeaf7 && ((ECX >> 4) & 1);
Features["waitpkg"] = HasLeaf7 && ((ECX >> 5) & 1);
diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp
index 827bc5b..01d0c71 100644
--- a/llvm/lib/TargetParser/RISCVISAInfo.cpp
+++ b/llvm/lib/TargetParser/RISCVISAInfo.cpp
@@ -880,7 +880,7 @@ void RISCVISAInfo::updateImplication() {
// implied
if (!HasE && !HasI) {
auto Version = findDefaultVersion("i");
- addExtension("i", Version.value());
+ addExtension("i", *Version);
}
if (HasE && HasI)
@@ -906,7 +906,7 @@ void RISCVISAInfo::updateImplication() {
if (Exts.count(ImpliedExt))
return;
auto Version = findDefaultVersion(ImpliedExt);
- addExtension(ImpliedExt, Version.value());
+ addExtension(ImpliedExt, *Version);
WorkList.insert(ImpliedExt);
});
}
@@ -915,7 +915,7 @@ void RISCVISAInfo::updateImplication() {
if (XLen == 32 && Exts.count("zce") && Exts.count("f") &&
!Exts.count("zcf")) {
auto Version = findDefaultVersion("zcf");
- addExtension("zcf", Version.value());
+ addExtension("zcf", *Version);
}
}
@@ -942,7 +942,7 @@ void RISCVISAInfo::updateCombination() {
});
if (HasAllRequiredFeatures) {
auto Version = findDefaultVersion(CombineExt);
- addExtension(CombineExt, Version.value());
+ addExtension(CombineExt, *Version);
MadeChange = true;
}
}
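
The RISCVISAInfo changes replace Version.value() with *Version. Both read the contained value; .value() performs a has-value check and throws std::bad_optional_access when empty, while operator* is unchecked and relies on the caller already knowing the optional is engaged, as findDefaultVersion is expected to be at these call sites. A small illustration, independent of the LLVM code:

#include <cassert>
#include <optional>
#include <string>

// Hypothetical lookup standing in for findDefaultVersion.
static std::optional<unsigned> findVersion(const std::string &Ext) {
  if (Ext == "i")
    return 2u;
  return std::nullopt;
}

int main() {
  auto V = findVersion("i");
  assert(V.has_value());
  unsigned A = *V;        // unchecked access; fine because presence is known
  unsigned B = V.value(); // checked access; would throw if V were empty
  assert(A == B);
}
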
diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp
index efe392b..e380238 100644
--- a/llvm/lib/TargetParser/X86TargetParser.cpp
+++ b/llvm/lib/TargetParser/X86TargetParser.cpp
@@ -95,9 +95,9 @@ constexpr FeatureBitset FeaturesBroadwell =
// Intel Knights Landing and Knights Mill
// Knights Landing has feature parity with Broadwell.
-constexpr FeatureBitset FeaturesKNL =
- FeaturesBroadwell | FeatureAES | FeatureAVX512F | FeatureEVEX512 |
- FeatureAVX512CD | FeatureAVX512ER | FeatureAVX512PF | FeaturePREFETCHWT1;
+constexpr FeatureBitset FeaturesKNL = FeaturesBroadwell | FeatureAES |
+ FeatureAVX512F | FeatureEVEX512 |
+ FeatureAVX512CD;
constexpr FeatureBitset FeaturesKNM = FeaturesKNL | FeatureAVX512VPOPCNTDQ;
// Intel Skylake processors.
@@ -500,7 +500,6 @@ constexpr FeatureBitset ImpliedFeaturesMOVDIRI = {};
constexpr FeatureBitset ImpliedFeaturesPCONFIG = {};
constexpr FeatureBitset ImpliedFeaturesPOPCNT = {};
constexpr FeatureBitset ImpliedFeaturesPKU = {};
-constexpr FeatureBitset ImpliedFeaturesPREFETCHWT1 = {};
constexpr FeatureBitset ImpliedFeaturesPRFCHW = {};
constexpr FeatureBitset ImpliedFeaturesPTWRITE = {};
constexpr FeatureBitset ImpliedFeaturesRDPID = {};
@@ -569,8 +568,6 @@ constexpr FeatureBitset ImpliedFeaturesSM4 = FeatureAVX2;
constexpr FeatureBitset ImpliedFeaturesAVX512CD = FeatureAVX512F;
constexpr FeatureBitset ImpliedFeaturesAVX512BW = FeatureAVX512F;
constexpr FeatureBitset ImpliedFeaturesAVX512DQ = FeatureAVX512F;
-constexpr FeatureBitset ImpliedFeaturesAVX512ER = FeatureAVX512F;
-constexpr FeatureBitset ImpliedFeaturesAVX512PF = FeatureAVX512F;
constexpr FeatureBitset ImpliedFeaturesAVX512VL = FeatureAVX512F;
constexpr FeatureBitset ImpliedFeaturesAVX512BF16 = FeatureAVX512BW;
@@ -751,13 +748,13 @@ unsigned llvm::X86::getFeaturePriority(ProcessorFeatures Feat) {
#ifndef NDEBUG
// Check that priorities are set properly in the .def file. We expect that
// "compat" features are assigned non-duplicate consecutive priorities
- // starting from one (1, ..., 37) and multiple zeros.
+ // starting from one (1, ..., 35) and multiple zeros.
#define X86_FEATURE_COMPAT(ENUM, STR, PRIORITY) PRIORITY,
unsigned Priorities[] = {
#include "llvm/TargetParser/X86TargetParser.def"
};
std::array<unsigned, std::size(Priorities)> HelperList;
- const size_t MaxPriority = 37;
+ const size_t MaxPriority = 35;
std::iota(HelperList.begin(), HelperList.begin() + MaxPriority + 1, 0);
for (size_t i = MaxPriority + 1; i != std::size(Priorities); ++i)
HelperList[i] = 0;
diff --git a/llvm/lib/Transforms/Coroutines/CoroElide.cpp b/llvm/lib/Transforms/Coroutines/CoroElide.cpp
index bb24448..74b5ccb 100644
--- a/llvm/lib/Transforms/Coroutines/CoroElide.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroElide.cpp
@@ -464,13 +464,9 @@ bool CoroIdElider::attemptElide() {
return true;
}
-static bool declaresCoroElideIntrinsics(Module &M) {
- return coro::declaresIntrinsics(M, {"llvm.coro.id", "llvm.coro.id.async"});
-}
-
PreservedAnalyses CoroElidePass::run(Function &F, FunctionAnalysisManager &AM) {
auto &M = *F.getParent();
- if (!declaresCoroElideIntrinsics(M))
+ if (!coro::declaresIntrinsics(M, {"llvm.coro.id"}))
return PreservedAnalyses::all();
FunctionElideInfo FEI{&F};
diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index 08a4522..38b8dab 100644
--- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -19,6 +19,7 @@
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/PtrUseVisitor.h"
#include "llvm/Analysis/StackLifetime.h"
#include "llvm/Config/llvm-config.h"
@@ -1440,17 +1441,22 @@ namespace {
struct AllocaUseVisitor : PtrUseVisitor<AllocaUseVisitor> {
using Base = PtrUseVisitor<AllocaUseVisitor>;
AllocaUseVisitor(const DataLayout &DL, const DominatorTree &DT,
- const CoroBeginInst &CB, const SuspendCrossingInfo &Checker,
+ const coro::Shape &CoroShape,
+ const SuspendCrossingInfo &Checker,
bool ShouldUseLifetimeStartInfo)
- : PtrUseVisitor(DL), DT(DT), CoroBegin(CB), Checker(Checker),
- ShouldUseLifetimeStartInfo(ShouldUseLifetimeStartInfo) {}
+ : PtrUseVisitor(DL), DT(DT), CoroShape(CoroShape), Checker(Checker),
+ ShouldUseLifetimeStartInfo(ShouldUseLifetimeStartInfo) {
+ for (AnyCoroSuspendInst *SuspendInst : CoroShape.CoroSuspends)
+ CoroSuspendBBs.insert(SuspendInst->getParent());
+ }
void visit(Instruction &I) {
Users.insert(&I);
Base::visit(I);
// If the pointer is escaped prior to CoroBegin, we have to assume it would
// be written into before CoroBegin as well.
- if (PI.isEscaped() && !DT.dominates(&CoroBegin, PI.getEscapingInst())) {
+ if (PI.isEscaped() &&
+ !DT.dominates(CoroShape.CoroBegin, PI.getEscapingInst())) {
MayWriteBeforeCoroBegin = true;
}
}
@@ -1553,10 +1559,19 @@ struct AllocaUseVisitor : PtrUseVisitor<AllocaUseVisitor> {
// When we found the lifetime markers refers to a
// subrange of the original alloca, ignore the lifetime
// markers to avoid misleading the analysis.
- if (II.getIntrinsicID() != Intrinsic::lifetime_start || !IsOffsetKnown ||
- !Offset.isZero())
+ if (!IsOffsetKnown || !Offset.isZero())
+ return Base::visitIntrinsicInst(II);
+ switch (II.getIntrinsicID()) {
+ default:
return Base::visitIntrinsicInst(II);
- LifetimeStarts.insert(&II);
+ case Intrinsic::lifetime_start:
+ LifetimeStarts.insert(&II);
+ LifetimeStartBBs.push_back(II.getParent());
+ break;
+ case Intrinsic::lifetime_end:
+ LifetimeEndBBs.insert(II.getParent());
+ break;
+ }
}
void visitCallBase(CallBase &CB) {
@@ -1586,7 +1601,7 @@ struct AllocaUseVisitor : PtrUseVisitor<AllocaUseVisitor> {
private:
const DominatorTree &DT;
- const CoroBeginInst &CoroBegin;
+ const coro::Shape &CoroShape;
const SuspendCrossingInfo &Checker;
// All alias to the original AllocaInst, created before CoroBegin and used
// after CoroBegin. Each entry contains the instruction and the offset in the
@@ -1594,6 +1609,9 @@ private:
DenseMap<Instruction *, std::optional<APInt>> AliasOffetMap{};
SmallPtrSet<Instruction *, 4> Users{};
SmallPtrSet<IntrinsicInst *, 2> LifetimeStarts{};
+ SmallVector<BasicBlock *> LifetimeStartBBs{};
+ SmallPtrSet<BasicBlock *, 2> LifetimeEndBBs{};
+ SmallPtrSet<const BasicBlock *, 2> CoroSuspendBBs{};
bool MayWriteBeforeCoroBegin{false};
bool ShouldUseLifetimeStartInfo{true};
@@ -1605,10 +1623,19 @@ private:
// every basic block that uses the pointer to see if they cross suspension
// points. The uses cover both direct uses as well as indirect uses.
if (ShouldUseLifetimeStartInfo && !LifetimeStarts.empty()) {
- for (auto *I : Users)
- for (auto *S : LifetimeStarts)
- if (Checker.isDefinitionAcrossSuspend(*S, I))
- return true;
+ // If there is no explicit lifetime.end, then assume the address can
+ // cross suspension points.
+ if (LifetimeEndBBs.empty())
+ return true;
+
+ // If there is a path from a lifetime.start to a suspend without a
+ // corresponding lifetime.end, then the alloca's lifetime persists
+ // beyond that suspension point and the alloca must go on the frame.
+ llvm::SmallVector<BasicBlock *> Worklist(LifetimeStartBBs);
+ if (isManyPotentiallyReachableFromMany(Worklist, CoroSuspendBBs,
+ &LifetimeEndBBs, &DT))
+ return true;
+
// Addresses are guaranteed to be identical after every lifetime.start so
// we cannot use the local stack if the address escaped and there is a
// suspend point between lifetime markers. This should also cover the
@@ -1646,13 +1673,13 @@ private:
}
void handleMayWrite(const Instruction &I) {
- if (!DT.dominates(&CoroBegin, &I))
+ if (!DT.dominates(CoroShape.CoroBegin, &I))
MayWriteBeforeCoroBegin = true;
}
bool usedAfterCoroBegin(Instruction &I) {
for (auto &U : I.uses())
- if (DT.dominates(&CoroBegin, U))
+ if (DT.dominates(CoroShape.CoroBegin, U))
return true;
return false;
}
@@ -1661,7 +1688,7 @@ private:
// We track all aliases created prior to CoroBegin but used after.
// These aliases may need to be recreated after CoroBegin if the alloca
// need to live on the frame.
- if (DT.dominates(&CoroBegin, &I) || !usedAfterCoroBegin(I))
+ if (DT.dominates(CoroShape.CoroBegin, &I) || !usedAfterCoroBegin(I))
return;
if (!IsOffsetKnown) {
@@ -2830,8 +2857,7 @@ static void collectFrameAlloca(AllocaInst *AI, coro::Shape &Shape,
bool ShouldUseLifetimeStartInfo =
(Shape.ABI != coro::ABI::Async && Shape.ABI != coro::ABI::Retcon &&
Shape.ABI != coro::ABI::RetconOnce);
- AllocaUseVisitor Visitor{AI->getModule()->getDataLayout(), DT,
- *Shape.CoroBegin, Checker,
+ AllocaUseVisitor Visitor{AI->getModule()->getDataLayout(), DT, Shape, Checker,
ShouldUseLifetimeStartInfo};
Visitor.visitPtr(*AI);
if (!Visitor.getShouldLiveOnFrame())
@@ -2948,10 +2974,12 @@ void coro::salvageDebugInfo(
std::optional<BasicBlock::iterator> InsertPt;
if (auto *I = dyn_cast<Instruction>(Storage)) {
InsertPt = I->getInsertionPointAfterDef();
- // Update DILocation only in O0 since it is easy to get out of sync in
- // optimizations. See https://github.com/llvm/llvm-project/pull/75104 for
- // an example.
- if (!OptimizeFrame && I->getDebugLoc())
+ // Update DILocation only if variable was not inlined.
+ DebugLoc ILoc = I->getDebugLoc();
+ DebugLoc DVILoc = DVI.getDebugLoc();
+ if (ILoc && DVILoc &&
+ DVILoc->getScope()->getSubprogram() ==
+ ILoc->getScope()->getSubprogram())
DVI.setDebugLoc(I->getDebugLoc());
} else if (isa<Argument>(Storage))
InsertPt = F->getEntryBlock().begin();
@@ -2988,11 +3016,13 @@ void coro::salvageDebugInfo(
std::optional<BasicBlock::iterator> InsertPt;
if (auto *I = dyn_cast<Instruction>(Storage)) {
InsertPt = I->getInsertionPointAfterDef();
- // Update DILocation only in O0 since it is easy to get out of sync in
- // optimizations. See https://github.com/llvm/llvm-project/pull/75104 for
- // an example.
- if (!OptimizeFrame && I->getDebugLoc())
- DVR.setDebugLoc(I->getDebugLoc());
+ // Update DILocation only if variable was not inlined.
+ DebugLoc ILoc = I->getDebugLoc();
+ DebugLoc DVRLoc = DVR.getDebugLoc();
+ if (ILoc && DVRLoc &&
+ DVRLoc->getScope()->getSubprogram() ==
+ ILoc->getScope()->getSubprogram())
+ DVR.setDebugLoc(ILoc);
} else if (isa<Argument>(Storage))
InsertPt = F->getEntryBlock().begin();
if (InsertPt) {
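
The new CoroFrame logic keeps an alloca on the coroutine frame when some lifetime.start block can reach a suspend block along a path that never passes through a lifetime.end block (isManyPotentiallyReachableFromMany with LifetimeEndBBs as the exclusion set). The following is a minimal, self-contained sketch of that style of query over a generic graph; it is not the LLVM API and glosses over details such as same-block ordering:

#include <cassert>
#include <map>
#include <queue>
#include <set>
#include <vector>

// Returns true if any source node reaches any target node without entering
// a barrier node. Source nodes themselves are always expanded.
static bool reachesAvoidingBarriers(const std::map<int, std::vector<int>> &Succ,
                                    const std::vector<int> &Sources,
                                    const std::set<int> &Targets,
                                    const std::set<int> &Barriers) {
  std::set<int> Visited(Sources.begin(), Sources.end());
  std::queue<int> Work;
  for (int S : Sources)
    Work.push(S);
  while (!Work.empty()) {
    int N = Work.front();
    Work.pop();
    if (Targets.count(N))
      return true;
    auto It = Succ.find(N);
    if (It == Succ.end())
      continue;
    for (int S : It->second) {
      if (Barriers.count(S) || !Visited.insert(S).second)
        continue; // never walk through a barrier or revisit a node
      Work.push(S);
    }
  }
  return false;
}

int main() {
  // 0 -> 1 -> 2 (suspend); 1 is a lifetime.end barrier, so 2 is unreachable.
  std::map<int, std::vector<int>> Succ = {{0, {1}}, {1, {2}}};
  assert(!reachesAvoidingBarriers(Succ, {0}, {2}, {1}));
  // Add a bypass edge 0 -> 2: the suspend is now reachable without an end.
  Succ[0].push_back(2);
  assert(reachesAvoidingBarriers(Succ, {0}, {2}, {1}));
}
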
diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
index 1d9cf18..5a58a99 100644
--- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -227,6 +227,7 @@ static void lowerAwaitSuspend(IRBuilder<> &Builder, CoroAwaitSuspendInst *CB,
FunctionType *ResumeTy = FunctionType::get(
Type::getVoidTy(Ctx), PointerType::getUnqual(Ctx), false);
auto *ResumeCall = Builder.CreateCall(ResumeTy, ResumeAddr, {NewCall});
+ ResumeCall->setCallingConv(CallingConv::Fast);
// We can't insert the 'ret' instruction and adjust the cc until the
// function has been split, so remember this for later.
@@ -1088,7 +1089,6 @@ void CoroCloner::create() {
// Turn symmetric transfers into musttail calls.
for (CallInst *ResumeCall : Shape.SymmetricTransfers) {
ResumeCall = cast<CallInst>(VMap[ResumeCall]);
- ResumeCall->setCallingConv(NewF->getCallingConv());
if (TTI.supportsTailCallFor(ResumeCall)) {
// FIXME: Could we support symmetric transfer effectively without
// musttail?
diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp
index e3920b9..b686658 100644
--- a/llvm/lib/Transforms/IPO/Attributor.cpp
+++ b/llvm/lib/Transforms/IPO/Attributor.cpp
@@ -3954,7 +3954,7 @@ static bool runAttributorLightOnFunctions(InformationCache &InfoCache,
// We look at internal functions only on-demand but if any use is not a
// direct call or outside the current set of analyzed functions, we have
// to do it eagerly.
- if (F->hasLocalLinkage()) {
+ if (AC.UseLiveness && F->hasLocalLinkage()) {
if (llvm::all_of(F->uses(), [&Functions](const Use &U) {
const auto *CB = dyn_cast<CallBase>(U.getUser());
return CB && CB->isCallee(&U) &&
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 41b66aa..1b3bf3c 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -5690,6 +5690,9 @@ bool AANoCapture::isImpliedByIR(Attributor &A, const IRPosition &IRP,
return V.use_empty();
// You cannot "capture" null in the default address space.
+ //
+ // FIXME: This should use NullPointerIsDefined to account for the function
+ // attribute.
if (isa<UndefValue>(V) || (isa<ConstantPointerNull>(V) &&
V.getType()->getPointerAddressSpace() == 0)) {
return true;
@@ -5899,10 +5902,13 @@ ChangeStatus AANoCaptureImpl::updateImpl(Attributor &A) {
const Function *F =
isArgumentPosition() ? IRP.getAssociatedFunction() : IRP.getAnchorScope();
- assert(F && "Expected a function!");
- const IRPosition &FnPos = IRPosition::function(*F);
+
+ // TODO: Is the checkForAllUses below useful for constants?
+ if (!F)
+ return indicatePessimisticFixpoint();
AANoCapture::StateType T;
+ const IRPosition &FnPos = IRPosition::function(*F);
// Readonly means we cannot capture through memory.
bool IsKnown;
diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp
index a116fd6..cb19bf2 100644
--- a/llvm/lib/Transforms/IPO/FunctionImport.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp
@@ -1435,7 +1435,8 @@ void llvm::gatherImportedSummariesForModule(
StringRef ModulePath,
const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
const FunctionImporter::ImportMapTy &ImportList,
- std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex) {
+ std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex,
+ GVSummaryPtrSet &DecSummaries) {
// Include all summaries from the importing module.
ModuleToSummariesForIndex[std::string(ModulePath)] =
ModuleToDefinedGVSummaries.lookup(ModulePath);
@@ -1450,7 +1451,7 @@ void llvm::gatherImportedSummariesForModule(
assert(DS != DefinedGVSummaries.end() &&
"Expected a defined summary for imported global value");
if (Type == GlobalValueSummary::Declaration)
- continue;
+ DecSummaries.insert(DS->second);
SummariesForIndex[GUID] = DS->second;
}
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index b9d84d5..c53b945 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -1889,15 +1889,17 @@ bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
} else if (findProfiledCalleeThroughTailCalls(
ProfiledCallee, CalledFunction, Depth + 1,
FoundCalleeChain, FoundMultipleCalleeChains)) {
- if (FoundMultipleCalleeChains)
- return false;
+ // findProfiledCalleeThroughTailCalls should not have returned
+ // true if FoundMultipleCalleeChains.
+ assert(!FoundMultipleCalleeChains);
if (FoundSingleCalleeChain) {
FoundMultipleCalleeChains = true;
return false;
}
FoundSingleCalleeChain = true;
SaveCallsiteInfo(&I, CalleeFunc);
- }
+ } else if (FoundMultipleCalleeChains)
+ return false;
}
}
@@ -2004,8 +2006,9 @@ bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
} else if (findProfiledCalleeThroughTailCalls(
ProfiledCallee, CallEdge.first, Depth + 1,
FoundCalleeChain, FoundMultipleCalleeChains)) {
- if (FoundMultipleCalleeChains)
- return false;
+ // findProfiledCalleeThroughTailCalls should not have returned
+ // true if FoundMultipleCalleeChains.
+ assert(!FoundMultipleCalleeChains);
if (FoundSingleCalleeChain) {
FoundMultipleCalleeChains = true;
return false;
@@ -2015,7 +2018,8 @@ bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
// Add FS to FSToVIMap in case it isn't already there.
assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
FSToVIMap[FS] = FSVI;
- }
+ } else if (FoundMultipleCalleeChains)
+ return false;
}
}
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index eea9399..e3a4821 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -4238,7 +4238,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
ORA << "Value has potential side effects preventing SPMD-mode "
"execution";
if (isa<CallBase>(NonCompatibleI)) {
- ORA << ". Add `__attribute__((assume(\"ompx_spmd_amenable\")))` to "
+ ORA << ". Add `[[omp::assume(\"ompx_spmd_amenable\")]]` to "
"the called function to override";
}
return ORA << ".";
@@ -4380,7 +4380,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
continue;
auto Remark = [&](OptimizationRemarkAnalysis ORA) {
return ORA << "Call may contain unknown parallel regions. Use "
- << "`__attribute__((assume(\"omp_no_parallelism\")))` to "
+ << "`[[omp::assume(\"omp_no_parallelism\")]]` to "
"override.";
};
A.emitRemark<OptimizationRemarkAnalysis>(UnknownParallelRegionCB,
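
The remark text now points users at the C++ attribute spelling. To the best of my knowledge Clang accepts [[omp::assume("...")]] on a function declaration as the replacement for the older __attribute__((assume("..."))) form; a hedged usage sketch of what the updated remarks ask for (the function below is made up for the example):

// Annotate a callee so OpenMPOpt may assume it is SPMD-amenable and
// contains no unknown parallel regions. The strings are the ones the
// remarks above mention.
[[omp::assume("ompx_spmd_amenable")]]
[[omp::assume("omp_no_parallelism")]]
void device_helper(double *x, int n) {
  for (int i = 0; i < n; ++i)
    x[i] *= 2.0;
}
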
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 542a1c8..430f3e1 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -214,6 +214,9 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal(
// Find out if the comparison would be true or false for the i'th element.
Constant *C = ConstantFoldCompareInstOperands(ICI.getPredicate(), Elt,
CompareRHS, DL, &TLI);
+ if (!C)
+ return nullptr;
+
// If the result is undef for this element, ignore it.
if (isa<UndefValue>(C)) {
// Extend range state machines to cover this element in case there is an
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 4351a55..832f89e 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -332,7 +332,7 @@ bool PointerReplacer::collectUsersRecursive(Instruction &I) {
Worklist.insert(SI);
if (!collectUsersRecursive(*SI))
return false;
- } else if (isa<GetElementPtrInst, BitCastInst>(Inst)) {
+ } else if (isa<GetElementPtrInst>(Inst)) {
Worklist.insert(Inst);
if (!collectUsersRecursive(*Inst))
return false;
@@ -393,15 +393,6 @@ void PointerReplacer::replace(Instruction *I) {
NewI->takeName(GEP);
NewI->setIsInBounds(GEP->isInBounds());
WorkMap[GEP] = NewI;
- } else if (auto *BC = dyn_cast<BitCastInst>(I)) {
- auto *V = getReplacement(BC->getOperand(0));
- assert(V && "Operand not replaced");
- auto *NewT = PointerType::get(BC->getType()->getContext(),
- V->getType()->getPointerAddressSpace());
- auto *NewI = new BitCastInst(V, NewT);
- IC.InsertNewInstWith(NewI, BC->getIterator());
- NewI->takeName(BC);
- WorkMap[BC] = NewI;
} else if (auto *SI = dyn_cast<SelectInst>(I)) {
auto *NewSI = SelectInst::Create(
SI->getCondition(), getReplacement(SI->getTrueValue()),
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 6c25ff2..eb48157 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -5000,31 +5000,24 @@ bool InstCombinerImpl::run() {
BasicBlock *UserParent = nullptr;
unsigned NumUsers = 0;
- for (auto *U : I->users()) {
- if (U->isDroppable())
+ for (Use &U : I->uses()) {
+ User *User = U.getUser();
+ if (User->isDroppable())
continue;
if (NumUsers > MaxSinkNumUsers)
return std::nullopt;
- Instruction *UserInst = cast<Instruction>(U);
+ Instruction *UserInst = cast<Instruction>(User);
// Special handling for Phi nodes - get the block the use occurs in.
- if (PHINode *PN = dyn_cast<PHINode>(UserInst)) {
- for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) {
- if (PN->getIncomingValue(i) == I) {
- // Bail out if we have uses in different blocks. We don't do any
- // sophisticated analysis (i.e finding NearestCommonDominator of
- // these use blocks).
- if (UserParent && UserParent != PN->getIncomingBlock(i))
- return std::nullopt;
- UserParent = PN->getIncomingBlock(i);
- }
- }
- assert(UserParent && "expected to find user block!");
- } else {
- if (UserParent && UserParent != UserInst->getParent())
- return std::nullopt;
- UserParent = UserInst->getParent();
- }
+ BasicBlock *UserBB = UserInst->getParent();
+ if (PHINode *PN = dyn_cast<PHINode>(UserInst))
+ UserBB = PN->getIncomingBlock(U);
+ // Bail out if we have uses in different blocks. We don't do any
+      // sophisticated analysis (i.e., finding the NearestCommonDominator of
+      // these use blocks).
+ if (UserParent && UserParent != UserBB)
+ return std::nullopt;
+ UserParent = UserBB;
// Make sure these checks are done only once, naturally we do the checks
// the first time we get the userparent, this will save compile time.
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 8d39217..2aa2175 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -1589,6 +1589,14 @@ void HWAddressSanitizer::sanitizeFunction(Function &F,
assert(!ShadowBase);
+ // Remove memory attributes that are about to become invalid.
+  // HWASan checks read from shadow, which invalidates memory(argmem: *).
+ // Short granule checks on function arguments read from the argument memory
+ // (last byte of the granule), which invalidates writeonly.
+ F.removeFnAttr(llvm::Attribute::Memory);
+ for (auto &A : F.args())
+ A.removeAttr(llvm::Attribute::WriteOnly);
+
BasicBlock::iterator InsertPt = F.getEntryBlock().begin();
IRBuilder<> EntryIRB(&F.getEntryBlock(), InsertPt);
emitPrologue(EntryIRB,
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index 7e48c28..70bfa46 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -554,6 +554,12 @@ static Decomposition decompose(Value *V,
V = Op0;
}
+ if (match(V, m_SExt(m_Value(Op0)))) {
+ V = Op0;
+ Preconditions.emplace_back(CmpInst::ICMP_SGE, Op0,
+ ConstantInt::get(Op0->getType(), 0));
+ }
+
Value *Op1;
ConstantInt *CI;
if (match(V, m_NUWAdd(m_Value(Op0), m_Value(Op1)))) {
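The sext handling added above is only value-preserving for non-negative operands, which is why the decomposition records an ICMP_SGE 0 precondition instead of stripping the extension unconditionally. A minimal sketch of that invariant in terms of APInt (widths and values are illustrative):

```cpp
// Demonstrates why the ICMP_SGE 0 precondition makes looking through an sext
// sound: for a non-negative value, sign- and zero-extension agree.
#include "llvm/ADT/APInt.h"
#include <cassert>

int main() {
  llvm::APInt NonNeg(8, 5);                       // i8 5, sign bit clear
  llvm::APInt Negative(8, -3, /*isSigned=*/true); // i8 -3, sign bit set

  // Non-negative operand: the extended value equals the original, so facts
  // about Op0 carry over to the sext'ed value V.
  assert(NonNeg.sext(32) == NonNeg.zext(32));

  // Negative operand: sign extension changes the unsigned value, so the
  // decomposition must not assume it away; hence the recorded precondition.
  assert(Negative.sext(32) != Negative.zext(32));
  return 0;
}
```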
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index ce40e8b..4f36bac 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -43,7 +43,7 @@ static ValueLatticeElement::MergeOptions getMaxWidenStepsOpts() {
}
static ConstantRange getConstantRange(const ValueLatticeElement &LV, Type *Ty,
- bool UndefAllowed = true) {
+ bool UndefAllowed) {
assert(Ty->isIntOrIntVectorTy() && "Should be int or int vector");
if (LV.isConstantRange(UndefAllowed))
return LV.getConstantRange();
@@ -1297,7 +1297,8 @@ void SCCPInstVisitor::visitCastInst(CastInst &I) {
if (I.getDestTy()->isIntegerTy() && I.getSrcTy()->isIntOrIntVectorTy()) {
auto &LV = getValueState(&I);
- ConstantRange OpRange = getConstantRange(OpSt, I.getSrcTy());
+ ConstantRange OpRange =
+ getConstantRange(OpSt, I.getSrcTy(), /*UndefAllowed=*/false);
Type *DestTy = I.getDestTy();
// Vectors where all elements have the same known constant range are treated
@@ -1329,8 +1330,8 @@ void SCCPInstVisitor::handleExtractOfWithOverflow(ExtractValueInst &EVI,
return; // Wait to resolve.
Type *Ty = LHS->getType();
- ConstantRange LR = getConstantRange(L, Ty);
- ConstantRange RR = getConstantRange(R, Ty);
+ ConstantRange LR = getConstantRange(L, Ty, /*UndefAllowed=*/false);
+ ConstantRange RR = getConstantRange(R, Ty, /*UndefAllowed=*/false);
if (Idx == 0) {
ConstantRange Res = LR.binaryOp(WO->getBinaryOp(), RR);
mergeInValue(&EVI, ValueLatticeElement::getRange(Res));
@@ -1534,8 +1535,10 @@ void SCCPInstVisitor::visitBinaryOperator(Instruction &I) {
return markOverdefined(&I);
// Try to simplify to a constant range.
- ConstantRange A = getConstantRange(V1State, I.getType());
- ConstantRange B = getConstantRange(V2State, I.getType());
+ ConstantRange A =
+ getConstantRange(V1State, I.getType(), /*UndefAllowed=*/false);
+ ConstantRange B =
+ getConstantRange(V2State, I.getType(), /*UndefAllowed=*/false);
auto *BO = cast<BinaryOperator>(&I);
ConstantRange R = ConstantRange::getEmpty(I.getType()->getScalarSizeInBits());
@@ -1818,7 +1821,8 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) {
// Combine range info for the original value with the new range from the
// condition.
- auto CopyOfCR = getConstantRange(CopyOfVal, CopyOf->getType());
+ auto CopyOfCR = getConstantRange(CopyOfVal, CopyOf->getType(),
+ /*UndefAllowed=*/true);
auto NewCR = ImposedCR.intersectWith(CopyOfCR);
// If the existing information is != x, do not use the information from
// a chained predicate, as the != x information is more likely to be
@@ -1863,7 +1867,8 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) {
const ValueLatticeElement &State = getValueState(Op);
if (State.isUnknownOrUndef())
return;
- OpRanges.push_back(getConstantRange(State, Op->getType()));
+ OpRanges.push_back(
+ getConstantRange(State, Op->getType(), /*UndefAllowed=*/false));
}
ConstantRange Result =
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 93701b2..fe6ec88 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -5501,11 +5501,13 @@ static bool CasesAreContiguous(SmallVectorImpl<ConstantInt *> &Cases) {
}
static void createUnreachableSwitchDefault(SwitchInst *Switch,
- DomTreeUpdater *DTU) {
+ DomTreeUpdater *DTU,
+ bool RemoveOrigDefaultBlock = true) {
LLVM_DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n");
auto *BB = Switch->getParent();
auto *OrigDefaultBlock = Switch->getDefaultDest();
- OrigDefaultBlock->removePredecessor(BB);
+ if (RemoveOrigDefaultBlock)
+ OrigDefaultBlock->removePredecessor(BB);
BasicBlock *NewDefaultBlock = BasicBlock::Create(
BB->getContext(), BB->getName() + ".unreachabledefault", BB->getParent(),
OrigDefaultBlock);
@@ -5514,7 +5516,8 @@ static void createUnreachableSwitchDefault(SwitchInst *Switch,
if (DTU) {
SmallVector<DominatorTree::UpdateType, 2> Updates;
Updates.push_back({DominatorTree::Insert, BB, &*NewDefaultBlock});
- if (!is_contained(successors(BB), OrigDefaultBlock))
+ if (RemoveOrigDefaultBlock &&
+ !is_contained(successors(BB), OrigDefaultBlock))
Updates.push_back({DominatorTree::Delete, BB, &*OrigDefaultBlock});
DTU->applyUpdates(Updates);
}
@@ -5696,10 +5699,33 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU,
Known.getBitWidth() - (Known.Zero | Known.One).popcount();
assert(NumUnknownBits <= Known.getBitWidth());
if (HasDefault && DeadCases.empty() &&
- NumUnknownBits < 64 /* avoid overflow */ &&
- SI->getNumCases() == (1ULL << NumUnknownBits)) {
- createUnreachableSwitchDefault(SI, DTU);
- return true;
+ NumUnknownBits < 64 /* avoid overflow */) {
+ uint64_t AllNumCases = 1ULL << NumUnknownBits;
+ if (SI->getNumCases() == AllNumCases) {
+ createUnreachableSwitchDefault(SI, DTU);
+ return true;
+ }
+ // When only one case value is missing, replace default with that case.
+ // Eliminating the default branch will provide more opportunities for
+ // optimization, such as lookup tables.
+ if (SI->getNumCases() == AllNumCases - 1) {
+ assert(NumUnknownBits > 1 && "Should be canonicalized to a branch");
+ IntegerType *CondTy = cast<IntegerType>(Cond->getType());
+ if (CondTy->getIntegerBitWidth() > 64 ||
+ !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth()))
+ return false;
+
+ uint64_t MissingCaseVal = 0;
+ for (const auto &Case : SI->cases())
+ MissingCaseVal ^= Case.getCaseValue()->getValue().getLimitedValue();
+ auto *MissingCase =
+ cast<ConstantInt>(ConstantInt::get(Cond->getType(), MissingCaseVal));
+ SwitchInstProfUpdateWrapper SIW(*SI);
+ SIW.addCase(MissingCase, SI->getDefaultDest(), SIW.getSuccessorWeight(0));
+ createUnreachableSwitchDefault(SI, DTU, /*RemoveOrigDefaultBlock*/ false);
+ SIW.setSuccessorWeight(0, 0);
+ return true;
+ }
}
if (DeadCases.empty())
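The MissingCaseVal computation above relies on a small XOR identity: with k >= 2 unknown bits, every bit position is set in an even number of the 2^k possible case values, so all of them XOR to zero and XOR-ing the values that are present recovers the one that is absent. A self-contained sketch of that arithmetic (plain integers and illustrative values, no SwitchInst involved):

```cpp
// XOR of all 2^k possible case values is 0 for k >= 2, so XOR-ing the
// present values yields the single missing one.
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // k = 2 unknown bits: possible case values {0, 1, 2, 3}; value 2 is absent.
  std::vector<uint64_t> PresentCases = {0, 1, 3};
  uint64_t MissingCaseVal = 0;
  for (uint64_t C : PresentCases)
    MissingCaseVal ^= C;
  assert(MissingCaseVal == 2);
  return 0;
}
```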
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6d64aaa..48981a6 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3384,18 +3384,6 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
TargetTransformInfo::TCK_RecipThroughput);
}
-static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
- auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
- auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
- return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
-}
-
-static Type *largestIntegerVectorType(Type *T1, Type *T2) {
- auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
- auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
- return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
-}
-
void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
VPlan &Plan) {
// Fix widened non-induction PHIs by setting up the PHI operands.
@@ -7120,26 +7108,12 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
return *RedCost;
Type *SrcScalarTy = I->getOperand(0)->getType();
+ Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
+ if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
+ SrcScalarTy =
+ IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]);
Type *SrcVecTy =
VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
- if (canTruncateToMinimalBitwidth(I, VF)) {
- // This cast is going to be shrunk. This may remove the cast or it might
- // turn it into slightly different cast. For example, if MinBW == 16,
- // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
- //
- // Calculate the modified src and dest types.
- Type *MinVecTy = VectorTy;
- if (Opcode == Instruction::Trunc) {
- SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
- VectorTy =
- largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
- } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
- // Leave SrcVecTy unchanged - we only shrink the destination element
- // type.
- VectorTy =
- smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
- }
- }
return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
}
@@ -7533,8 +7507,9 @@ LoopVectorizationPlanner::executePlan(
LLVM_DEBUG(BestVPlan.dump());
// Perform the actual loop transformation.
- VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
- OrigLoop->getHeader()->getContext());
+ VPTransformState State(BestVF, BestUF, LI,
+ EnableVPlanNativePath ? nullptr : DT, ILV.Builder,
+ &ILV, &BestVPlan, OrigLoop->getHeader()->getContext());
// 0. Generate SCEV-dependent code into the preheader, including TripCount,
// before making any changes to the CFG.
@@ -8157,8 +8132,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
static VPWidenIntOrFpInductionRecipe *
createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
VPValue *Start, const InductionDescriptor &IndDesc,
- VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop,
- VFRange &Range) {
+ VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {
assert(IndDesc.getStartValue() ==
Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
@@ -8180,7 +8154,7 @@ VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
// produces its scalar and vector values.
if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
- *PSE.getSE(), *OrigLoop, Range);
+ *PSE.getSE(), *OrigLoop);
// Check if this is pointer induction. If so, build the recipe for it.
if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
@@ -8220,7 +8194,7 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue());
return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
- *OrigLoop, Range);
+ *OrigLoop);
}
return nullptr;
}
@@ -8561,8 +8535,10 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
*Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
VPlanTransforms::optimize(*Plan, *PSE.getSE());
// TODO: try to put it close to addActiveLaneMask().
- if (CM.foldTailWithEVL())
- VPlanTransforms::addExplicitVectorLength(*Plan);
+ // Discard the plan if it is not EVL-compatible
+ if (CM.foldTailWithEVL() &&
+ !VPlanTransforms::tryAddExplicitVectorLength(*Plan))
+ break;
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
VPlans.push_back(std::move(Plan));
}
@@ -10402,6 +10378,7 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
PA.preserve<DominatorTreeAnalysis>();
PA.preserve<ScalarEvolutionAnalysis>();
}
+
PA.preserve<LoopAnalysis>();
if (Result.MadeCFGChange) {
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 140a1b1..f044a8c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -255,6 +255,21 @@ static bool isVectorLikeInstWithConstOps(Value *V) {
return isConstant(I->getOperand(2));
}
+/// Returns the power-of-2 number of elements in a single register (part),
+/// given the total number of elements \p Size and the number of registers
+/// (parts) \p NumParts.
+static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
+ return PowerOf2Ceil(divideCeil(Size, NumParts));
+}
+
+/// Returns the number of elements remaining for the current register (part)
+/// \p Part, given the total number of elements \p Size and the (power-of-2)
+/// number of elements in a single register \p PartNumElems.
+static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
+ unsigned Part) {
+ return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
+}
+
#if !defined(NDEBUG)
/// Print a short descriptor of the instruction bundle suitable for debug output.
static std::string shortBundleName(ArrayRef<Value *> VL) {
@@ -502,6 +517,15 @@ isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
Value *Vec1 = nullptr;
Value *Vec2 = nullptr;
+ bool HasNonUndefVec = any_of(VL, [](Value *V) {
+ auto *EE = dyn_cast<ExtractElementInst>(V);
+ if (!EE)
+ return false;
+ Value *Vec = EE->getVectorOperand();
+ if (isa<UndefValue>(Vec))
+ return false;
+ return isGuaranteedNotToBePoison(Vec);
+ });
enum ShuffleMode { Unknown, Select, Permute };
ShuffleMode CommonShuffleMode = Unknown;
Mask.assign(VL.size(), PoisonMaskElem);
@@ -514,21 +538,27 @@ isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
return std::nullopt;
auto *Vec = EI->getVectorOperand();
// We can extractelement from undef or poison vector.
- if (isUndefVector(Vec).all())
+ if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
continue;
// All vector operands must have the same number of vector elements.
- if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
- return std::nullopt;
- if (isa<UndefValue>(EI->getIndexOperand()))
- continue;
- auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
- if (!Idx)
- return std::nullopt;
- // Undefined behavior if Idx is negative or >= Size.
- if (Idx->getValue().uge(Size))
+ if (isa<UndefValue>(Vec)) {
+ Mask[I] = I;
+ } else {
+ if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
+ return std::nullopt;
+ if (isa<UndefValue>(EI->getIndexOperand()))
+ continue;
+ auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
+ if (!Idx)
+ return std::nullopt;
+ // Undefined behavior if Idx is negative or >= Size.
+ if (Idx->getValue().uge(Size))
+ continue;
+ unsigned IntIdx = Idx->getValue().getZExtValue();
+ Mask[I] = IntIdx;
+ }
+ if (isUndefVector(Vec).all() && HasNonUndefVec)
continue;
- unsigned IntIdx = Idx->getValue().getZExtValue();
- Mask[I] = IntIdx;
// For correct shuffling we have to have at most 2 different vector operands
// in all extractelement instructions.
if (!Vec1 || Vec1 == Vec) {
@@ -543,7 +573,7 @@ isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
continue;
// If the extract index is not the same as the operation number, it is a
// permutation.
- if (IntIdx != I) {
+ if (Mask[I] % Size != I) {
CommonShuffleMode = Permute;
continue;
}
@@ -4066,7 +4096,8 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
const int VF = GetVF(I);
if (VF == 0)
continue;
- MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, PartSz);
+ unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
+ MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
// Shuffle of at least 2 vectors - ignore.
if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
std::fill(Slice.begin(), Slice.end(), NumScalars);
@@ -4076,7 +4107,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
// Try to include as much elements from the mask as possible.
int FirstMin = INT_MAX;
int SecondVecFound = false;
- for (int K : seq<int>(0, PartSz)) {
+ for (int K : seq<int>(Limit)) {
int Idx = Mask[I * PartSz + K];
if (Idx == PoisonMaskElem) {
Value *V = GatheredScalars[I * PartSz + K];
@@ -4101,7 +4132,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
ShuffledSubMasks.set(I);
continue;
}
- for (int K : seq<int>(0, PartSz)) {
+ for (int K : seq<int>(Limit)) {
int Idx = Mask[I * PartSz + K];
if (Idx == PoisonMaskElem)
continue;
@@ -4124,14 +4155,15 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
}
}
};
- int PartSz = NumScalars / NumParts;
+ int PartSz = getPartNumElems(NumScalars, NumParts);
if (!ExtractShuffles.empty())
TransformMaskToOrder(
CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
if (!ExtractShuffles[I])
return 0U;
unsigned VF = 0;
- for (unsigned Idx : seq<unsigned>(0, PartSz)) {
+ unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
+ for (unsigned Idx : seq<unsigned>(Sz)) {
int K = I * PartSz + Idx;
if (ExtractMask[K] == PoisonMaskElem)
continue;
@@ -4762,12 +4794,13 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
::addMask(ReorderMask, TE.ReuseShuffleIndices);
unsigned VF = ReorderMask.size();
OrdersType ResOrder(VF, VF);
- unsigned NumParts = VF / Sz;
+ unsigned NumParts = divideCeil(VF, Sz);
SmallBitVector UsedVals(NumParts);
for (unsigned I = 0; I < VF; I += Sz) {
int Val = PoisonMaskElem;
unsigned UndefCnt = 0;
- if (any_of(ArrayRef(ReorderMask).slice(I, Sz),
+ unsigned Limit = std::min(Sz, VF - I);
+ if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
[&](int Idx) {
if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
Val = Idx;
@@ -6861,23 +6894,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::ExtractElement: {
if (CurrentOrder.empty()) {
LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
- newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndices);
- // This is a special case, as it does not gather, but at the same time
- // we are not extending buildTree_rec() towards the operands.
- ValueList Op0;
- Op0.assign(VL.size(), VL0->getOperand(0));
- VectorizableTree.back()->setOperand(0, Op0);
- return;
+ } else {
+ LLVM_DEBUG({
+ dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
+ "with order";
+ for (unsigned Idx : CurrentOrder)
+ dbgs() << " " << Idx;
+ dbgs() << "\n";
+ });
+ fixupOrderingIndices(CurrentOrder);
}
- LLVM_DEBUG({
- dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
- "with order";
- for (unsigned Idx : CurrentOrder)
- dbgs() << " " << Idx;
- dbgs() << "\n";
- });
- fixupOrderingIndices(CurrentOrder);
// Insert new order with initial value 0, if it does not exist,
// otherwise return the iterator to the existing one.
newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
@@ -6931,28 +6957,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
fixupOrderingIndices(CurrentOrder);
switch (State) {
case TreeEntry::Vectorize:
- if (CurrentOrder.empty()) {
- // Original loads are consecutive and does not require reordering.
- TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndices);
+ TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndices, CurrentOrder);
+ if (CurrentOrder.empty())
LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
- } else {
- // Need to reorder.
- TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndices, CurrentOrder);
+ else
LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
- }
TE->setOperandsInOrder();
break;
case TreeEntry::StridedVectorize:
// Vectorizing non-consecutive loads with `llvm.masked.gather`.
- if (CurrentOrder.empty()) {
- TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
- UserTreeIdx, ReuseShuffleIndices);
- } else {
- TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
- UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
- }
+ TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
+ UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
TE->setOperandsInOrder();
LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
break;
@@ -7966,6 +7982,10 @@ void BoUpSLP::transformNodes() {
TreeEntry &E = *TE.get();
switch (E.getOpcode()) {
case Instruction::Load: {
+      // No need to reorder masked gather loads; just reorder the scalar
+      // operands.
+ if (E.State != TreeEntry::Vectorize)
+ break;
Type *ScalarTy = E.getMainOp()->getType();
auto *VecTy = FixedVectorType::get(ScalarTy, E.Scalars.size());
Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
@@ -8279,19 +8299,18 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
return Sz;
return std::max(Sz, VecTy->getNumElements());
});
- unsigned NumSrcRegs =
- TTI.getNumberOfParts(FixedVectorType::get(ScalarTy, NumElts));
- if (NumSrcRegs == 0)
- NumSrcRegs = 1;
// FIXME: this must be moved to TTI for better estimation.
- unsigned EltsPerVector = PowerOf2Ceil(std::max(
- divideCeil(VL.size(), NumParts), divideCeil(NumElts, NumSrcRegs)));
+ unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
auto CheckPerRegistersShuffle =
- [&](MutableArrayRef<int> Mask) -> std::optional<TTI::ShuffleKind> {
+ [&](MutableArrayRef<int> Mask,
+ SmallVector<int> Indices) -> std::optional<TTI::ShuffleKind> {
+ if (NumElts <= EltsPerVector)
+ return std::nullopt;
DenseSet<int> RegIndices;
// Check that if trying to permute same single/2 input vectors.
TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
int FirstRegId = -1;
+ Indices.assign(1, -1);
for (int &I : Mask) {
if (I == PoisonMaskElem)
continue;
@@ -8301,8 +8320,15 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
RegIndices.insert(RegId);
if (RegIndices.size() > 2)
return std::nullopt;
- if (RegIndices.size() == 2)
+ if (RegIndices.size() == 2) {
ShuffleKind = TTI::SK_PermuteTwoSrc;
+ if (Indices.size() == 1)
+ Indices.push_back(-1);
+ }
+ if (RegId == FirstRegId)
+ Indices.front() = I % NumElts;
+ else
+ Indices.back() = I % NumElts;
I = (I % NumElts) % EltsPerVector +
(RegId == FirstRegId ? 0 : EltsPerVector);
}
@@ -8313,22 +8339,23 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
// Process extracts in blocks of EltsPerVector to check if the source vector
// operand can be re-used directly. If not, add the cost of creating a
// shuffle to extract the values into a vector register.
- for (unsigned Part = 0; Part < NumParts; ++Part) {
+ for (unsigned Part : seq<unsigned>(NumParts)) {
if (!ShuffleKinds[Part])
continue;
- ArrayRef<int> MaskSlice =
- Mask.slice(Part * EltsPerVector,
- (Part == NumParts - 1 && Mask.size() % EltsPerVector != 0)
- ? Mask.size() % EltsPerVector
- : EltsPerVector);
+ ArrayRef<int> MaskSlice = Mask.slice(
+ Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
copy(MaskSlice, SubMask.begin());
+ SmallVector<int> Indices;
std::optional<TTI::ShuffleKind> RegShuffleKind =
- CheckPerRegistersShuffle(SubMask);
+ CheckPerRegistersShuffle(SubMask, Indices);
if (!RegShuffleKind) {
- Cost += ::getShuffleCost(TTI, *ShuffleKinds[Part],
- FixedVectorType::get(ScalarTy, NumElts),
- MaskSlice);
+ if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
+ !ShuffleVectorInst::isIdentityMask(
+ MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
+ Cost += ::getShuffleCost(TTI, *ShuffleKinds[Part],
+ FixedVectorType::get(ScalarTy, NumElts),
+ MaskSlice);
continue;
}
if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
@@ -8337,6 +8364,12 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
FixedVectorType::get(ScalarTy, EltsPerVector),
SubMask);
}
+ for (int Idx : Indices) {
+ Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
+ FixedVectorType::get(ScalarTy, NumElts),
+ std::nullopt, CostKind, Idx,
+ FixedVectorType::get(ScalarTy, EltsPerVector));
+ }
}
return Cost;
}
@@ -8364,11 +8397,11 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
InVectors.front().get<const TreeEntry *>() == &E1 &&
InVectors.back().get<const TreeEntry *>() == E2) ||
(!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
- assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, SliceSize),
+ unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
+ assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
[](int Idx) { return Idx == PoisonMaskElem; }) &&
"Expected all poisoned elements.");
- ArrayRef<int> SubMask =
- ArrayRef(Mask).slice(Part * SliceSize, SliceSize);
+ ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
return;
}
@@ -8688,10 +8721,11 @@ public:
});
});
SmallPtrSet<Value *, 4> UniqueBases;
- unsigned SliceSize = VL.size() / NumParts;
- for (unsigned Part = 0; Part < NumParts; ++Part) {
- ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
- for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, SliceSize))) {
+ unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
+ for (unsigned Part : seq<unsigned>(NumParts)) {
+ unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
+ ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
+ for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, Limit))) {
// Ignore non-extractelement scalars.
if (isa<UndefValue>(V) ||
(!SubMask.empty() && SubMask[I] == PoisonMaskElem))
@@ -8788,7 +8822,7 @@ public:
unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
if (NumParts == 0 || NumParts >= Mask.size())
NumParts = 1;
- unsigned SliceSize = Mask.size() / NumParts;
+ unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
const auto *It =
find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
@@ -8805,7 +8839,7 @@ public:
unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
if (NumParts == 0 || NumParts >= Mask.size())
NumParts = 1;
- unsigned SliceSize = Mask.size() / NumParts;
+ unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
const auto *It =
find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
@@ -10662,12 +10696,12 @@ BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
Mask.assign(VL.size(), PoisonMaskElem);
- unsigned SliceSize = VL.size() / NumParts;
- for (unsigned Part = 0; Part < NumParts; ++Part) {
+ unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
+ for (unsigned Part : seq<unsigned>(NumParts)) {
// Scan list of gathered scalars for extractelements that can be represented
// as shuffles.
- MutableArrayRef<Value *> SubVL =
- MutableArrayRef(VL).slice(Part * SliceSize, SliceSize);
+ MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
+ Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
SmallVector<int> SubMask;
std::optional<TTI::ShuffleKind> Res =
tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
@@ -11071,10 +11105,11 @@ BoUpSLP::isGatherShuffledEntry(
"Expected only single user of the gather node.");
assert(VL.size() % NumParts == 0 &&
"Number of scalars must be divisible by NumParts.");
- unsigned SliceSize = VL.size() / NumParts;
+ unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
SmallVector<std::optional<TTI::ShuffleKind>> Res;
- for (unsigned Part = 0; Part < NumParts; ++Part) {
- ArrayRef<Value *> SubVL = VL.slice(Part * SliceSize, SliceSize);
+ for (unsigned Part : seq<unsigned>(NumParts)) {
+ ArrayRef<Value *> SubVL =
+ VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
std::optional<TTI::ShuffleKind> SubRes =
isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
@@ -11677,11 +11712,12 @@ public:
// into a long virtual vector register, forming the original vector.
Value *Vec = nullptr;
SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
- unsigned SliceSize = E->Scalars.size() / NumParts;
- for (unsigned Part = 0; Part < NumParts; ++Part) {
+ unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
+ for (unsigned Part : seq<unsigned>(NumParts)) {
+ unsigned Limit = getNumElems(E->Scalars.size(), SliceSize, Part);
ArrayRef<Value *> VL =
- ArrayRef(E->Scalars).slice(Part * SliceSize, SliceSize);
- MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
+ ArrayRef(E->Scalars).slice(Part * SliceSize, Limit);
+ MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
constexpr int MaxBases = 2;
SmallVector<Value *, MaxBases> Bases(MaxBases);
#ifndef NDEBUG
@@ -11718,7 +11754,9 @@ public:
assert((Part == 0 || all_of(seq<unsigned>(0, Part),
[&](unsigned P) {
ArrayRef<int> SubMask =
- Mask.slice(P * SliceSize, SliceSize);
+ Mask.slice(P * SliceSize,
+ getNumElems(Mask.size(),
+ SliceSize, P));
return all_of(SubMask, [](int Idx) {
return Idx == PoisonMaskElem;
});
@@ -12102,13 +12140,19 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
Idx == 0) ||
(Mask.size() == InputVF &&
ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
- std::iota(std::next(Mask.begin(), I * SliceSize),
- std::next(Mask.begin(), (I + 1) * SliceSize), 0);
+ std::iota(
+ std::next(Mask.begin(), I * SliceSize),
+ std::next(Mask.begin(),
+ I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
+ 0);
} else {
unsigned IVal =
*find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
- std::fill(std::next(Mask.begin(), I * SliceSize),
- std::next(Mask.begin(), (I + 1) * SliceSize), IVal);
+ std::fill(
+ std::next(Mask.begin(), I * SliceSize),
+ std::next(Mask.begin(),
+ I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
+ IVal);
}
return true;
};
@@ -12368,7 +12412,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
}
}
if (!GatherShuffles.empty()) {
- unsigned SliceSize = E->Scalars.size() / NumParts;
+ unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
for (const auto [I, TEs] : enumerate(Entries)) {
if (TEs.empty()) {
@@ -12378,7 +12422,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
}
assert((TEs.size() == 1 || TEs.size() == 2) &&
"Expected shuffle of 1 or 2 entries.");
- auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize);
+ unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
+ auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
VecMask.assign(VecMask.size(), PoisonMaskElem);
copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
if (TEs.size() == 1) {
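The getPartNumElems/getNumElems helpers introduced at the top of this file split a non-power-of-2 number of scalars across registers: every part is sized to a power-of-2 lane count and the final part only covers the remainder. A standalone sketch with the LLVM math helpers replaced by plain stand-ins (the sizes are illustrative):

```cpp
// Splitting 6 scalars across 2 registers: parts of 4 and 2 elements.
#include <algorithm>
#include <cassert>

static unsigned divideCeil(unsigned A, unsigned B) { return (A + B - 1) / B; }
static unsigned powerOf2Ceil(unsigned A) {
  unsigned P = 1;
  while (P < A)
    P <<= 1;
  return P;
}
// Mirrors getPartNumElems: power-of-2 lane count of a single register (part).
static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
  return powerOf2Ceil(divideCeil(Size, NumParts));
}
// Mirrors getNumElems: how many of those lanes are actually used by Part.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
                            unsigned Part) {
  return std::min(PartNumElems, Size - Part * PartNumElems);
}

int main() {
  assert(getPartNumElems(6, 2) == 4); // PowerOf2Ceil(ceil(6 / 2)) == 4
  assert(getNumElems(6, 4, 0) == 4);  // first register is full
  assert(getNumElems(6, 4, 1) == 2);  // last register holds the remainder
  return 0;
}
```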
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 27f8e23..d71d758 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -25,6 +25,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
@@ -218,7 +219,7 @@ VPTransformState::VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
DominatorTree *DT, IRBuilderBase &Builder,
InnerLoopVectorizer *ILV, VPlan *Plan,
LLVMContext &Ctx)
- : VF(VF), UF(UF), LI(LI), DT(DT), Builder(Builder), ILV(ILV), Plan(Plan),
+ : VF(VF), UF(UF), CFG(DT), LI(LI), Builder(Builder), ILV(ILV), Plan(Plan),
LVer(nullptr),
TypeAnalysis(Plan->getCanonicalIV()->getScalarType(), Ctx) {}
@@ -436,6 +437,7 @@ VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
"Trying to reset an existing successor block.");
TermBr->setSuccessor(idx, NewBB);
}
+ CFG.DTU.applyUpdates({{DominatorTree::Insert, PredBB, NewBB}});
}
return NewBB;
}
@@ -467,6 +469,7 @@ void VPBasicBlock::execute(VPTransformState *State) {
// The Exit block of a loop is always set to be successor 0 of the Exiting
// block.
cast<BranchInst>(ExitingBB->getTerminator())->setSuccessor(0, NewBB);
+ State->CFG.DTU.applyUpdates({{DominatorTree::Insert, ExitingBB, NewBB}});
} else if (PrevVPBB && /* A */
!((SingleHPred = getSingleHierarchicalPredecessor()) &&
SingleHPred->getExitingBasicBlock() == PrevVPBB &&
@@ -829,6 +832,11 @@ void VPlan::execute(VPTransformState *State) {
BasicBlock *VectorPreHeader = State->CFG.PrevBB;
State->Builder.SetInsertPoint(VectorPreHeader->getTerminator());
+ // Disconnect VectorPreHeader from ExitBB in both the CFG and DT.
+ cast<BranchInst>(VectorPreHeader->getTerminator())->setSuccessor(0, nullptr);
+ State->CFG.DTU.applyUpdates(
+ {{DominatorTree::Delete, VectorPreHeader, State->CFG.ExitBB}});
+
// Generate code in the loop pre-header and body.
for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
Block->execute(State);
@@ -891,13 +899,10 @@ void VPlan::execute(VPTransformState *State) {
}
}
- // We do not attempt to preserve DT for outer loop vectorization currently.
- if (!EnableVPlanNativePath) {
- BasicBlock *VectorHeaderBB = State->CFG.VPBB2IRBB[Header];
- State->DT->addNewBlock(VectorHeaderBB, VectorPreHeader);
- updateDominatorTree(State->DT, VectorHeaderBB, VectorLatchBB,
- State->CFG.ExitBB);
- }
+ State->CFG.DTU.flush();
+  // DT is currently updated for the non-native path only.
+ assert(EnableVPlanNativePath || State->CFG.DTU.getDomTree().verify(
+ DominatorTree::VerificationLevel::Fast));
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -995,44 +1000,6 @@ void VPlan::addLiveOut(PHINode *PN, VPValue *V) {
LiveOuts.insert({PN, new VPLiveOut(PN, V)});
}
-void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopHeaderBB,
- BasicBlock *LoopLatchBB,
- BasicBlock *LoopExitBB) {
- // The vector body may be more than a single basic-block by this point.
- // Update the dominator tree information inside the vector body by propagating
- // it from header to latch, expecting only triangular control-flow, if any.
- BasicBlock *PostDomSucc = nullptr;
- for (auto *BB = LoopHeaderBB; BB != LoopLatchBB; BB = PostDomSucc) {
- // Get the list of successors of this block.
- std::vector<BasicBlock *> Succs(succ_begin(BB), succ_end(BB));
- assert(Succs.size() <= 2 &&
- "Basic block in vector loop has more than 2 successors.");
- PostDomSucc = Succs[0];
- if (Succs.size() == 1) {
- assert(PostDomSucc->getSinglePredecessor() &&
- "PostDom successor has more than one predecessor.");
- DT->addNewBlock(PostDomSucc, BB);
- continue;
- }
- BasicBlock *InterimSucc = Succs[1];
- if (PostDomSucc->getSingleSuccessor() == InterimSucc) {
- PostDomSucc = Succs[1];
- InterimSucc = Succs[0];
- }
- assert(InterimSucc->getSingleSuccessor() == PostDomSucc &&
- "One successor of a basic block does not lead to the other.");
- assert(InterimSucc->getSinglePredecessor() &&
- "Interim successor has more than one predecessor.");
- assert(PostDomSucc->hasNPredecessors(2) &&
- "PostDom successor has more than two predecessors.");
- DT->addNewBlock(InterimSucc, BB);
- DT->addNewBlock(PostDomSucc, BB);
- }
- // Latch block is a new dominator for the loop exit.
- DT->changeImmediateDominator(LoopExitBB, LoopLatchBB);
- assert(DT->verify(DominatorTree::VerificationLevel::Fast));
-}
-
static void remapOperands(VPBlockBase *Entry, VPBlockBase *NewEntry,
DenseMap<VPValue *, VPValue *> &Old2NewVPValues) {
// Update the operands of all cloned recipes starting at NewEntry. This
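The VPlan change above replaces the hand-rolled updateDominatorTree walk with a lazy DomTreeUpdater: edge insertions and deletions are queued as the CFG is rewritten and materialized by a single flush() at the end of execute(). A rough, self-contained sketch of that pattern on a hand-built two-block function (the blocks and edits are illustrative, not taken from VPlan):

```cpp
// Queue dominator-tree edits lazily while rewriting the CFG, then flush once.
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include <cassert>

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("demo", Ctx);
  FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), false);
  Function *F = Function::Create(FTy, Function::ExternalLinkage, "f", M);

  BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", F);
  BasicBlock *Exit = BasicBlock::Create(Ctx, "exit", F);
  IRBuilder<> B(Entry);
  B.CreateBr(Exit);
  B.SetInsertPoint(Exit);
  B.CreateRetVoid();

  DominatorTree DT(*F);
  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);

  // Splice a new block between entry and exit, then record the CFG edits.
  BasicBlock *Mid = BasicBlock::Create(Ctx, "mid", F, Exit);
  Entry->getTerminator()->setSuccessor(0, Mid);
  IRBuilder<> MB(Mid);
  MB.CreateBr(Exit);
  DTU.applyUpdates({{DominatorTree::Delete, Entry, Exit},
                    {DominatorTree::Insert, Entry, Mid},
                    {DominatorTree::Insert, Mid, Exit}});

  // Pending updates are only materialized here, mirroring the flush() call
  // at the end of VPlan::execute in the hunk above.
  DTU.flush();
  assert(DTU.getDomTree().verify());
  return 0;
}
```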
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 4b3cb15..3aee179 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -35,6 +35,7 @@
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/ilist.h"
#include "llvm/ADT/ilist_node.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/VectorUtils.h"
@@ -372,7 +373,11 @@ struct VPTransformState {
/// of replication, maps the BasicBlock of the last replica created.
SmallDenseMap<VPBasicBlock *, BasicBlock *> VPBB2IRBB;
- CFGState() = default;
+ /// Updater for the DominatorTree.
+ DomTreeUpdater DTU;
+
+ CFGState(DominatorTree *DT)
+ : DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy) {}
/// Returns the BasicBlock* mapped to the pre-header of the loop region
/// containing \p R.
@@ -382,9 +387,6 @@ struct VPTransformState {
/// Hold a pointer to LoopInfo to register new basic blocks in the loop.
LoopInfo *LI;
- /// Hold a pointer to Dominator Tree to register new basic blocks in the loop.
- DominatorTree *DT;
-
/// Hold a reference to the IRBuilder used to generate output IR code.
IRBuilderBase &Builder;
@@ -3289,13 +3291,6 @@ public:
/// Clone the current VPlan, update all VPValues of the new VPlan and cloned
/// recipes to refer to the clones, and return it.
VPlan *duplicate();
-
-private:
- /// Add to the given dominator tree the header block and every new basic block
- /// that was created between it and the latch block, inclusive.
- static void updateDominatorTree(DominatorTree *DT, BasicBlock *LoopHeaderBB,
- BasicBlock *LoopLatchBB,
- BasicBlock *LoopExitBB);
};
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 7ff8d8e..422579e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1318,8 +1318,16 @@ void VPlanTransforms::addActiveLaneMask(
/// %NextEVLIV = add IVSize (cast i32 %VPEVVL to IVSize), %EVLPhi
/// ...
///
-void VPlanTransforms::addExplicitVectorLength(VPlan &Plan) {
+bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
+ // The transform updates all users of inductions to work based on EVL, instead
+ // of the VF directly. At the moment, widened inductions cannot be updated, so
+ // bail out if the plan contains any.
+ if (any_of(Header->phis(), [](VPRecipeBase &Phi) {
+ return (isa<VPWidenIntOrFpInductionRecipe>(&Phi) ||
+ isa<VPWidenPointerInductionRecipe>(&Phi));
+ }))
+ return false;
auto *CanonicalIVPHI = Plan.getCanonicalIV();
VPValue *StartV = CanonicalIVPHI->getStartValue();
@@ -1377,6 +1385,7 @@ void VPlanTransforms::addExplicitVectorLength(VPlan &Plan) {
CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
// TODO: support unroll factor > 1.
Plan.setUF(1);
+ return true;
}
void VPlanTransforms::dropPoisonGeneratingRecipes(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 0cbc707..96b8a66 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -104,7 +104,8 @@ struct VPlanTransforms {
/// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe.
/// VPCanonicalIVPHIRecipe is only used to control the loop after
/// this transformation.
- static void addExplicitVectorLength(VPlan &Plan);
+ /// \returns true if the transformation succeeds, or false if it doesn't.
+ static bool tryAddExplicitVectorLength(VPlan &Plan);
};
} // namespace llvm
diff --git a/llvm/test/Analysis/CostModel/AArch64/cast.ll b/llvm/test/Analysis/CostModel/AArch64/cast.ll
index 0cd444f..fa77886 100644
--- a/llvm/test/Analysis/CostModel/AArch64/cast.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cast.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 %s | FileCheck --check-prefixes=CHECK,CHECK-NOFP16 %s
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 -mattr=+sve -force-streaming-compatible-sve %s | FileCheck --check-prefixes=SVE,SVE128-NO-NEON %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 -mattr=+sve -force-streaming-compatible %s | FileCheck --check-prefixes=SVE,SVE128-NO-NEON %s
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 -mattr=+fullfp16 %s | FileCheck --check-prefixes=CHECK,CHECK-FP16 %s
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 -mattr=+sve -aarch64-sve-vector-bits-min=256 %s | FileCheck --check-prefixes=SVE,FIXED-MIN-256 %s
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 -mattr=+sve -aarch64-sve-vector-bits-min=2048 %s | FileCheck --check-prefixes=SVE,FIXED-MIN-2048 %s
diff --git a/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll b/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll
index cc1532e..e1a9ee1 100644
--- a/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll
@@ -13,15 +13,15 @@ define void @foo_no_vscale_range() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res.i64.v32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v32i1(<32 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res.i32.v32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
@@ -33,15 +33,15 @@ define void @foo_no_vscale_range() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res.i64.v32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v32i1(<32 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res.i32.v32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
index be5cca0..a181567 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
@@ -7,603 +7,1140 @@
; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji -cost-kind=code-size -S | FileCheck -check-prefixes=ALL-SIZE,VI-SIZE %s
; END.
-define amdgpu_kernel void @shufflevector_i16() {
+define amdgpu_kernel void @shufflevector_i16(<2 x i16> %vec1, <2 x i16> %vec2) {
; GFX9-10-LABEL: 'shufflevector_i16'
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf00 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 0, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf10 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf11 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf02 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 0, i32 2>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf22 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 2>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 0, i32 3>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf33 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 3>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf12 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 2>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf21 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf13 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 3>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf31 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 3>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf32 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 2>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf000 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> zeroinitializer
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf001 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 0, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf010 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 1, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf011 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 1, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf100 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 0, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf101 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 0, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf110 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 1, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf111 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 1, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf002 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 0, i32 2>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf020 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 2, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf022 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 2, i32 2>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf200 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 0, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf202 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 0, i32 2>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf220 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 2, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf222 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 2, i32 2>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf112 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 1, i32 2>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf121 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 2, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf122 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 2, i32 2>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf211 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 1, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf212 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 1, i32 2>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf221 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 2, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf00 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> zeroinitializer
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf10 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf11 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf02 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf22 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 3>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf33 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 3>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf12 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf21 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf13 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 3>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf31 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 3>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf32 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf000 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> zeroinitializer
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf001 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf010 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf011 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 1, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf100 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 0, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf101 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf110 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf111 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf002 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 0, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf020 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 2, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf022 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 2, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf200 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 0, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf202 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 0, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf220 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf222 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf112 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf121 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 2, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf122 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 2, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf211 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 1, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf212 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 1, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf221 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf00_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> zeroinitializer
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf10_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf11_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf02_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf22_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 3>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf33_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 3>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf12_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf21_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf13_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 3>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf31_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 3>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf32_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf000_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> zeroinitializer
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf001_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf010_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf011_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 1, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf100_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 0, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf101_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf110_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf111_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 1, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf002_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 0, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf020_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 2, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf022_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 2, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf200_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 0, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf202_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 0, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf220_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 2, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf222_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 2, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf112_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 1, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf121_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 2, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf122_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 2, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf211_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 1, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf212_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 1, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf221_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 2, i32 1>
; GFX9-10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; VI-LABEL: 'shufflevector_i16'
-; VI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf00 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer
-; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 0, i32 1>
-; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf10 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf11 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf02 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 0, i32 2>
-; VI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf22 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 2>
-; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 0, i32 3>
-; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf33 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 3>
-; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf12 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 2>
-; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf21 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 1>
-; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf13 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 3>
-; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf31 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 1>
-; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 3>
-; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf32 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 2>
-; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf000 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> zeroinitializer
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf001 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 0, i32 1>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf010 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 1, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf011 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 1, i32 1>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf100 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 0, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf101 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 0, i32 1>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf110 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 1, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf111 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 1, i32 1>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf002 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 0, i32 2>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf020 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 2, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf022 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 2, i32 2>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf200 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 0, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf202 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 0, i32 2>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf220 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 2, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf222 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 2, i32 2>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf112 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 1, i32 2>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf121 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 2, i32 1>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf122 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 2, i32 2>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf211 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 1, i32 1>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf212 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 1, i32 2>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf221 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 2, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf00 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> zeroinitializer
+; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf10 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf11 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf02 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf22 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 3>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf33 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 3>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf12 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf21 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf13 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 3>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf31 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 3>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf32 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf000 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> zeroinitializer
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf001 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf010 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf011 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 1, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf100 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 0, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf101 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf110 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf111 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf002 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 0, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf020 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 2, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf022 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 2, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf200 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 0, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf202 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 0, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf220 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf222 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf112 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf121 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 2, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf122 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 2, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf211 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 1, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf212 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 1, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf221 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf00_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> zeroinitializer
+; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf10_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf11_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf02_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf22_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 3>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf33_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 3>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf12_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf21_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf13_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 3>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf31_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 3>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf32_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf000_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> zeroinitializer
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf001_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf010_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf011_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 1, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf100_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 0, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf101_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf110_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf111_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 1, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf002_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 0, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf020_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 2, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf022_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 2, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf200_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 0, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf202_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 0, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf220_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 2, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf222_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 2, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf112_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 1, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf121_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 2, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf122_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 2, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf211_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 1, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf212_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 1, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf221_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 2, i32 1>
; VI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; GFX9-10-SIZE-LABEL: 'shufflevector_i16'
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf00 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 0, i32 1>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf10 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf11 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf02 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 0, i32 2>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf22 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 2>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 0, i32 3>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf33 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 3>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf12 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 2>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf21 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 1>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf13 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 3>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf31 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 1>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 3>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf32 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 2>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf000 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> zeroinitializer
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf001 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 0, i32 1>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf010 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 1, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf011 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 1, i32 1>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf100 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 0, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf101 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 0, i32 1>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf110 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 1, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf111 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 1, i32 1>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf002 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 0, i32 2>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf020 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 2, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf022 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 2, i32 2>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf200 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 0, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf202 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 0, i32 2>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf220 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 2, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf222 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 2, i32 2>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf112 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 1, i32 2>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf121 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 2, i32 1>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf122 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 2, i32 2>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf211 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 1, i32 1>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf212 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 1, i32 2>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf221 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 2, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf00 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> zeroinitializer
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf10 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf11 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf02 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf22 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 3>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf33 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 3>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf12 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf21 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf13 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 3>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf31 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 3>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf32 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf000 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> zeroinitializer
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf001 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf010 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf011 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 1, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf100 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 0, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf101 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf110 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf111 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf002 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 0, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf020 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf022 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 2, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf200 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 0, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf202 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 0, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf220 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf222 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf112 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf121 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 2, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf122 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 2, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf211 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 1, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf212 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 1, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf221 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf00_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> zeroinitializer
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf10_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf11_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf02_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf22_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 3>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf33_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 3>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf12_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf21_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf13_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 3>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf31_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 3>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf32_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf000_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> zeroinitializer
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf001_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf010_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf011_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 1, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf100_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 0, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf101_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf110_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf111_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 1, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf002_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 0, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf020_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf022_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 2, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf200_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 0, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf202_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 0, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf220_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf222_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 2, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf112_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 1, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf121_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 2, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf122_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 2, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf211_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 1, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf212_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 1, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf221_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 2, i32 1>
; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; VI-SIZE-LABEL: 'shufflevector_i16'
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf00 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 0, i32 1>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf10 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf11 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf02 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 0, i32 2>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf22 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 2>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 0, i32 3>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf33 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 3>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf12 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 2>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf21 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 1>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf13 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 3>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf31 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 1>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 3>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf32 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 2>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf000 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> zeroinitializer
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf001 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 0, i32 1>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf010 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 1, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf011 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 1, i32 1>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf100 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 0, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf101 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 0, i32 1>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf110 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 1, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf111 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 1, i32 1>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf002 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 0, i32 2>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf020 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 2, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf022 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 2, i32 2>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf200 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 0, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf202 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 0, i32 2>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf220 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 2, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf222 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 2, i32 2>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf112 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 1, i32 2>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf121 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 2, i32 1>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf122 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 2, i32 2>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf211 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 1, i32 1>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf212 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 1, i32 2>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf221 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 2, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf00 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> zeroinitializer
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf10 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf11 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf02 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf22 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 3>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf33 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 3>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf12 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf21 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf13 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 3>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf31 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 3>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf32 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf000 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> zeroinitializer
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf001 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf010 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf011 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 1, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf100 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 0, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf101 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf110 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf111 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf002 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 0, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf020 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 2, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf022 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 2, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf200 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 0, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf202 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 0, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf220 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf222 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf112 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf121 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 2, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf122 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 2, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf211 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 1, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf212 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 1, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf221 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf00_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> zeroinitializer
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf10_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf11_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf02_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf22_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 3>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf33_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 3>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf12_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf21_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf13_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 3>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf31_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 3>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf32_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf000_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> zeroinitializer
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf001_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf010_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf011_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 1, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf100_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 0, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf101_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf110_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf111_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 1, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf002_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 0, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf020_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 2, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf022_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 2, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf200_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 0, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf202_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 0, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf220_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 2, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf222_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 2, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf112_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 1, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf121_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 2, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf122_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 2, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf211_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 1, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf212_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 1, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf221_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 2, i32 1>
; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
- %shuf00 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer
- %shuf01 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 0, i32 1>
- %shuf10 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
- %shuf11 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
- %shuf02 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 0, i32 2>
- %shuf20 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 0>
- %shuf22 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 2>
- %shuf03 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 0, i32 3>
- %shuf30 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 0>
- %shuf33 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 3>
- %shuf12 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 2>
- %shuf21 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 1>
- %shuf13 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 3>
- %shuf31 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 1>
- %shuf23 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 3>
- %shuf32 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 2>
- %shuf000 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 0, i32 0>
- %shuf001 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 0, i32 1>
- %shuf010 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 1, i32 0>
- %shuf011 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 1, i32 1>
- %shuf100 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 0, i32 0>
- %shuf101 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 0, i32 1>
- %shuf110 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 1, i32 0>
- %shuf111 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 1, i32 1>
- %shuf002 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 0, i32 2>
- %shuf020 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 2, i32 0>
- %shuf022 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 2, i32 2>
- %shuf200 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 0, i32 0>
- %shuf202 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 0, i32 2>
- %shuf220 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 2, i32 0>
- %shuf222 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 2, i32 2>
- %shuf112 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 1, i32 2>
- %shuf121 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 2, i32 1>
- %shuf122 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 2, i32 2>
- %shuf211 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 1, i32 1>
- %shuf212 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 1, i32 2>
- %shuf221 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 2, i32 1>
+ %shuf00 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> zeroinitializer
+ %shuf01 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 1>
+ %shuf10 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 0>
+ %shuf11 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 1>
+ %shuf02 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 2>
+ %shuf20 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 0>
+ %shuf22 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 2>
+ %shuf03 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 3>
+ %shuf30 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 0>
+ %shuf33 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 3>
+ %shuf12 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 2>
+ %shuf21 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 1>
+ %shuf13 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 3>
+ %shuf31 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 1>
+ %shuf23 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 3>
+ %shuf32 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 2>
+ %shuf000 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 0, i32 0>
+ %shuf001 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 0, i32 1>
+ %shuf010 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 1, i32 0>
+ %shuf011 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 1, i32 1>
+ %shuf100 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 0, i32 0>
+ %shuf101 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 0, i32 1>
+ %shuf110 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 0>
+ %shuf111 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 1>
+ %shuf002 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 0, i32 2>
+ %shuf020 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 2, i32 0>
+ %shuf022 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 2, i32 2>
+ %shuf200 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 0, i32 0>
+ %shuf202 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 0, i32 2>
+ %shuf220 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 0>
+ %shuf222 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 2>
+ %shuf112 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 2>
+ %shuf121 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 2, i32 1>
+ %shuf122 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 2, i32 2>
+ %shuf211 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 1, i32 1>
+ %shuf212 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 1, i32 2>
+ %shuf221 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 1>
+ %shuf00_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> zeroinitializer
+ %shuf01_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 1>
+ %shuf10_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 0>
+ %shuf11_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 1>
+ %shuf02_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 2>
+ %shuf20_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 0>
+ %shuf22_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 2>
+ %shuf03_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 3>
+ %shuf30_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 0>
+ %shuf33_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 3>
+ %shuf12_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 2>
+ %shuf21_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 1>
+ %shuf13_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 3>
+ %shuf31_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 1>
+ %shuf23_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 3>
+ %shuf32_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 2>
+ %shuf000_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 0, i32 0>
+ %shuf001_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 0, i32 1>
+ %shuf010_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 1, i32 0>
+ %shuf011_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 1, i32 1>
+ %shuf100_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 0, i32 0>
+ %shuf101_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 0, i32 1>
+ %shuf110_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 1, i32 0>
+ %shuf111_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 1, i32 1>
+ %shuf002_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 0, i32 2>
+ %shuf020_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 2, i32 0>
+ %shuf022_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 2, i32 2>
+ %shuf200_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 0, i32 0>
+ %shuf202_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 0, i32 2>
+ %shuf220_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 2, i32 0>
+ %shuf222_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 2, i32 2>
+ %shuf112_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 1, i32 2>
+ %shuf121_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 2, i32 1>
+ %shuf122_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 2, i32 2>
+ %shuf211_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 1, i32 1>
+ %shuf212_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 1, i32 2>
+ %shuf221_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 2, i32 1>
ret void
}
; Should not assert
-define amdgpu_kernel void @shufflevector_i8() {
+define amdgpu_kernel void @shufflevector_i8(<2 x i8> %vec1, <2 x i8> %vec2) {
; ALL-LABEL: 'shufflevector_i8'
-; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf00 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 0, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf10 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf11 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf02 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 0, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf20 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 2, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf22 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 2, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf03 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 0, i32 3>
-; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf30 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf33 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 3>
-; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf12 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf21 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 2, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf13 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 3>
-; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf31 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 2, i32 3>
-; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf32 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf000 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> zeroinitializer
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf001 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 0, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf010 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 1, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf011 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 1, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf100 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 0, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf101 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 0, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf110 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 1, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf111 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 1, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf002 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 0, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf020 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 2, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf022 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 2, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf200 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 0, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf202 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 0, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf220 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 2, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf222 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 2, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf112 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 1, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf121 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 2, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf122 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 2, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf211 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 1, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf212 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 1, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf221 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 2, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf10 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf02 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf20 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf22 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf03 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf30 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf12 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf21 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf13 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf31 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf32 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf000 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> zeroinitializer
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf001 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 0, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf010 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 1, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf011 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 1, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf100 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 0, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf101 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 0, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf110 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf111 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf002 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 0, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf020 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 2, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf022 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 2, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf200 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 0, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf202 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 0, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf220 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf222 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf112 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf121 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 2, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf122 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 2, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf211 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 1, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf212 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 1, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf221 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf00_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> zeroinitializer
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf10_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf02_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf20_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf22_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf03_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf30_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf12_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf21_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf13_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf31_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf32_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf000_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> zeroinitializer
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf001_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 0, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf010_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 1, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf011_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 1, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf100_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 0, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf101_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 0, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf110_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf111_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf002_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 0, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf020_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 2, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf022_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 2, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf200_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 0, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf202_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 0, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf220_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf222_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf112_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf121_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 2, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf122_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 2, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf211_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 1, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf212_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 1, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf221_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 1>
; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; ALL-SIZE-LABEL: 'shufflevector_i8'
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf00 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 0, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf10 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf11 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf02 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 0, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf20 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 2, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf22 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 2, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf03 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 0, i32 3>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf30 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf33 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 3>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf12 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf21 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 2, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf13 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 3>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf31 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 2, i32 3>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf32 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf000 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> zeroinitializer
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf001 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 0, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf010 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 1, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf011 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 1, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf100 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 0, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf101 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 0, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf110 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 1, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf111 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 1, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf002 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 0, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf020 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 2, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf022 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 2, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf200 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 0, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf202 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 0, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf220 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 2, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf222 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 2, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf112 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 1, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf121 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 2, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf122 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 2, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf211 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 1, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf212 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 1, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf221 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 2, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf10 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf02 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf20 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf22 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf03 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf30 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf12 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf21 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf13 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf31 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf32 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf000 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> zeroinitializer
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf001 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 0, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf010 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 1, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf011 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 1, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf100 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 0, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf101 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 0, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf110 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf111 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf002 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 0, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf020 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 2, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf022 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 2, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf200 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 0, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf202 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 0, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf220 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf222 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf112 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf121 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 2, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf122 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 2, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf211 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 1, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf212 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 1, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf221 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf00_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> zeroinitializer
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf10_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf02_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf20_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf22_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf03_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf30_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf12_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf21_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf13_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf31_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf32_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf000_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> zeroinitializer
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf001_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 0, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf010_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 1, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf011_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 1, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf100_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 0, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf101_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 0, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf110_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf111_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf002_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 0, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf020_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 2, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf022_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 2, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf200_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 0, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf202_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 0, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf220_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf222_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf112_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf121_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 2, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf122_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 2, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf211_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 1, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf212_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 1, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf221_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 1>
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
- %shuf00 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer
- %shuf01 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 0, i32 1>
- %shuf10 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
- %shuf11 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
- %shuf02 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 0, i32 2>
- %shuf20 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 2, i32 0>
- %shuf22 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 2, i32 2>
- %shuf03 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 0, i32 3>
- %shuf30 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 0>
- %shuf33 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 3>
- %shuf12 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 2>
- %shuf21 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 2, i32 1>
- %shuf13 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 3>
- %shuf31 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 1>
- %shuf23 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 2, i32 3>
- %shuf32 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 2>
- %shuf000 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 0, i32 0>
- %shuf001 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 0, i32 1>
- %shuf010 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 1, i32 0>
- %shuf011 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 1, i32 1>
- %shuf100 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 0, i32 0>
- %shuf101 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 0, i32 1>
- %shuf110 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 1, i32 0>
- %shuf111 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 1, i32 1>
- %shuf002 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 0, i32 2>
- %shuf020 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 2, i32 0>
- %shuf022 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 2, i32 2>
- %shuf200 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 0, i32 0>
- %shuf202 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 0, i32 2>
- %shuf220 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 2, i32 0>
- %shuf222 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 2, i32 2>
- %shuf112 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 1, i32 2>
- %shuf121 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 2, i32 1>
- %shuf122 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 2, i32 2>
- %shuf211 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 1, i32 1>
- %shuf212 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 1, i32 2>
- %shuf221 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 2, i32 1>
+ %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer
+ %shuf01 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 1>
+ %shuf10 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 0>
+ %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 1>
+ %shuf02 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 2>
+ %shuf20 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 0>
+ %shuf22 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 2>
+ %shuf03 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 3>
+ %shuf30 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 0>
+ %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 3>
+ %shuf12 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 2>
+ %shuf21 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 1>
+ %shuf13 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 3>
+ %shuf31 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 1>
+ %shuf23 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 3>
+ %shuf32 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 2>
+ %shuf000 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 0, i32 0>
+ %shuf001 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 0, i32 1>
+ %shuf010 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 1, i32 0>
+ %shuf011 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 1, i32 1>
+ %shuf100 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 0, i32 0>
+ %shuf101 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 0, i32 1>
+ %shuf110 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 0>
+ %shuf111 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 1>
+ %shuf002 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 0, i32 2>
+ %shuf020 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 2, i32 0>
+ %shuf022 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 2, i32 2>
+ %shuf200 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 0, i32 0>
+ %shuf202 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 0, i32 2>
+ %shuf220 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 0>
+ %shuf222 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 2>
+ %shuf112 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 2>
+ %shuf121 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 2, i32 1>
+ %shuf122 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 2, i32 2>
+ %shuf211 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 1, i32 1>
+ %shuf212 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 1, i32 2>
+ %shuf221 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 1>
+ %shuf00_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> zeroinitializer
+ %shuf01_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 1>
+ %shuf10_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 0>
+ %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 1>
+ %shuf02_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 2>
+ %shuf20_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 0>
+ %shuf22_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 2>
+ %shuf03_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 3>
+ %shuf30_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 0>
+ %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 3>
+ %shuf12_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 2>
+ %shuf21_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 1>
+ %shuf13_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 3>
+ %shuf31_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 1>
+ %shuf23_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 3>
+ %shuf32_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 2>
+ %shuf000_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 0, i32 0>
+ %shuf001_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 0, i32 1>
+ %shuf010_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 1, i32 0>
+ %shuf011_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 1, i32 1>
+ %shuf100_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 0, i32 0>
+ %shuf101_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 0, i32 1>
+ %shuf110_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 0>
+ %shuf111_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 1>
+ %shuf002_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 0, i32 2>
+ %shuf020_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 2, i32 0>
+ %shuf022_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 2, i32 2>
+ %shuf200_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 0, i32 0>
+ %shuf202_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 0, i32 2>
+ %shuf220_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 0>
+ %shuf222_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 2>
+ %shuf112_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 2>
+ %shuf121_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 2, i32 1>
+ %shuf122_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 2, i32 2>
+ %shuf211_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 1, i32 1>
+ %shuf212_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 1, i32 2>
+ %shuf221_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 1>
ret void
}
-define amdgpu_kernel void @shufflevector_i32() {
+define amdgpu_kernel void @shufflevector_i32(<2 x i32> %vec1, <2 x i32> %vec2) {
; ALL-LABEL: 'shufflevector_i32'
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf00 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> zeroinitializer
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 0, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf10 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf11 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf02 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 0, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf20 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 2, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf22 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 2, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf03 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 0, i32 3>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf30 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 3, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf33 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 3, i32 3>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf12 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf21 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 2, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf13 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 3>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf31 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 3, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 2, i32 3>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 3, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf000 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> zeroinitializer
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf001 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 0, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf010 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 1, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf011 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 1, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf100 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 0, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf101 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 0, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf110 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 1, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf111 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 1, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf002 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 0, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf020 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 2, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf022 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 2, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf200 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 0, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf202 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 0, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf220 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 2, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf222 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 2, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf112 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 1, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf121 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 2, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf122 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 2, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf211 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 1, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf212 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 1, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf221 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 2, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf00 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> zeroinitializer
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 0, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf10 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 1, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf11 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 1, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf02 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 0, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf20 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 2, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf22 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 2, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf03 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 0, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf30 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 3, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf33 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 3, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf12 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 1, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf21 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 2, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf13 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 1, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf31 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 3, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf32 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 3, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf000 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> zeroinitializer
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf001 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 0, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf010 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 1, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf011 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 1, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf100 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 0, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf101 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 0, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf110 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 1, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf111 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 1, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf002 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 0, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf020 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 2, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf022 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 2, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf200 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 0, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf202 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 0, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf220 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 2, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf222 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 2, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf112 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 1, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf121 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 2, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf122 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 2, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf211 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 1, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf212 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 1, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf221 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 2, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf00_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> zeroinitializer
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 0, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf10_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 1, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf11_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 1, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf02_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 0, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf20_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 2, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf22_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 2, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf03_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 0, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf30_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 3, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf33_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 3, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf12_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 1, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf21_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 2, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf13_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 1, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf31_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 3, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf32_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 3, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf000_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> zeroinitializer
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf001_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 0, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf010_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 1, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf011_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 1, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf100_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 0, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf101_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 0, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf110_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 1, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf111_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 1, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf002_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 0, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf020_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 2, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf022_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 2, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf200_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 0, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf202_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 0, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf220_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 2, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf222_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 2, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf112_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 1, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf121_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 2, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf122_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 2, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf211_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 1, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf212_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 1, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf221_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 2, i32 1>
; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; ALL-SIZE-LABEL: 'shufflevector_i32'
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf00 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> zeroinitializer
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 0, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf10 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf11 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf02 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 0, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf20 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 2, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf22 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 2, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf03 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 0, i32 3>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf30 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 3, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf33 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 3, i32 3>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf12 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf21 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 2, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf13 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 3>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf31 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 3, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 2, i32 3>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 3, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf000 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> zeroinitializer
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf001 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 0, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf010 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 1, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf011 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 1, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf100 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 0, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf101 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 0, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf110 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 1, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf111 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 1, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf002 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 0, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf020 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 2, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf022 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 2, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf200 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 0, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf202 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 0, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf220 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 2, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf222 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 2, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf112 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 1, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf121 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 2, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf122 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 2, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf211 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 1, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf212 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 1, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf221 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 2, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf00 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> zeroinitializer
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 0, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf10 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 1, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf11 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 1, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf02 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 0, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf20 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 2, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf22 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 2, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf03 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 0, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf30 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 3, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf33 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 3, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf12 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 1, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf21 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 2, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf13 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 1, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf31 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 3, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf32 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 3, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf000 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> zeroinitializer
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf001 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 0, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf010 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 1, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf011 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 1, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf100 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 0, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf101 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 0, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf110 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 1, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf111 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 1, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf002 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 0, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf020 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 2, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf022 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 2, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf200 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 0, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf202 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 0, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf220 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 2, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf222 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 2, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf112 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 1, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf121 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 2, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf122 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 2, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf211 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 1, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf212 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 1, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf221 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 2, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf00_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> zeroinitializer
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 0, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf10_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 1, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf11_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 1, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf02_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 0, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf20_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 2, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf22_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 2, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf03_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 0, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf30_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 3, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf33_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 3, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf12_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 1, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf21_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 2, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf13_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 1, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf31_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 3, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf32_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 3, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf000_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> zeroinitializer
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf001_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 0, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf010_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 1, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf011_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 1, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf100_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 0, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf101_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 0, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf110_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 1, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf111_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 1, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf002_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 0, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf020_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 2, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf022_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 2, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf200_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 0, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf202_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 0, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf220_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 2, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf222_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 2, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf112_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 1, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf121_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 2, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf122_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 2, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf211_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 1, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf212_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 1, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf221_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 2, i32 1>
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
- %shuf00 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> zeroinitializer
- %shuf01 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 0, i32 1>
- %shuf10 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
- %shuf11 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %shuf02 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 0, i32 2>
- %shuf20 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 2, i32 0>
- %shuf22 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 2, i32 2>
- %shuf03 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 0, i32 3>
- %shuf30 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 3, i32 0>
- %shuf33 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 3, i32 3>
- %shuf12 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 2>
- %shuf21 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 2, i32 1>
- %shuf13 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 3>
- %shuf31 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 3, i32 1>
- %shuf23 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 2, i32 3>
- %shuf32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 3, i32 2>
- %shuf000 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 0, i32 0>
- %shuf001 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 0, i32 1>
- %shuf010 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 1, i32 0>
- %shuf011 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 1, i32 1>
- %shuf100 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 0, i32 0>
- %shuf101 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 0, i32 1>
- %shuf110 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 1, i32 0>
- %shuf111 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 1, i32 1>
- %shuf002 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 0, i32 2>
- %shuf020 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 2, i32 0>
- %shuf022 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 2, i32 2>
- %shuf200 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 0, i32 0>
- %shuf202 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 0, i32 2>
- %shuf220 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 2, i32 0>
- %shuf222 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 2, i32 2>
- %shuf112 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 1, i32 2>
- %shuf121 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 2, i32 1>
- %shuf122 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 2, i32 2>
- %shuf211 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 1, i32 1>
- %shuf212 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 1, i32 2>
- %shuf221 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 2, i32 1>
+ %shuf00 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> zeroinitializer
+ %shuf01 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 0, i32 1>
+ %shuf10 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 1, i32 0>
+ %shuf11 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 1, i32 1>
+ %shuf02 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 0, i32 2>
+ %shuf20 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 2, i32 0>
+ %shuf22 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 2, i32 2>
+ %shuf03 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 0, i32 3>
+ %shuf30 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 3, i32 0>
+ %shuf33 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 3, i32 3>
+ %shuf12 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 1, i32 2>
+ %shuf21 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 2, i32 1>
+ %shuf13 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 1, i32 3>
+ %shuf31 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 3, i32 1>
+ %shuf23 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 2, i32 3>
+ %shuf32 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 3, i32 2>
+ %shuf000 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 0, i32 0>
+ %shuf001 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 0, i32 1>
+ %shuf010 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 1, i32 0>
+ %shuf011 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 1, i32 1>
+ %shuf100 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 0, i32 0>
+ %shuf101 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 0, i32 1>
+ %shuf110 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 1, i32 0>
+ %shuf111 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 1, i32 1>
+ %shuf002 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 0, i32 2>
+ %shuf020 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 2, i32 0>
+ %shuf022 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 2, i32 2>
+ %shuf200 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 0, i32 0>
+ %shuf202 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 0, i32 2>
+ %shuf220 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 2, i32 0>
+ %shuf222 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 2, i32 2>
+ %shuf112 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 1, i32 2>
+ %shuf121 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 2, i32 1>
+ %shuf122 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 2, i32 2>
+ %shuf211 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 1, i32 1>
+ %shuf212 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 1, i32 2>
+ %shuf221 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 2, i32 1>
+ %shuf00_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> zeroinitializer
+ %shuf01_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 0, i32 1>
+ %shuf10_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 1, i32 0>
+ %shuf11_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 1, i32 1>
+ %shuf02_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 0, i32 2>
+ %shuf20_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 2, i32 0>
+ %shuf22_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 2, i32 2>
+ %shuf03_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 0, i32 3>
+ %shuf30_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 3, i32 0>
+ %shuf33_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 3, i32 3>
+ %shuf12_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 1, i32 2>
+ %shuf21_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 2, i32 1>
+ %shuf13_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 1, i32 3>
+ %shuf31_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 3, i32 1>
+ %shuf23_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 2, i32 3>
+ %shuf32_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 3, i32 2>
+ %shuf000_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 0, i32 0>
+ %shuf001_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 0, i32 1>
+ %shuf010_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 1, i32 0>
+ %shuf011_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 1, i32 1>
+ %shuf100_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 0, i32 0>
+ %shuf101_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 0, i32 1>
+ %shuf110_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 1, i32 0>
+ %shuf111_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 1, i32 1>
+ %shuf002_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 0, i32 2>
+ %shuf020_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 2, i32 0>
+ %shuf022_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 2, i32 2>
+ %shuf200_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 0, i32 0>
+ %shuf202_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 0, i32 2>
+ %shuf220_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 2, i32 0>
+ %shuf222_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 2, i32 2>
+ %shuf112_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 1, i32 2>
+ %shuf121_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 2, i32 1>
+ %shuf122_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 2, i32 2>
+ %shuf211_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 1, i32 1>
+ %shuf212_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 1, i32 2>
+ %shuf221_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 2, i32 1>
ret void
}
; Other shuffle cases
-define void @shuffle() {
+define void @shuffle(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> %i8v4_2, <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i8> %i8v8, <8 x i8> %i8v8_2, <16 x i8> %i8v16, <16 x i8> %i8v16_2, <2 x i16> %i16v2, <2 x i16> %i16v2_2, <4 x i16> %i16v4, <4 x i16> %i16v4_2, <8 x i16> %i16v8, <8 x i16> %i16v8_2, <2 x i32> %i32v2, <2 x i32> %i32v2_2, <4 x i32> %i32v4, <4 x i32> %i32v4_2, <2 x float> %floatv2, <2 x float> %floatv2_2, <4 x float> %floatv4, <4 x float> %floatv4_2, <2 x i64> %i64v2, <2 x i64> %i64v2_2, <2 x double> %doublev2, <2 x double> %doublev2_2) {
; GFX9-10-LABEL: 'shuffle'
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8 = shufflevector <2 x i8> undef, <2 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8 = shufflevector <6 x i8> undef, <6 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i16_2 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4i16_4 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v8i16_8 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_4 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32_2 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_4 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i64_2 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64_2 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i16_2_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_8 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_8_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2_2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_4 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_4_2 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32_2 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32_2_2 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2_2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_4 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_4_2 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i64_2 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i64_2_2 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2_2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64_2 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64_2_2 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2_2, <2 x i32> <i32 1, i32 0>
; GFX9-10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; VI-LABEL: 'shuffle'
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8 = shufflevector <2 x i8> undef, <2 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8 = shufflevector <6 x i8> undef, <6 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_2 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4i16_4 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v8i16_8 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_4 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32_2 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_4 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i64_2 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64_2 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> <i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> <i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <2 x i32> <i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_2_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <2 x i32> <i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_8 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_8_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2, <2 x i32> <i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2_2, <2 x i32> <i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_4 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_4_2 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32_2 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2, <2 x i32> <i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32_2_2 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2_2, <2 x i32> <i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_4 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_4_2 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i64_2 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2, <2 x i32> <i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i64_2_2 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2_2, <2 x i32> <i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64_2 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2, <2 x i32> <i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64_2_2 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2_2, <2 x i32> <i32 1, i32 0>
; VI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; GFX9-10-SIZE-LABEL: 'shuffle'
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8 = shufflevector <2 x i8> undef, <2 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8 = shufflevector <6 x i8> undef, <6 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i16_2 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4i16_4 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v8i16_8 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_4 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32_2 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_4 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i64_2 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64_2 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i16_2_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_8 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_8_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2_2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_4 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_4_2 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32_2 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32_2_2 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2_2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_4 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_4_2 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i64_2 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i64_2_2 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2_2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64_2 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64_2_2 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2_2, <2 x i32> <i32 1, i32 0>
; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; VI-SIZE-LABEL: 'shuffle'
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8 = shufflevector <2 x i8> undef, <2 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8 = shufflevector <6 x i8> undef, <6 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_2 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4i16_4 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v8i16_8 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_4 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32_2 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_4 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i64_2 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64_2 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> <i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> <i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <2 x i32> <i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_2_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <2 x i32> <i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_8 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_8_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2, <2 x i32> <i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2_2, <2 x i32> <i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_4 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_4_2 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32_2 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2, <2 x i32> <i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32_2_2 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2_2, <2 x i32> <i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_4 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_4_2 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i64_2 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2, <2 x i32> <i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i64_2_2 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2_2, <2 x i32> <i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64_2 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2, <2 x i32> <i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64_2_2 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2_2, <2 x i32> <i32 1, i32 0>
; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
- %v2i8_2 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
- %v2i8_4 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
- %v4i8_4 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
- %v2i8_8 = shufflevector <2 x i8> undef, <2 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
- %v4i8_8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
- %v6i8_8 = shufflevector <6 x i8> undef, <6 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
- %v8i8_8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
- %v16i8_16 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
- %v2i16_2 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
- %v4i16_4 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
- %v8i16_8 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
- %v2i32_2 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
- %v4i32_4 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
- %v2f32_2 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
- %v4f32_4 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
- %v2i64_2 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
- %v2f64_2 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+ %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> <i32 1, i32 0>
+ %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> <i32 1, i32 0>
+ %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+ %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+ %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+ %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+ %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+ %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+ %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+ %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+ %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+ %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+ %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+ %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+ %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+ %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+ %v2i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <2 x i32> <i32 1, i32 0>
+ %v2i16_2_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <2 x i32> <i32 1, i32 0>
+ %v4i16_4 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+ %v4i16_4_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+ %v8i16_8 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+ %v8i16_8_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+ %v2i32_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2, <2 x i32> <i32 1, i32 0>
+ %v2i32_2_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2_2, <2 x i32> <i32 1, i32 0>
+ %v4i32_4 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+ %v4i32_4_2 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+ %v2f32_2 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2, <2 x i32> <i32 1, i32 0>
+ %v2f32_2_2 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2_2, <2 x i32> <i32 1, i32 0>
+ %v4f32_4 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+ %v4f32_4_2 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+ %v2i64_2 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2, <2 x i32> <i32 1, i32 0>
+ %v2i64_2_2 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2_2, <2 x i32> <i32 1, i32 0>
+ %v2f64_2 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2, <2 x i32> <i32 1, i32 0>
+ %v2f64_2_2 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2_2, <2 x i32> <i32 1, i32 0>
ret void
}
-define void @concat() {
+define void @concat(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i8> %i8v8, <8 x i8> %i8v8_2, <2 x half> %halfv2, <2 x half> %halfv2_2, <4 x half> %halfv4, <4 x half> %halfv4_2, <8 x half> %halfv8, <8 x half> %halfv8_2, <2 x i16> %i16v2, <2 x i16> %i16v2_2, <4 x i16> %i16v4, <4 x i16> %i16v4_2, <8 x i16> %i16v8, <8 x i16> %i16v8_2, <2 x i32> %i32v2, <2 x i32> %i32v2_2, <4 x i32> %i32v4, <4 x i32> %i32v4_2, <2 x float> %floatv2, <2 x float> %floatv2_2, <4 x float> %floatv4, <4 x float> %floatv4_2, <2 x i64> %i64v2, <2 x i64> %i64v2_2, <2 x double> %doublev2, <2 x double> %doublev2_2) {
; ALL-LABEL: 'concat'
-; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; ALL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = shufflevector <2 x half> undef, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8f16 = shufflevector <4 x half> undef, <4 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; ALL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16f16 = shufflevector <8 x half> undef, <8 x half> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = shufflevector <2 x float> undef, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = shufflevector <4 x float> undef, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = shufflevector <2 x double> undef, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i16 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i16 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i32 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i64 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = shufflevector <2 x half> %halfv2, <2 x half> %halfv2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f16 = shufflevector <4 x half> %halfv4, <4 x half> %halfv4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = shufflevector <8 x half> %halfv8, <8 x half> %halfv8, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i16_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i16_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i32_2 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i64_2 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f16_2 = shufflevector <2 x half> %halfv2, <2 x half> %halfv2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f16_2 = shufflevector <4 x half> %halfv4, <4 x half> %halfv4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f16_2 = shufflevector <8 x half> %halfv8, <8 x half> %halfv8_2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_2 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32_2 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64_2 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; ALL-SIZE-LABEL: 'concat'
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = shufflevector <2 x half> undef, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8f16 = shufflevector <4 x half> undef, <4 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16f16 = shufflevector <8 x half> undef, <8 x half> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = shufflevector <2 x float> undef, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = shufflevector <4 x float> undef, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = shufflevector <2 x double> undef, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i16 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i16 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i32 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i64 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = shufflevector <2 x half> %halfv2, <2 x half> %halfv2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f16 = shufflevector <4 x half> %halfv4, <4 x half> %halfv4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = shufflevector <8 x half> %halfv8, <8 x half> %halfv8, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i16_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i16_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i32_2 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i64_2 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f16_2 = shufflevector <2 x half> %halfv2, <2 x half> %halfv2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f16_2 = shufflevector <4 x half> %halfv4, <4 x half> %halfv4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f16_2 = shufflevector <8 x half> %halfv8, <8 x half> %halfv8_2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_2 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32_2 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64_2 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
- %v4i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %v4i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %v4i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %v4i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %v4f16 = shufflevector <2 x half> undef, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %v8f16 = shufflevector <4 x half> undef, <4 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %v16f16 = shufflevector <8 x half> undef, <8 x half> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %v4f32 = shufflevector <2 x float> undef, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %v8f32 = shufflevector <4 x float> undef, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %v4f64 = shufflevector <2 x double> undef, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v4i8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v8i8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v16i8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %v4i16 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v8i16 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v16i16 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %v4i32 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v8i32 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v4i64 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v4f16 = shufflevector <2 x half> %halfv2, <2 x half> %halfv2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v8f16 = shufflevector <4 x half> %halfv4, <4 x half> %halfv4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v16f16 = shufflevector <8 x half> %halfv8, <8 x half> %halfv8, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %v4f32 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v8f32 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v4f64 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v4i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v8i8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v16i8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %v4i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v8i16_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v16i16_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %v4i32_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v8i32_2 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v4i64_2 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v4f16_2 = shufflevector <2 x half> %halfv2, <2 x half> %halfv2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v8f16_2 = shufflevector <4 x half> %halfv4, <4 x half> %halfv4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v16f16_2 = shufflevector <8 x half> %halfv8, <8 x half> %halfv8_2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %v4f32_2 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v8f32_2 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v4f64_2 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret void
}
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll
index f333bc3..809b15b 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4
; RUN: opt -S -disable-output -passes='print<access-info>' < %s 2>&1 | FileCheck %s
@@ -7,7 +8,8 @@
%int_pair = type { i32, i32 }
-; CHECK-LABEL: function 'backdep_type_size_equivalence':
+define void @backdep_type_size_equivalence(ptr nocapture %vec, i64 %n) {
+; CHECK-LABEL: 'backdep_type_size_equivalence'
; CHECK-NEXT: loop:
; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 3200 bits
; CHECK-NEXT: Dependences:
@@ -23,10 +25,15 @@
; CHECK-NEXT: store float %val, ptr %gep.iv.min.100, align 8 ->
; CHECK-NEXT: store i32 %indvars.iv.i32, ptr %gep.iv, align 8
; CHECK-EMPTY:
-; CHECK-NEXT: Run-time memory checks:
-; CHECK-NEXT: Grouped accesses:
-
-define void @backdep_type_size_equivalence(ptr nocapture %vec, i64 %n) {
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-NEXT: {(4 + (8 * %n) + %vec),+,8}<%loop> Added Flags: <nusw>
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
br label %loop
@@ -72,20 +79,25 @@ exit:
; different store size than the i32 type, even though their alloc sizes are
; equivalent. This is a negative test to ensure that they are not analyzed as
; in the tests above.
-;
-; CHECK-LABEL: function 'backdep_type_store_size_equivalence':
+
+define void @backdep_type_store_size_equivalence(ptr nocapture %vec, i64 %n) {
+; CHECK-LABEL: 'backdep_type_store_size_equivalence'
; CHECK-NEXT: loop:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop.
-; CHECK-NEXT: Unknown data dependence.
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Unknown data dependence.
; CHECK-NEXT: Dependences:
; CHECK-NEXT: Unknown:
; CHECK-NEXT: %ld.f32 = load float, ptr %gep.iv, align 8 ->
; CHECK-NEXT: store i19 %indvars.iv.i19, ptr %gep.iv, align 8
; CHECK-EMPTY:
-; CHECK-NEXT: Run-time memory checks:
-; CHECK-NEXT: Grouped accesses:
-
-define void @backdep_type_store_size_equivalence(ptr nocapture %vec, i64 %n) {
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
br label %loop
@@ -114,10 +126,11 @@ exit:
; are done as i64 and i32 types. This is a negative test to ensure that they
; are not analyzed as in the tests above.
-; CHECK-LABEL: function 'neg_dist_dep_type_size_equivalence':
+define void @neg_dist_dep_type_size_equivalence(ptr nocapture %vec, i64 %n) {
+; CHECK-LABEL: 'neg_dist_dep_type_size_equivalence'
; CHECK-NEXT: loop:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop.
-; CHECK-NEXT: Backward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Backward loop carried data dependence that prevents store-to-load forwarding.
; CHECK-NEXT: Dependences:
; CHECK-NEXT: BackwardVectorizableButPreventsForwarding:
; CHECK-NEXT: %ld.f64 = load double, ptr %gep.iv, align 8 ->
@@ -129,12 +142,17 @@ exit:
; CHECK-EMPTY:
; CHECK-NEXT: Unknown:
; CHECK-NEXT: store double %val, ptr %gep.iv.101.i64, align 8 ->
-; CHECK-NEXT: store i32 %ld.i64.i32, ptr %gep.iv.n.i64, align 8
+; CHECK-NEXT: store i32 %ld.i64.i32, ptr %gep.iv.n.i64, align 8
; CHECK-EMPTY:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: Grouped accesses:
-
-define void @neg_dist_dep_type_size_equivalence(ptr nocapture %vec, i64 %n) {
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-NEXT: {((8 * %n) + %vec),+,8}<%loop> Added Flags: <nusw>
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
br label %loop
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll b/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll
index 42d87ed..f1ae1a8 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4
; RUN: opt -passes='print<access-info>' -disable-output < %s 2>&1 | FileCheck %s
; Check that loop-independent forward dependences are discovered properly.
@@ -21,17 +22,31 @@
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
define void @f(ptr noalias %A, ptr noalias %B, ptr noalias %C, i64 %N) {
-
-; CHECK: Dependences:
-; CHECK-NEXT: Forward:
-; CHECK-NEXT: store i32 %b_p1, ptr %Aidx, align 4 ->
-; CHECK-NEXT: %a = load i32, ptr %Aidx, align 4
-; CHECK: ForwardButPreventsForwarding:
-; CHECK-NEXT: store i32 %b_p2, ptr %Aidx_next, align 4 ->
-; CHECK-NEXT: %a = load i32, ptr %Aidx, align 4
-; CHECK: Forward:
-; CHECK-NEXT: store i32 %b_p2, ptr %Aidx_next, align 4 ->
-; CHECK-NEXT: store i32 %b_p1, ptr %Aidx, align 4
+; CHECK-LABEL: 'f'
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Forward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Forward:
+; CHECK-NEXT: store i32 %b_p1, ptr %Aidx, align 4 ->
+; CHECK-NEXT: %a = load i32, ptr %Aidx, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: ForwardButPreventsForwarding:
+; CHECK-NEXT: store i32 %b_p2, ptr %Aidx_next, align 4 ->
+; CHECK-NEXT: %a = load i32, ptr %Aidx, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Forward:
+; CHECK-NEXT: store i32 %b_p2, ptr %Aidx_next, align 4 ->
+; CHECK-NEXT: store i32 %b_p1, ptr %Aidx, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/pr64637.ll b/llvm/test/Analysis/LoopAccessAnalysis/pr64637.ll
index 4d4d2bf..d3e589c 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/pr64637.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/pr64637.ll
@@ -1,4 +1,5 @@
-; RUN: opt -S -passes='print<access-info>' -pass-remarks-analysis=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ANALYSIS
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes='print<access-info>' -pass-remarks-analysis=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s
; Test that LoopVectorize doesn't report 'Use #pragma loop distribute(enable) to allow loop distribution'
; when #pragma clang loop distribute(enable) has already been added.
@@ -17,8 +18,31 @@
; }
define void @foo(ptr noalias nocapture noundef %y, ptr noalias nocapture noundef readnone %x, ptr noalias nocapture noundef readonly %indices, i32 noundef %n) {
-; ANALYSIS: Report: unsafe dependent memory operations in loop.
-; ANALYSIS: Backward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-LABEL: 'foo'
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop.
+; CHECK-NEXT: Backward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: BackwardVectorizableButPreventsForwarding:
+; CHECK-NEXT: %1 = load i32, ptr %arrayidx, align 4 ->
+; CHECK-NEXT: store i32 %add8, ptr %arrayidx12, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: BackwardVectorizable:
+; CHECK-NEXT: store i32 %add1, ptr %arrayidx, align 4 ->
+; CHECK-NEXT: store i32 %add8, ptr %arrayidx12, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Forward:
+; CHECK-NEXT: %1 = load i32, ptr %arrayidx, align 4 ->
+; CHECK-NEXT: store i32 %add1, ptr %arrayidx, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
%cmp22 = icmp sgt i32 %n, 0
br i1 %cmp22, label %for.body.preheader, label %for.cond.cleanup
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/select-dependence.ll b/llvm/test/Analysis/LoopAccessAnalysis/select-dependence.ll
index 07e32f4..60fe8b4 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/select-dependence.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/select-dependence.ll
@@ -1,30 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
; RUN: opt -passes='print<access-info>' -disable-output 2>&1 < %s | FileCheck %s
-; CHECK: Dependences:
-; CHECK-NEXT: Unknown:
-; CHECK-NEXT: %t63 = load double, ptr %t62, align 8 ->
-; CHECK-NEXT: store double %t63, ptr %t64, align 8
-
-define i32 @test() {
- %a1 = alloca [128 x double], align 8
- %a2 = alloca [128 x double], align 8
- %a3 = alloca [128 x double], align 8
- %t30 = getelementptr double, ptr %a2, i64 -32
+define void @test(ptr noalias %x, ptr noalias %y, ptr noalias %z) {
+; CHECK-LABEL: 'test'
+; CHECK-NEXT: loop:
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Unknown data dependence.
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Unknown:
+; CHECK-NEXT: %load = load double, ptr %gep.sel, align 8 ->
+; CHECK-NEXT: store double %load, ptr %gep.sel2, align 8
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
+entry:
+ %gep.y = getelementptr double, ptr %y, i64 -32
br label %loop
loop:
- %t58 = phi i64 [ %t65, %loop ], [ 0, %0 ]
- %t59 = icmp ule i64 %t58, 32
- %t60 = select i1 %t59, ptr %a1, ptr %t30
- %t62 = getelementptr inbounds double, ptr %t60, i64 %t58
- %t63 = load double, ptr %t62, align 8
- %t61 = select i1 %t59, ptr %a2, ptr %a3
- %t64 = getelementptr inbounds double, ptr %t61, i64 %t58
- store double %t63, ptr %t64, align 8
- %t65 = add nuw nsw i64 %t58, 1
- %t66 = icmp eq i64 %t65, 94
- br i1 %t66, label %exit, label %loop
+ %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+ %icmp = icmp ule i64 %iv, 32
+ %sel = select i1 %icmp, ptr %x, ptr %gep.y
+ %gep.sel = getelementptr inbounds double, ptr %sel, i64 %iv
+ %load = load double, ptr %gep.sel, align 8
+ %sel2 = select i1 %icmp, ptr %y, ptr %z
+ %gep.sel2 = getelementptr inbounds double, ptr %sel2, i64 %iv
+ store double %load, ptr %gep.sel2, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exit.cond = icmp eq i64 %iv, 94
+ br i1 %exit.cond, label %exit, label %loop
exit:
- ret i32 0
+ ret void
}
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll b/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll
index bfdd15f..ef19e17 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4
; RUN: opt -passes='print<access-info>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
@@ -10,13 +11,19 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
; B[i] = A[i] + 1;
; }
-; CHECK: function 'nodep_Read_Write':
-; CHECK-NEXT: for.body:
-; CHECK-NEXT: Memory dependences are safe
-; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Run-time memory checks:
-
define void @nodep_Read_Write(ptr nocapture %A) {
+; CHECK-LABEL: 'nodep_Read_Write'
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Memory dependences are safe
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
%add.ptr = getelementptr inbounds i32, ptr %A, i64 1
br label %for.body
@@ -42,17 +49,23 @@ for.body: ; preds = %entry, %for.body
; A[i] = i;
; sum += A[i+3];
; }
-;
+;
; return sum;
; }
-; CHECK: function 'nodep_Write_Read':
-; CHECK-NEXT: for.body:
-; CHECK-NEXT: Memory dependences are safe
-; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Run-time memory checks:
-
define i32 @nodep_Write_Read(ptr nocapture %A) {
+; CHECK-LABEL: 'nodep_Write_Read'
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Memory dependences are safe
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
br label %for.body
@@ -81,13 +94,19 @@ for.body: ; preds = %entry, %for.body
; }
; }
-; CHECK: function 'nodep_Write_Write':
-; CHECK-NEXT: for.body:
-; CHECK-NEXT: Memory dependences are safe
-; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Run-time memory checks:
-
define void @nodep_Write_Write(ptr nocapture %A) {
+; CHECK-LABEL: 'nodep_Write_Write'
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Memory dependences are safe
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
br label %for.body
@@ -115,16 +134,24 @@ for.body: ; preds = %entry, %for.body
; A[i+3] = A[i] + 1;
; }
-; CHECK: function 'unsafe_Read_Write':
-; CHECK-NEXT: for.body:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop
-; CHECK-NEXT: Backward loop carried data dependence.
-; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Backward:
-; CHECK-NEXT: %0 = load i32, ptr %arrayidx, align 4 ->
-; CHECK-NEXT: store i32 %add, ptr %arrayidx3, align 4
-
define void @unsafe_Read_Write(ptr nocapture %A) {
+; CHECK-LABEL: 'unsafe_Read_Write'
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Backward loop carried data dependence.
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Backward:
+; CHECK-NEXT: %0 = load i32, ptr %arrayidx, align 4 ->
+; CHECK-NEXT: store i32 %add, ptr %arrayidx3, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
br label %for.body
@@ -155,16 +182,24 @@ for.body: ; preds = %entry, %for.body
; return sum;
; }
-; CHECK: function 'unsafe_Write_Read':
-; CHECK-NEXT: for.body:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop
-; CHECK-NEXT: Backward loop carried data dependence.
-; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Backward:
-; CHECK-NEXT: store i32 %0, ptr %arrayidx, align 4 ->
-; CHECK-NEXT: %1 = load i32, ptr %arrayidx2, align 4
-
define i32 @unsafe_Write_Read(ptr nocapture %A) {
+; CHECK-LABEL: 'unsafe_Write_Read'
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Backward loop carried data dependence.
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Backward:
+; CHECK-NEXT: store i32 %0, ptr %arrayidx, align 4 ->
+; CHECK-NEXT: %1 = load i32, ptr %arrayidx2, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
br label %for.body
@@ -192,16 +227,24 @@ for.body: ; preds = %entry, %for.body
; }
; }
-; CHECK: function 'unsafe_Write_Write':
-; CHECK-NEXT: for.body:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop
-; CHECK-NEXT: Backward loop carried data dependence.
-; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Backward:
-; CHECK-NEXT: store i32 %0, ptr %arrayidx, align 4 ->
-; CHECK-NEXT: store i32 %2, ptr %arrayidx3, align 4
-
define void @unsafe_Write_Write(ptr nocapture %A) {
+; CHECK-LABEL: 'unsafe_Write_Write'
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Backward loop carried data dependence.
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Backward:
+; CHECK-NEXT: store i32 %0, ptr %arrayidx, align 4 ->
+; CHECK-NEXT: store i32 %2, ptr %arrayidx3, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
br label %for.body
@@ -230,15 +273,23 @@ for.body: ; preds = %entry, %for.body
; B[i] = A[i] + 1;
; }
-; CHECK: function 'vectorizable_Read_Write':
-; CHECK-NEXT: for.body:
-; CHECK-NEXT: Memory dependences are safe
-; CHECK-NEXT: Dependences:
-; CHECK-NEXT: BackwardVectorizable:
-; CHECK-NEXT: %0 = load i32, ptr %arrayidx, align 4 ->
-; CHECK-NEXT: store i32 %add, ptr %arrayidx2, align 4
-
define void @vectorizable_Read_Write(ptr nocapture %A) {
+; CHECK-LABEL: 'vectorizable_Read_Write'
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 64 bits
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: BackwardVectorizable:
+; CHECK-NEXT: %0 = load i32, ptr %arrayidx, align 4 ->
+; CHECK-NEXT: store i32 %add, ptr %arrayidx2, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
%add.ptr = getelementptr inbounds i32, ptr %A, i64 4
br label %for.body
@@ -265,19 +316,27 @@ for.body: ; preds = %entry, %for.body
; A[i] = i;
; sum += B[i];
; }
-;
+;
; return sum;
; }
-; CHECK: function 'vectorizable_Write_Read':
-; CHECK-NEXT: for.body:
-; CHECK-NEXT: Memory dependences are safe
-; CHECK-NEXT: Dependences:
-; CHECK-NEXT: BackwardVectorizable:
-; CHECK-NEXT: store i32 %0, ptr %arrayidx, align 4 ->
-; CHECK-NEXT: %1 = load i32, ptr %arrayidx2, align 4
-
define i32 @vectorizable_Write_Read(ptr nocapture %A) {
+; CHECK-LABEL: 'vectorizable_Write_Read'
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 64 bits
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: BackwardVectorizable:
+; CHECK-NEXT: store i32 %0, ptr %arrayidx, align 4 ->
+; CHECK-NEXT: %1 = load i32, ptr %arrayidx2, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
%add.ptr = getelementptr inbounds i32, ptr %A, i64 4
br label %for.body
@@ -307,15 +366,23 @@ for.body: ; preds = %entry, %for.body
; }
; }
-; CHECK: function 'vectorizable_Write_Write':
-; CHECK-NEXT: for.body:
-; CHECK-NEXT: Memory dependences are safe
-; CHECK-NEXT: Dependences:
-; CHECK-NEXT: BackwardVectorizable:
-; CHECK-NEXT: store i32 %0, ptr %arrayidx, align 4 ->
-; CHECK-NEXT: store i32 %2, ptr %arrayidx2, align 4
-
define void @vectorizable_Write_Write(ptr nocapture %A) {
+; CHECK-LABEL: 'vectorizable_Write_Write'
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 64 bits
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: BackwardVectorizable:
+; CHECK-NEXT: store i32 %0, ptr %arrayidx, align 4 ->
+; CHECK-NEXT: store i32 %2, ptr %arrayidx2, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
%add.ptr = getelementptr inbounds i32, ptr %A, i64 4
br label %for.body
@@ -346,16 +413,24 @@ for.body: ; preds = %entry, %for.body
; FIXME: This case looks like the previous case @vectorizable_Read_Write. It should
; be vectorizable.
-; CHECK: function 'vectorizable_unscaled_Read_Write':
-; CHECK-NEXT: for.body:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop
-; CHECK-NEXT: Backward loop carried data dependence that prevents store-to-load forwarding.
-; CHECK-NEXT: Dependences:
-; CHECK-NEXT: BackwardVectorizableButPreventsForwarding:
-; CHECK-NEXT: %0 = load i32, ptr %arrayidx, align 4 ->
-; CHECK-NEXT: store i32 %add, ptr %arrayidx2, align 4
-
define void @vectorizable_unscaled_Read_Write(ptr nocapture %A) {
+; CHECK-LABEL: 'vectorizable_unscaled_Read_Write'
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Backward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: BackwardVectorizableButPreventsForwarding:
+; CHECK-NEXT: %0 = load i32, ptr %arrayidx, align 4 ->
+; CHECK-NEXT: store i32 %add, ptr %arrayidx2, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
%add.ptr = getelementptr inbounds i8, ptr %A, i64 14
br label %for.body
@@ -382,19 +457,27 @@ for.body: ; preds = %entry, %for.body
; A[i] = i;
; sum += B[i];
; }
-;
+;
; return sum;
; }
-; CHECK: function 'vectorizable_unscaled_Write_Read':
-; CHECK-NEXT: for.body:
-; CHECK-NEXT: Memory dependences are safe
-; CHECK-NEXT: Dependences:
-; CHECK-NEXT: BackwardVectorizable:
-; CHECK-NEXT: store i32 %0, ptr %arrayidx, align 4 ->
-; CHECK-NEXT: %1 = load i32, ptr %arrayidx2, align 4
-
define i32 @vectorizable_unscaled_Write_Read(ptr nocapture %A) {
+; CHECK-LABEL: 'vectorizable_unscaled_Write_Read'
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 64 bits
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: BackwardVectorizable:
+; CHECK-NEXT: store i32 %0, ptr %arrayidx, align 4 ->
+; CHECK-NEXT: %1 = load i32, ptr %arrayidx2, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
%add.ptr = getelementptr inbounds i8, ptr %A, i64 17
br label %for.body
@@ -422,16 +505,24 @@ for.body: ; preds = %entry, %for.body
; B[i] = A[i] + 1;
; }
-; CHECK: function 'unsafe_unscaled_Read_Write':
-; CHECK-NEXT: for.body:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop
-; CHECK-NEXT: Backward loop carried data dependence.
-; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Backward:
-; CHECK-NEXT: %0 = load i32, ptr %arrayidx, align 4 ->
-; CHECK-NEXT: store i32 %add, ptr %arrayidx2, align 4
-
define void @unsafe_unscaled_Read_Write(ptr nocapture %A) {
+; CHECK-LABEL: 'unsafe_unscaled_Read_Write'
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Backward loop carried data dependence.
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Backward:
+; CHECK-NEXT: %0 = load i32, ptr %arrayidx, align 4 ->
+; CHECK-NEXT: store i32 %add, ptr %arrayidx2, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
%add.ptr = getelementptr inbounds i8, ptr %A, i64 11
br label %for.body
@@ -451,15 +542,6 @@ for.body: ; preds = %entry, %for.body
br i1 %cmp, label %for.body, label %for.cond.cleanup
}
-; CHECK: function 'unsafe_unscaled_Read_Write2':
-; CHECK-NEXT: for.body:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop
-; CHECK-NEXT: Backward loop carried data dependence.
-; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Backward:
-; CHECK-NEXT: %0 = load i32, ptr %arrayidx, align 4 ->
-; CHECK-NEXT: store i32 %add, ptr %arrayidx2, align 4
-
; void unsafe_unscaled_Read_Write2(int *A) {
; int *B = (int *)((char *)A + 1);
; for (unsigned i = 0; i < 1024; i+=2)
@@ -467,6 +549,23 @@ for.body: ; preds = %entry, %for.body
; }
define void @unsafe_unscaled_Read_Write2(ptr nocapture %A) {
+; CHECK-LABEL: 'unsafe_unscaled_Read_Write2'
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Backward loop carried data dependence.
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Backward:
+; CHECK-NEXT: %0 = load i32, ptr %arrayidx, align 4 ->
+; CHECK-NEXT: store i32 %add, ptr %arrayidx2, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
%add.ptr = getelementptr inbounds i8, ptr %A, i64 1
br label %for.body
@@ -500,19 +599,28 @@ for.body: ; preds = %entry, %for.body
;
; The access (2) overlaps with (1) and (3).
-; CHECK: function 'interleaved_stores':
-; CHECK-NEXT: for.body:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop
-; CHECK-NEXT: Backward loop carried data dependence.
-; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Backward:
-; CHECK-NEXT: store i32 %2, ptr %arrayidx5, align 4 ->
-; CHECK-NEXT: store i32 %2, ptr %arrayidx9, align 4
-; CHECK: Backward:
-; CHECK-NEXT: store i32 %0, ptr %arrayidx2, align 4 ->
-; CHECK-NEXT: store i32 %2, ptr %arrayidx5, align 4
-
define void @interleaved_stores(ptr nocapture %A) {
+; CHECK-LABEL: 'interleaved_stores'
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Backward loop carried data dependence.
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Backward:
+; CHECK-NEXT: store i32 %2, ptr %arrayidx5, align 4 ->
+; CHECK-NEXT: store i32 %2, ptr %arrayidx9, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Backward:
+; CHECK-NEXT: store i32 %0, ptr %arrayidx2, align 4 ->
+; CHECK-NEXT: store i32 %2, ptr %arrayidx5, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
%incdec.ptr = getelementptr inbounds i8, ptr %A, i64 1
br label %for.body
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll b/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll
index 6cc045d..3da0f54 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll
@@ -95,6 +95,127 @@ exit:
ret void
}
+define void @single_stride_castexpr(i32 %offset, ptr %src, ptr %dst, i1 %cond) {
+; CHECK-LABEL: 'single_stride_castexpr'
+; CHECK-NEXT: inner.loop:
+; CHECK-NEXT: Memory dependences are safe with run-time checks
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Check 0:
+; CHECK-NEXT: Comparing group ([[GRP1:0x[0-9a-f]+]]):
+; CHECK-NEXT: %gep.dst = getelementptr i32, ptr %dst, i64 %iv.2
+; CHECK-NEXT: Against group ([[GRP2:0x[0-9a-f]+]]):
+; CHECK-NEXT: %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv.3
+; CHECK-NEXT: Grouped accesses:
+; CHECK-NEXT: Group [[GRP1]]:
+; CHECK-NEXT: (Low: ((4 * %iv.1) + %dst) High: (804 + (4 * %iv.1) + %dst))
+; CHECK-NEXT: Member: {((4 * %iv.1) + %dst),+,4}<%inner.loop>
+; CHECK-NEXT: Group [[GRP2]]:
+; CHECK-NEXT: (Low: %src High: (804 + %src))
+; CHECK-NEXT: Member: {%src,+,4}<nuw><%inner.loop>
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-NEXT: Equal predicate: %offset == 1
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+; CHECK-NEXT: [PSE] %gep.dst = getelementptr i32, ptr %dst, i64 %iv.2:
+; CHECK-NEXT: {((4 * %iv.1) + %dst),+,(4 * (sext i32 %offset to i64))<nsw>}<%inner.loop>
+; CHECK-NEXT: --> {((4 * %iv.1) + %dst),+,4}<%inner.loop>
+; CHECK-NEXT: outer.header:
+; CHECK-NEXT: Report: loop is not the innermost loop
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
+entry:
+ %offset.ext = sext i32 %offset to i64
+ br label %outer.header
+
+outer.header:
+ %iv.1 = phi i64 [ 0, %entry ], [ %iv.2.next, %inner.loop ]
+ br i1 %cond, label %inner.loop, label %exit
+
+inner.loop:
+ %iv.2 = phi i64 [ %iv.1, %outer.header ], [ %iv.2.next, %inner.loop ]
+ %iv.3 = phi i32 [ 0, %outer.header ], [ %iv.3.next, %inner.loop ]
+ %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv.3
+ %load = load i32, ptr %gep.src, align 8
+ %gep.dst = getelementptr i32, ptr %dst, i64 %iv.2
+ store i32 %load, ptr %gep.dst, align 8
+ %iv.2.next = add i64 %iv.2, %offset.ext
+ %iv.3.next = add i32 %iv.3, 1
+ %ec = icmp eq i32 %iv.3, 200
+ br i1 %ec, label %outer.header, label %inner.loop
+
+exit:
+ ret void
+}
+
+define void @single_stride_castexpr_multiuse(i32 %offset, ptr %src, ptr %dst, i1 %cond) {
+; CHECK-LABEL: 'single_stride_castexpr_multiuse'
+; CHECK-NEXT: inner.loop:
+; CHECK-NEXT: Memory dependences are safe with run-time checks
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Check 0:
+; CHECK-NEXT: Comparing group ([[GRP3:0x[0-9a-f]+]]):
+; CHECK-NEXT: %gep.dst = getelementptr i32, ptr %dst, i64 %iv.2
+; CHECK-NEXT: Against group ([[GRP4:0x[0-9a-f]+]]):
+; CHECK-NEXT: %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv.3
+; CHECK-NEXT: Grouped accesses:
+; CHECK-NEXT: Group [[GRP3]]:
+; CHECK-NEXT: (Low: (((4 * %iv.1) + %dst) umin ((4 * %iv.1) + (4 * (sext i32 %offset to i64) * (200 + (-1 * (zext i32 %offset to i64))<nsw>)<nsw>) + %dst)) High: (4 + (((4 * %iv.1) + %dst) umax ((4 * %iv.1) + (4 * (sext i32 %offset to i64) * (200 + (-1 * (zext i32 %offset to i64))<nsw>)<nsw>) + %dst))))
+; CHECK-NEXT: Member: {((4 * %iv.1) + %dst),+,(4 * (sext i32 %offset to i64))<nsw>}<%inner.loop>
+; CHECK-NEXT: Group [[GRP4]]:
+; CHECK-NEXT: (Low: ((4 * (zext i32 %offset to i64))<nuw><nsw> + %src) High: (804 + %src))
+; CHECK-NEXT: Member: {((4 * (zext i32 %offset to i64))<nuw><nsw> + %src),+,4}<%inner.loop>
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+; CHECK-NEXT: outer.header:
+; CHECK-NEXT: Report: loop is not the innermost loop
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
+entry:
+ %offset.ext = sext i32 %offset to i64
+ %offset.zext = zext i32 %offset to i64
+ br label %outer.header
+
+outer.header:
+ %iv.1 = phi i64 [ 0, %entry ], [ %iv.2.next, %inner.loop ]
+ br i1 %cond, label %inner.loop, label %exit
+
+inner.loop:
+ %iv.2 = phi i64 [ %iv.1, %outer.header ], [ %iv.2.next, %inner.loop ]
+ %iv.3 = phi i64 [ %offset.zext, %outer.header ], [ %iv.3.next, %inner.loop ]
+ %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv.3
+ %load = load i32, ptr %gep.src, align 8
+ %gep.dst = getelementptr i32, ptr %dst, i64 %iv.2
+ store i32 %load, ptr %gep.dst, align 8
+ %iv.2.next = add i64 %iv.2, %offset.ext
+ %iv.3.next = add i64 %iv.3, 1
+ %ec = icmp eq i64 %iv.3, 200
+ br i1 %ec, label %outer.header, label %inner.loop
+
+exit:
+ ret void
+}
+
; A loop with two symbolic strides.
define void @two_strides(ptr noalias %A, ptr noalias %B, i64 %N, i64 %stride.1, i64 %stride.2) {
; CHECK-LABEL: 'two_strides'
diff --git a/llvm/test/Analysis/ScalarEvolution/exit-count-non-strict.ll b/llvm/test/Analysis/ScalarEvolution/exit-count-non-strict.ll
index 2117c77..e9faf98 100644
--- a/llvm/test/Analysis/ScalarEvolution/exit-count-non-strict.ll
+++ b/llvm/test/Analysis/ScalarEvolution/exit-count-non-strict.ll
@@ -4,13 +4,14 @@
define void @ule_from_zero(i32 %M, i32 %N) {
; CHECK-LABEL: 'ule_from_zero'
; CHECK-NEXT: Determining loop execution counts for: @ule_from_zero
-; CHECK-NEXT: Loop %loop: <multiple exits> Unpredictable backedge-taken count.
-; CHECK-NEXT: exit count for loop: ***COULDNOTCOMPUTE***
+; CHECK-NEXT: Loop %loop: <multiple exits> backedge-taken count is ((zext i32 %N to i64) umin (1 + (zext i32 %M to i64))<nuw><nsw>)
+; CHECK-NEXT: exit count for loop: (1 + (zext i32 %M to i64))<nuw><nsw>
; CHECK-NEXT: exit count for latch: %N
-; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i32 -1
-; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is %N
-; CHECK-NEXT: symbolic max exit count for loop: ***COULDNOTCOMPUTE***
+; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 4294967295
+; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is ((zext i32 %N to i64) umin (1 + (zext i32 %M to i64))<nuw><nsw>)
+; CHECK-NEXT: symbolic max exit count for loop: (1 + (zext i32 %M to i64))<nuw><nsw>
; CHECK-NEXT: symbolic max exit count for latch: %N
+; CHECK-NEXT: Loop %loop: Trip multiple is 1
;
entry:
br label %loop
@@ -61,13 +62,14 @@ exit:
define void @ule_from_unknown(i32 %M, i32 %N, i32 %S) {
; CHECK-LABEL: 'ule_from_unknown'
; CHECK-NEXT: Determining loop execution counts for: @ule_from_unknown
-; CHECK-NEXT: Loop %loop: <multiple exits> Unpredictable backedge-taken count.
-; CHECK-NEXT: exit count for loop: ***COULDNOTCOMPUTE***
+; CHECK-NEXT: Loop %loop: <multiple exits> backedge-taken count is (((-1 * (zext i32 %S to i64))<nsw> + ((zext i32 %S to i64) umax (1 + (zext i32 %M to i64))<nuw><nsw>)) umin_seq (zext i32 ((-1 * %S) + %N) to i64))
+; CHECK-NEXT: exit count for loop: ((-1 * (zext i32 %S to i64))<nsw> + ((zext i32 %S to i64) umax (1 + (zext i32 %M to i64))<nuw><nsw>))
; CHECK-NEXT: exit count for latch: ((-1 * %S) + %N)
-; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i32 -1
-; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is ((-1 * %S) + %N)
-; CHECK-NEXT: symbolic max exit count for loop: ***COULDNOTCOMPUTE***
+; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 4294967295
+; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (((-1 * (zext i32 %S to i64))<nsw> + ((zext i32 %S to i64) umax (1 + (zext i32 %M to i64))<nuw><nsw>)) umin_seq (zext i32 ((-1 * %S) + %N) to i64))
+; CHECK-NEXT: symbolic max exit count for loop: ((-1 * (zext i32 %S to i64))<nsw> + ((zext i32 %S to i64) umax (1 + (zext i32 %M to i64))<nuw><nsw>))
; CHECK-NEXT: symbolic max exit count for latch: ((-1 * %S) + %N)
+; CHECK-NEXT: Loop %loop: Trip multiple is 1
;
entry:
br label %loop
@@ -96,6 +98,9 @@ define void @ule_from_zero_no_nuw(i32 %M, i32 %N) {
; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is %N
; CHECK-NEXT: symbolic max exit count for loop: ***COULDNOTCOMPUTE***
; CHECK-NEXT: symbolic max exit count for latch: %N
+; CHECK-NEXT: Loop %loop: Predicated backedge-taken count is ((zext i32 %N to i64) umin (1 + (zext i32 %M to i64))<nuw><nsw>)
+; CHECK-NEXT: Predicates:
+; CHECK-NEXT: {0,+,1}<%loop> Added Flags: <nusw>
;
entry:
br label %loop
@@ -117,13 +122,14 @@ exit:
define void @sle_from_int_min(i32 %M, i32 %N) {
; CHECK-LABEL: 'sle_from_int_min'
; CHECK-NEXT: Determining loop execution counts for: @sle_from_int_min
-; CHECK-NEXT: Loop %loop: <multiple exits> Unpredictable backedge-taken count.
-; CHECK-NEXT: exit count for loop: ***COULDNOTCOMPUTE***
+; CHECK-NEXT: Loop %loop: <multiple exits> backedge-taken count is ((zext i32 (-2147483648 + %N) to i64) umin (2147483649 + (sext i32 %M to i64))<nsw>)
+; CHECK-NEXT: exit count for loop: (2147483649 + (sext i32 %M to i64))<nsw>
; CHECK-NEXT: exit count for latch: (-2147483648 + %N)
-; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i32 -1
-; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (-2147483648 + %N)
-; CHECK-NEXT: symbolic max exit count for loop: ***COULDNOTCOMPUTE***
+; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 4294967295
+; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is ((zext i32 (-2147483648 + %N) to i64) umin (2147483649 + (sext i32 %M to i64))<nsw>)
+; CHECK-NEXT: symbolic max exit count for loop: (2147483649 + (sext i32 %M to i64))<nsw>
; CHECK-NEXT: symbolic max exit count for latch: (-2147483648 + %N)
+; CHECK-NEXT: Loop %loop: Trip multiple is 1
;
entry:
br label %loop
@@ -174,13 +180,14 @@ exit:
define void @sle_from_unknown(i32 %M, i32 %N, i32 %S) {
; CHECK-LABEL: 'sle_from_unknown'
; CHECK-NEXT: Determining loop execution counts for: @sle_from_unknown
-; CHECK-NEXT: Loop %loop: <multiple exits> Unpredictable backedge-taken count.
-; CHECK-NEXT: exit count for loop: ***COULDNOTCOMPUTE***
+; CHECK-NEXT: Loop %loop: <multiple exits> backedge-taken count is (((-1 * (sext i32 %S to i64))<nsw> + ((sext i32 %S to i64) smax (1 + (sext i32 %M to i64))<nsw>)) umin_seq (zext i32 ((-1 * %S) + %N) to i64))
+; CHECK-NEXT: exit count for loop: ((-1 * (sext i32 %S to i64))<nsw> + ((sext i32 %S to i64) smax (1 + (sext i32 %M to i64))<nsw>))
; CHECK-NEXT: exit count for latch: ((-1 * %S) + %N)
-; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i32 -1
-; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is ((-1 * %S) + %N)
-; CHECK-NEXT: symbolic max exit count for loop: ***COULDNOTCOMPUTE***
+; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 4294967295
+; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (((-1 * (sext i32 %S to i64))<nsw> + ((sext i32 %S to i64) smax (1 + (sext i32 %M to i64))<nsw>)) umin_seq (zext i32 ((-1 * %S) + %N) to i64))
+; CHECK-NEXT: symbolic max exit count for loop: ((-1 * (sext i32 %S to i64))<nsw> + ((sext i32 %S to i64) smax (1 + (sext i32 %M to i64))<nsw>))
; CHECK-NEXT: symbolic max exit count for latch: ((-1 * %S) + %N)
+; CHECK-NEXT: Loop %loop: Trip multiple is 1
;
entry:
br label %loop
@@ -209,6 +216,9 @@ define void @sle_from_int_min_no_nsw(i32 %M, i32 %N) {
; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (-2147483648 + %N)
; CHECK-NEXT: symbolic max exit count for loop: ***COULDNOTCOMPUTE***
; CHECK-NEXT: symbolic max exit count for latch: (-2147483648 + %N)
+; CHECK-NEXT: Loop %loop: Predicated backedge-taken count is ((zext i32 (-2147483648 + %N) to i64) umin (2147483649 + (sext i32 %M to i64))<nsw>)
+; CHECK-NEXT: Predicates:
+; CHECK-NEXT: {-2147483648,+,1}<%loop> Added Flags: <nssw>
;
entry:
br label %loop
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-integer.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-integer.mir
new file mode 100644
index 0000000..be33f9f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-integer.mir
@@ -0,0 +1,252 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple aarch64 -run-pass=aarch64-prelegalizer-combiner %s -o - | FileCheck %s
+
+
+---
+name: ZeroMinusAPlusB
+body: |
+ bb.0:
+ liveins: $w0, $w1
+
+ ; CHECK-LABEL: name: ZeroMinusAPlusB
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %a:_(s32) = COPY $w0
+ ; CHECK-NEXT: %b:_(s32) = COPY $w0
+ ; CHECK-NEXT: %add:_(s32) = G_SUB %b, %a
+ ; CHECK-NEXT: $w0 = COPY %add(s32)
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %x:_(s32) = COPY $w0
+ %a:_(s32) = COPY $w0
+ %b:_(s32) = COPY $w0
+ %zero:_(s32) = G_CONSTANT i32 0
+ %sub:_(s32) = G_SUB %zero, %a
+ %add:_(s32) = G_ADD %sub, %b
+ $w0 = COPY %add
+ RET_ReallyLR implicit $w0
+
+...
+---
+name: ZeroMinusAPlusB_multi_use
+body: |
+ bb.0:
+ liveins: $w0, $w1
+
+ ; CHECK-LABEL: name: ZeroMinusAPlusB_multi_use
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %a:_(s32) = COPY $w0
+ ; CHECK-NEXT: %b:_(s32) = COPY $w0
+ ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: %sub:_(s32) = G_SUB %zero, %a
+ ; CHECK-NEXT: %add:_(s32) = G_SUB %b, %a
+ ; CHECK-NEXT: $w0 = COPY %add(s32)
+ ; CHECK-NEXT: $w0 = COPY %sub(s32)
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %x:_(s32) = COPY $w0
+ %a:_(s32) = COPY $w0
+ %b:_(s32) = COPY $w0
+ %zero:_(s32) = G_CONSTANT i32 0
+ %sub:_(s32) = G_SUB %zero, %a
+ %add:_(s32) = G_ADD %sub, %b
+ $w0 = COPY %add
+ $w0 = COPY %sub
+ RET_ReallyLR implicit $w0
+
+...
+---
+name: APlusZeroMiunusB
+body: |
+ bb.0:
+ liveins: $w0, $w1
+
+ ; CHECK-LABEL: name: APlusZeroMiunusB
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %a:_(s64) = COPY $x1
+ ; CHECK-NEXT: %b:_(s64) = COPY $x2
+ ; CHECK-NEXT: %add:_(s64) = G_SUB %a, %b
+ ; CHECK-NEXT: $x0 = COPY %add(s64)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %x:_(s64) = COPY $x0
+ %a:_(s64) = COPY $x1
+ %b:_(s64) = COPY $x2
+ %zero:_(s64) = G_CONSTANT i64 0
+ %sub:_(s64) = G_SUB %zero, %b
+ %add:_(s64) = G_ADD %a, %sub
+ $x0 = COPY %add
+ RET_ReallyLR implicit $x0
+
+...
+---
+name: APlusBMinusB
+body: |
+ bb.0:
+ liveins: $w0, $w1
+
+ ; CHECK-LABEL: name: APlusBMinusB
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %b:_(s64) = COPY $x1
+ ; CHECK-NEXT: $x0 = COPY %b(s64)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %a:_(s64) = COPY $x0
+ %b:_(s64) = COPY $x1
+ %zero:_(s64) = G_CONSTANT i64 0
+ %sub:_(s64) = G_SUB %b, %a
+ %add:_(s64) = G_ADD %a, %sub
+ $x0 = COPY %add
+ RET_ReallyLR implicit $x0
+
+...
+---
+name: BMinusAPlusA
+body: |
+ bb.0:
+ liveins: $w0, $w1
+
+ ; CHECK-LABEL: name: BMinusAPlusA
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %b:_(s64) = COPY $x1
+ ; CHECK-NEXT: $x0 = COPY %b(s64)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %a:_(s64) = COPY $x0
+ %b:_(s64) = COPY $x1
+ %zero:_(s64) = G_CONSTANT i64 0
+ %sub:_(s64) = G_SUB %b, %a
+ %add:_(s64) = G_ADD %sub, %a
+ $x0 = COPY %add
+ RET_ReallyLR implicit $x0
+
+...
+---
+name: AMinusBPlusCMinusA
+body: |
+ bb.0:
+ liveins: $w0, $w1
+
+ ; CHECK-LABEL: name: AMinusBPlusCMinusA
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %b:_(s64) = COPY $x1
+ ; CHECK-NEXT: %c:_(s64) = COPY $x2
+ ; CHECK-NEXT: %add:_(s64) = G_SUB %c, %b
+ ; CHECK-NEXT: $x0 = COPY %add(s64)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %a:_(s64) = COPY $x0
+ %b:_(s64) = COPY $x1
+ %c:_(s64) = COPY $x2
+ %zero:_(s64) = G_CONSTANT i64 0
+ %sub2:_(s64) = G_SUB %c, %a
+ %sub1:_(s64) = G_SUB %a, %b
+ %add:_(s64) = G_ADD %sub1, %sub2
+ $x0 = COPY %add
+ RET_ReallyLR implicit $x0
+
+...
+---
+name: AMinusBPlusBMinusC
+body: |
+ bb.0:
+ liveins: $w0, $w1
+
+ ; CHECK-LABEL: name: AMinusBPlusBMinusC
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %a:_(s64) = COPY $x0
+ ; CHECK-NEXT: %c:_(s64) = COPY $x2
+ ; CHECK-NEXT: %add:_(s64) = G_SUB %a, %c
+ ; CHECK-NEXT: $x0 = COPY %add(s64)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %a:_(s64) = COPY $x0
+ %b:_(s64) = COPY $x1
+ %c:_(s64) = COPY $x2
+ %zero:_(s64) = G_CONSTANT i64 0
+ %sub2:_(s64) = G_SUB %b, %c
+ %sub1:_(s64) = G_SUB %a, %b
+ %add:_(s64) = G_ADD %sub1, %sub2
+ $x0 = COPY %add
+ RET_ReallyLR implicit $x0
+
+
+...
+---
+name: APlusBMinusAplusC
+body: |
+ bb.0:
+ liveins: $w0, $w1
+
+ ; CHECK-LABEL: name: APlusBMinusAPlusC
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %b:_(s64) = COPY $x1
+ ; CHECK-NEXT: %c:_(s64) = COPY $x2
+ ; CHECK-NEXT: %add:_(s64) = G_SUB %b, %c
+ ; CHECK-NEXT: $x0 = COPY %add(s64)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %a:_(s64) = COPY $x0
+ %b:_(s64) = COPY $x1
+ %c:_(s64) = COPY $x2
+ %zero:_(s64) = G_CONSTANT i64 0
+ %add1:_(s64) = G_ADD %a, %c
+ %sub1:_(s64) = G_SUB %b, %add1
+ %add:_(s64) = G_ADD %a, %sub1
+ $x0 = COPY %add
+ RET_ReallyLR implicit $x0
+
+...
+---
+name: APlusBMinusCPlusA
+body: |
+ bb.0:
+ liveins: $w0, $w1
+
+ ; CHECK-LABEL: name: APlusBMinusCPlusA
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %b:_(s64) = COPY $x1
+ ; CHECK-NEXT: %c:_(s64) = COPY $x2
+ ; CHECK-NEXT: %add:_(s64) = G_SUB %b, %c
+ ; CHECK-NEXT: $x0 = COPY %add(s64)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %a:_(s64) = COPY $x0
+ %b:_(s64) = COPY $x1
+ %c:_(s64) = COPY $x2
+ %zero:_(s64) = G_CONSTANT i64 0
+ %add1:_(s64) = G_ADD %c, %a
+ %sub1:_(s64) = G_SUB %b, %add1
+ %add:_(s64) = G_ADD %a, %sub1
+ $x0 = COPY %add
+ RET_ReallyLR implicit $x0
+
+...
+---
+name: APlusBMinusCPlusA_BV
+body: |
+ bb.0:
+ liveins: $w0, $w1
+
+ ; CHECK-LABEL: name: APlusBMinusCPlusA_BV
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %a1:_(s64) = COPY $x0
+ ; CHECK-NEXT: %b1:_(s64) = COPY $x1
+ ; CHECK-NEXT: %c1:_(s64) = COPY $x2
+ ; CHECK-NEXT: %b:_(<2 x s64>) = G_BUILD_VECTOR %b1(s64), %ba:_(s64)
+ ; CHECK-NEXT: %c:_(<2 x s64>) = G_BUILD_VECTOR %a1(s64), %c1(s64)
+ ; CHECK-NEXT: %add:_(<2 x s64>) = G_SUB %b, %c
+ ; CHECK-NEXT: $q0 = COPY %add(<2 x s64>)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %a1:_(s64) = COPY $x0
+ %b1:_(s64) = COPY $x1
+ %c1:_(s64) = COPY $x2
+ %a:_(<2 x s64>) = G_BUILD_VECTOR %a1:_(s64), %b1:_(s64)
+ %b:_(<2 x s64>) = G_BUILD_VECTOR %b1:_(s64), %ba:_(s64)
+ %c:_(<2 x s64>) = G_BUILD_VECTOR %a1:_(s64), %c1:_(s64)
+ %zero:_(s64) = G_CONSTANT i64 0
+ %add1:_(<2 x s64>) = G_ADD %c, %a
+ %sub1:_(<2 x s64>) = G_SUB %b, %add1
+ %add:_(<2 x s64>) = G_ADD %a, %sub1
+ $q0 = COPY %add
+ RET_ReallyLR implicit $x0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir
index 353c1550..074d4ec 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir
@@ -117,9 +117,9 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x2
; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
- ; CHECK-NEXT: %f:_(s1) = G_TRUNC [[COPY1]](s64)
- ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE %f
- ; CHECK-NEXT: %sel:_(s1) = G_OR %c, [[FREEZE]]
+ ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY1]]
+ ; CHECK-NEXT: %f:_(s1) = G_TRUNC [[FREEZE]](s64)
+ ; CHECK-NEXT: %sel:_(s1) = G_OR %c, %f
; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s1)
; CHECK-NEXT: $w0 = COPY %ext(s32)
%0:_(s64) = COPY $x0
@@ -144,9 +144,9 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x2
; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
- ; CHECK-NEXT: %f:_(s1) = G_TRUNC [[COPY1]](s64)
- ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE %f
- ; CHECK-NEXT: %sel:_(s1) = G_OR %c, [[FREEZE]]
+ ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY1]]
+ ; CHECK-NEXT: %f:_(s1) = G_TRUNC [[FREEZE]](s64)
+ ; CHECK-NEXT: %sel:_(s1) = G_OR %c, %f
; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s1)
; CHECK-NEXT: $w0 = COPY %ext(s32)
%0:_(s64) = COPY $x0
@@ -172,9 +172,9 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $d2
; CHECK-NEXT: %c:_(<2 x s1>) = G_TRUNC [[COPY]](<2 x s32>)
- ; CHECK-NEXT: %f:_(<2 x s1>) = G_TRUNC [[COPY1]](<2 x s32>)
- ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(<2 x s1>) = G_FREEZE %f
- ; CHECK-NEXT: %sel:_(<2 x s1>) = G_OR %c, [[FREEZE]]
+ ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(<2 x s32>) = G_FREEZE [[COPY1]]
+ ; CHECK-NEXT: %f:_(<2 x s1>) = G_TRUNC [[FREEZE]](<2 x s32>)
+ ; CHECK-NEXT: %sel:_(<2 x s1>) = G_OR %c, %f
; CHECK-NEXT: %ext:_(<2 x s32>) = G_ANYEXT %sel(<2 x s1>)
; CHECK-NEXT: $d0 = COPY %ext(<2 x s32>)
%0:_(<2 x s32>) = COPY $d0
@@ -201,9 +201,9 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
- ; CHECK-NEXT: %t:_(s1) = G_TRUNC [[COPY1]](s64)
- ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE %t
- ; CHECK-NEXT: %sel:_(s1) = G_AND %c, [[FREEZE]]
+ ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY1]]
+ ; CHECK-NEXT: %t:_(s1) = G_TRUNC [[FREEZE]](s64)
+ ; CHECK-NEXT: %sel:_(s1) = G_AND %c, %t
; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s1)
; CHECK-NEXT: $w0 = COPY %ext(s32)
%0:_(s64) = COPY $x0
@@ -229,9 +229,9 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
- ; CHECK-NEXT: %t:_(s1) = G_TRUNC [[COPY1]](s64)
- ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE %t
- ; CHECK-NEXT: %sel:_(s1) = G_AND %c, [[FREEZE]]
+ ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY1]]
+ ; CHECK-NEXT: %t:_(s1) = G_TRUNC [[FREEZE]](s64)
+ ; CHECK-NEXT: %sel:_(s1) = G_AND %c, %t
; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s1)
; CHECK-NEXT: $w0 = COPY %ext(s32)
%0:_(s64) = COPY $x0
@@ -257,11 +257,11 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
- ; CHECK-NEXT: %t:_(s1) = G_TRUNC [[COPY1]](s64)
+ ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY1]]
+ ; CHECK-NEXT: %t:_(s1) = G_TRUNC [[FREEZE]](s64)
; CHECK-NEXT: %one:_(s1) = G_CONSTANT i1 true
; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR %c, %one
- ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE %t
- ; CHECK-NEXT: %sel:_(s1) = G_OR [[XOR]], [[FREEZE]]
+ ; CHECK-NEXT: %sel:_(s1) = G_OR [[XOR]], %t
; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s1)
; CHECK-NEXT: $w0 = COPY %ext(s32)
%0:_(s64) = COPY $x0
@@ -287,11 +287,11 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x2
; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
- ; CHECK-NEXT: %f:_(s1) = G_TRUNC [[COPY1]](s64)
+ ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY1]]
+ ; CHECK-NEXT: %f:_(s1) = G_TRUNC [[FREEZE]](s64)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR %c, [[C]]
- ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE %f
- ; CHECK-NEXT: %sel:_(s1) = G_AND [[XOR]], [[FREEZE]]
+ ; CHECK-NEXT: %sel:_(s1) = G_AND [[XOR]], %f
; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s1)
; CHECK-NEXT: $w0 = COPY %ext(s32)
%0:_(s64) = COPY $x0
diff --git a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
index e754f01..a8be8bb 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
@@ -1379,7 +1379,7 @@ define <8 x i8> @sextmask2v8i8(<8 x i16> %src1, <8 x i8> %src2) {
define <8 x i8> @sextmask3v8i8(<8 x i16> %src1, <8 x i8> %src2) {
; CHECK-LABEL: sextmask3v8i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: sshr.8h v0, v0, #7
+; CHECK-NEXT: ushr.8h v0, v0, #7
; CHECK-NEXT: sshll.8h v1, v1, #0
; CHECK-NEXT: shadd.8h v0, v0, v1
; CHECK-NEXT: xtn.8b v0, v0
diff --git a/llvm/test/CodeGen/AArch64/bitfield-insert.ll b/llvm/test/CodeGen/AArch64/bitfield-insert.ll
index 30b5e86..14a594e 100644
--- a/llvm/test/CodeGen/AArch64/bitfield-insert.ll
+++ b/llvm/test/CodeGen/AArch64/bitfield-insert.ll
@@ -193,11 +193,10 @@ define void @test_64bit_badmask(ptr %existing, ptr %new) {
; CHECK: // %bb.0:
; CHECK-NEXT: ldr x8, [x0]
; CHECK-NEXT: ldr x9, [x1]
-; CHECK-NEXT: mov w10, #135 // =0x87
-; CHECK-NEXT: mov w11, #664 // =0x298
-; CHECK-NEXT: lsl w9, w9, #3
-; CHECK-NEXT: and x8, x8, x10
-; CHECK-NEXT: and x9, x9, x11
+; CHECK-NEXT: mov w10, #664 // =0x298
+; CHECK-NEXT: mov w11, #135 // =0x87
+; CHECK-NEXT: and x9, x10, x9, lsl #3
+; CHECK-NEXT: and x8, x8, x11
; CHECK-NEXT: orr x8, x8, x9
; CHECK-NEXT: str x8, [x0]
; CHECK-NEXT: ret
@@ -579,7 +578,6 @@ define <2 x i32> @test_complex_type(ptr %addr, i64 %in, ptr %bf ) {
define i64 @test_truncated_shift(i64 %x, i64 %y) {
; CHECK-LABEL: test_truncated_shift:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $w1 killed $w1 killed $x1 def $x1
; CHECK-NEXT: bfi x0, x1, #25, #5
; CHECK-NEXT: ret
entry:
@@ -593,7 +591,6 @@ entry:
define i64 @test_and_extended_shift_with_imm(i64 %0) {
; CHECK-LABEL: test_and_extended_shift_with_imm:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 def $x0
; CHECK-NEXT: ubfiz x0, x0, #7, #8
; CHECK-NEXT: ret
%2 = shl i64 %0, 7
diff --git a/llvm/test/CodeGen/AArch64/hadd-combine.ll b/llvm/test/CodeGen/AArch64/hadd-combine.ll
index c0f7678..28f4547 100644
--- a/llvm/test/CodeGen/AArch64/hadd-combine.ll
+++ b/llvm/test/CodeGen/AArch64/hadd-combine.ll
@@ -955,6 +955,71 @@ define <8 x i16> @urhadd_demandedelts(<8 x i16> %a0, <8 x i16> %a1) {
ret <8 x i16> %r0
}
+; Remove unnecessary sign_extend_inreg after shadd
+define <2 x i32> @shadd_signbits_v2i32(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) {
+; CHECK-LABEL: shadd_signbits_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sshr v0.2s, v0.2s, #17
+; CHECK-NEXT: sshr v1.2s, v1.2s, #17
+; CHECK-NEXT: shadd v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+ %x0 = ashr <2 x i32> %a0, <i32 17, i32 17>
+ %x1 = ashr <2 x i32> %a1, <i32 17, i32 17>
+ %m = and <2 x i32> %x0, %x1
+ %s = xor <2 x i32> %x0, %x1
+ %x = ashr <2 x i32> %s, <i32 1, i32 1>
+ %avg = add <2 x i32> %m, %x
+ %avg1 = shl <2 x i32> %avg, <i32 17, i32 17>
+ %avg2 = ashr <2 x i32> %avg1, <i32 17, i32 17>
+ store <2 x i32> %avg, ptr %p2 ; extra use
+ ret <2 x i32> %avg2
+}
+
+; Remove unnecessary sign_extend_inreg after srhadd
+define <2 x i32> @srhadd_signbits_v2i32(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) {
+; CHECK-LABEL: srhadd_signbits_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sshr v0.2s, v0.2s, #17
+; CHECK-NEXT: sshr v1.2s, v1.2s, #17
+; CHECK-NEXT: srhadd v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+ %x0 = ashr <2 x i32> %a0, <i32 17, i32 17>
+ %x1 = ashr <2 x i32> %a1, <i32 17, i32 17>
+ %m = or <2 x i32> %x0, %x1
+ %s = xor <2 x i32> %x0, %x1
+ %x = ashr <2 x i32> %s, <i32 1, i32 1>
+ %avg = sub <2 x i32> %m, %x
+ %avg1 = shl <2 x i32> %avg, <i32 17, i32 17>
+ %avg2 = ashr <2 x i32> %avg1, <i32 17, i32 17>
+ store <2 x i32> %avg, ptr %p2 ; extra use
+ ret <2 x i32> %avg2
+}
+
+; negative test - not enough signbits to remove sign_extend_inreg after srhadd
+define <2 x i32> @srhadd_signbits_v2i32_negative(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) {
+; CHECK-LABEL: srhadd_signbits_v2i32_negative:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sshr v0.2s, v0.2s, #17
+; CHECK-NEXT: sshr v1.2s, v1.2s, #17
+; CHECK-NEXT: srhadd v1.2s, v0.2s, v1.2s
+; CHECK-NEXT: shl v0.2s, v1.2s, #22
+; CHECK-NEXT: str d1, [x0]
+; CHECK-NEXT: sshr v0.2s, v0.2s, #22
+; CHECK-NEXT: ret
+ %x0 = ashr <2 x i32> %a0, <i32 17, i32 17>
+ %x1 = ashr <2 x i32> %a1, <i32 17, i32 17>
+ %m = or <2 x i32> %x0, %x1
+ %s = xor <2 x i32> %x0, %x1
+ %x = ashr <2 x i32> %s, <i32 1, i32 1>
+ %avg = sub <2 x i32> %m, %x
+ %avg1 = shl <2 x i32> %avg, <i32 22, i32 22>
+ %avg2 = ashr <2 x i32> %avg1, <i32 22, i32 22>
+ store <2 x i32> %avg, ptr %p2 ; extra use
+ ret <2 x i32> %avg2
+}
+
declare <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8>, <8 x i8>)
declare <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16>, <4 x i16>)
declare <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32>, <2 x i32>)
@@ -979,4 +1044,4 @@ declare <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16>, <8 x i16>)
declare <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32>, <4 x i32>)
declare <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8>, <16 x i8>)
declare <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16>, <8 x i16>)
-declare <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32>, <4 x i32>) \ No newline at end of file
+declare <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32>, <4 x i32>)
diff --git a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
index 21123754..9c72afd 100644
--- a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
+++ b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
@@ -359,6 +359,152 @@ define i32 @add_i32_ctz_nxv16i1_poison(<vscale x 16 x i1> %a, i32 %b) {
ret i32 %add
}
+; FIXED-WIDTH VECTOR TYPES
+
+define i32 @ctz_v16i1(<16 x i1> %a) {
+; CHECK-LABEL: ctz_v16i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.16b, v0.16b, #7
+; CHECK-NEXT: ptrue p0.b, vl16
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
+; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.b
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> %a, i1 0)
+ ret i32 %res
+}
+
+define i32 @ctz_v16i1_poison(<16 x i1> %a) {
+; CHECK-LABEL: ctz_v16i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.16b, v0.16b, #7
+; CHECK-NEXT: ptrue p0.b, vl16
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
+; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.b
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> %a, i1 1)
+ ret i32 %res
+}
+
+define i64 @add_i64_ctz_v16i1_poison(<16 x i1> %a, i64 %b) {
+; CHECK-LABEL: add_i64_ctz_v16i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.16b, v0.16b, #7
+; CHECK-NEXT: ptrue p0.b, vl16
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
+; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: incp x0, p0.b
+; CHECK-NEXT: ret
+ %res = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> %a, i1 1)
+ %add = add i64 %res, %b
+ ret i64 %add
+}
+
+define i32 @ctz_v8i1(<8 x i1> %a) {
+; CHECK-LABEL: ctz_v8i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.8b, v0.8b, #7
+; CHECK-NEXT: ptrue p0.b, vl8
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: cmlt v0.8b, v0.8b, #0
+; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.b
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> %a, i1 0)
+ ret i32 %res
+}
+
+define i32 @ctz_v8i1_poison(<8 x i1> %a) {
+; CHECK-LABEL: ctz_v8i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.8b, v0.8b, #7
+; CHECK-NEXT: ptrue p0.b, vl8
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: cmlt v0.8b, v0.8b, #0
+; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.b
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> %a, i1 1)
+ ret i32 %res
+}
+
+define i32 @ctz_v4i1(<4 x i1> %a) {
+; CHECK-LABEL: ctz_v4i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.4h, v0.4h, #15
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
+; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.h
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> %a, i1 0)
+ ret i32 %res
+}
+
+define i32 @ctz_v4i1_poison(<4 x i1> %a) {
+; CHECK-LABEL: ctz_v4i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.4h, v0.4h, #15
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
+; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.h
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> %a, i1 1)
+ ret i32 %res
+}
+
+define i32 @ctz_v2i1(<2 x i1> %a) {
+; CHECK-LABEL: ctz_v2i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.2s, v0.2s, #31
+; CHECK-NEXT: ptrue p0.s, vl2
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: cmlt v0.2s, v0.2s, #0
+; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.s
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> %a, i1 0)
+ ret i32 %res
+}
+
+define i32 @ctz_v2i1_poison(<2 x i1> %a) {
+; CHECK-LABEL: ctz_v2i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.2s, v0.2s, #31
+; CHECK-NEXT: ptrue p0.s, vl2
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: cmlt v0.2s, v0.2s, #0
+; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.s
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> %a, i1 1)
+ ret i32 %res
+}
+
declare i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1>, i1)
declare i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1>, i1)
declare i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1>, i1)
diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
index 736f66c..40b8a47 100644
--- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
+++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
@@ -1709,289 +1709,289 @@ define i32 @test_sdot_v33i8_double(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: fmov s4, w0
; CHECK-NEXT: ldr b0, [sp, #80]
; CHECK-NEXT: add x8, sp, #88
-; CHECK-NEXT: ldr b2, [sp, #144]
-; CHECK-NEXT: fmov s4, w0
+; CHECK-NEXT: ldr b1, [sp, #144]
; CHECK-NEXT: add x10, sp, #152
-; CHECK-NEXT: ldr b3, [sp, #16]
+; CHECK-NEXT: ldr b6, [sp, #16]
; CHECK-NEXT: ld1 { v0.b }[1], [x8]
-; CHECK-NEXT: ld1 { v2.b }[1], [x10]
-; CHECK-NEXT: add x10, sp, #24
-; CHECK-NEXT: ldr b1, [sp, #344]
; CHECK-NEXT: add x9, sp, #96
-; CHECK-NEXT: ld1 { v3.b }[1], [x10]
-; CHECK-NEXT: add x10, sp, #352
+; CHECK-NEXT: ldr b2, [sp, #344]
; CHECK-NEXT: mov v4.b[1], w1
+; CHECK-NEXT: ld1 { v1.b }[1], [x10]
+; CHECK-NEXT: add x10, sp, #24
+; CHECK-NEXT: ld1 { v6.b }[1], [x10]
+; CHECK-NEXT: add x10, sp, #352
; CHECK-NEXT: add x8, sp, #104
; CHECK-NEXT: ld1 { v0.b }[2], [x9]
; CHECK-NEXT: add x9, sp, #160
-; CHECK-NEXT: ld1 { v1.b }[1], [x10]
-; CHECK-NEXT: ld1 { v2.b }[2], [x9]
-; CHECK-NEXT: add x9, sp, #32
-; CHECK-NEXT: add x12, sp, #360
-; CHECK-NEXT: ld1 { v3.b }[2], [x9]
+; CHECK-NEXT: ld1 { v2.b }[1], [x10]
+; CHECK-NEXT: ld1 { v1.b }[2], [x9]
+; CHECK-NEXT: add x10, sp, #32
; CHECK-NEXT: add x11, sp, #112
-; CHECK-NEXT: add x10, sp, #120
-; CHECK-NEXT: ld1 { v1.b }[2], [x12]
-; CHECK-NEXT: add x12, sp, #168
-; CHECK-NEXT: ld1 { v0.b }[3], [x8]
; CHECK-NEXT: mov v4.b[2], w2
-; CHECK-NEXT: ld1 { v2.b }[3], [x12]
-; CHECK-NEXT: add x12, sp, #40
-; CHECK-NEXT: ld1 { v3.b }[3], [x12]
-; CHECK-NEXT: add x13, sp, #176
-; CHECK-NEXT: ldr b16, [sp, #216]
-; CHECK-NEXT: ld1 { v0.b }[4], [x11]
-; CHECK-NEXT: add x11, sp, #48
-; CHECK-NEXT: add x12, sp, #368
-; CHECK-NEXT: ld1 { v2.b }[4], [x13]
+; CHECK-NEXT: ld1 { v6.b }[2], [x10]
+; CHECK-NEXT: add x10, sp, #168
+; CHECK-NEXT: ld1 { v0.b }[3], [x8]
+; CHECK-NEXT: ldr b5, [sp, #216]
; CHECK-NEXT: add x13, sp, #224
-; CHECK-NEXT: add x9, sp, #128
+; CHECK-NEXT: ld1 { v1.b }[3], [x10]
+; CHECK-NEXT: add x10, sp, #40
+; CHECK-NEXT: add x12, sp, #120
+; CHECK-NEXT: ld1 { v6.b }[3], [x10]
+; CHECK-NEXT: add x10, sp, #176
+; CHECK-NEXT: ld1 { v5.b }[1], [x13]
; CHECK-NEXT: mov v4.b[3], w3
-; CHECK-NEXT: ld1 { v3.b }[4], [x11]
-; CHECK-NEXT: ld1 { v16.b }[1], [x13]
-; CHECK-NEXT: ld1 { v0.b }[5], [x10]
-; CHECK-NEXT: add x10, sp, #56
-; CHECK-NEXT: ld1 { v1.b }[3], [x12]
-; CHECK-NEXT: add x12, sp, #184
-; CHECK-NEXT: ldr b5, [sp, #280]
-; CHECK-NEXT: add x11, sp, #376
-; CHECK-NEXT: ld1 { v3.b }[5], [x10]
-; CHECK-NEXT: ld1 { v2.b }[5], [x12]
-; CHECK-NEXT: add x10, sp, #232
+; CHECK-NEXT: ld1 { v0.b }[4], [x11]
+; CHECK-NEXT: add x11, sp, #48
+; CHECK-NEXT: add x8, sp, #360
+; CHECK-NEXT: ld1 { v1.b }[4], [x10]
+; CHECK-NEXT: add x13, sp, #56
+; CHECK-NEXT: ld1 { v6.b }[4], [x11]
+; CHECK-NEXT: ldr b7, [sp, #280]
+; CHECK-NEXT: ld1 { v2.b }[2], [x8]
+; CHECK-NEXT: add x15, sp, #232
+; CHECK-NEXT: ld1 { v0.b }[5], [x12]
+; CHECK-NEXT: add x14, sp, #184
; CHECK-NEXT: mov v4.b[4], w4
+; CHECK-NEXT: ld1 { v5.b }[2], [x15]
+; CHECK-NEXT: add x9, sp, #128
+; CHECK-NEXT: ld1 { v6.b }[5], [x13]
+; CHECK-NEXT: add x13, sp, #288
+; CHECK-NEXT: add x10, sp, #368
+; CHECK-NEXT: ld1 { v7.b }[1], [x13]
+; CHECK-NEXT: ld1 { v1.b }[5], [x14]
+; CHECK-NEXT: ld1 { v2.b }[3], [x10]
+; CHECK-NEXT: add x15, sp, #240
; CHECK-NEXT: ld1 { v0.b }[6], [x9]
-; CHECK-NEXT: add x9, sp, #288
-; CHECK-NEXT: add x15, sp, #64
-; CHECK-NEXT: ld1 { v16.b }[2], [x10]
-; CHECK-NEXT: ldr b17, [sp, #408]
-; CHECK-NEXT: ld1 { v5.b }[1], [x9]
-; CHECK-NEXT: add x14, sp, #192
-; CHECK-NEXT: ld1 { v1.b }[4], [x11]
-; CHECK-NEXT: ld1 { v3.b }[6], [x15]
-; CHECK-NEXT: add x15, sp, #416
-; CHECK-NEXT: ld1 { v2.b }[6], [x14]
-; CHECK-NEXT: add x14, sp, #240
-; CHECK-NEXT: ld1 { v17.b }[1], [x15]
; CHECK-NEXT: add x9, sp, #296
-; CHECK-NEXT: add x8, sp, #136
; CHECK-NEXT: mov v4.b[5], w5
-; CHECK-NEXT: add x13, sp, #384
-; CHECK-NEXT: ld1 { v16.b }[3], [x14]
-; CHECK-NEXT: ld1 { v5.b }[2], [x9]
-; CHECK-NEXT: ld1 { v1.b }[5], [x13]
-; CHECK-NEXT: ld1 { v0.b }[7], [x8]
-; CHECK-NEXT: add x8, sp, #424
-; CHECK-NEXT: add x9, sp, #248
-; CHECK-NEXT: ld1 { v17.b }[2], [x8]
-; CHECK-NEXT: add x8, sp, #304
-; CHECK-NEXT: add x10, sp, #392
-; CHECK-NEXT: ld1 { v16.b }[4], [x9]
-; CHECK-NEXT: ld1 { v5.b }[3], [x8]
+; CHECK-NEXT: add x11, sp, #192
+; CHECK-NEXT: ld1 { v5.b }[3], [x15]
+; CHECK-NEXT: ldr b3, [sp, #408]
+; CHECK-NEXT: ld1 { v7.b }[2], [x9]
+; CHECK-NEXT: add x12, sp, #64
+; CHECK-NEXT: add x13, sp, #376
+; CHECK-NEXT: ld1 { v1.b }[6], [x11]
+; CHECK-NEXT: add x11, sp, #416
+; CHECK-NEXT: ld1 { v6.b }[6], [x12]
+; CHECK-NEXT: add x12, sp, #248
+; CHECK-NEXT: ld1 { v3.b }[1], [x11]
; CHECK-NEXT: mov v4.b[6], w6
-; CHECK-NEXT: ld1 { v1.b }[6], [x10]
-; CHECK-NEXT: add x10, sp, #432
-; CHECK-NEXT: add x9, sp, #256
-; CHECK-NEXT: ld1 { v17.b }[3], [x10]
-; CHECK-NEXT: add x10, sp, #312
-; CHECK-NEXT: ldr b22, [sp, #608]
-; CHECK-NEXT: add x8, sp, #400
-; CHECK-NEXT: ld1 { v16.b }[5], [x9]
-; CHECK-NEXT: ld1 { v5.b }[4], [x10]
-; CHECK-NEXT: add x9, sp, #616
-; CHECK-NEXT: ld1 { v1.b }[7], [x8]
-; CHECK-NEXT: add x8, sp, #440
-; CHECK-NEXT: ld1 { v22.b }[1], [x9]
+; CHECK-NEXT: ld1 { v2.b }[4], [x13]
+; CHECK-NEXT: add x11, sp, #304
+; CHECK-NEXT: ld1 { v5.b }[4], [x12]
+; CHECK-NEXT: ld1 { v7.b }[3], [x11]
+; CHECK-NEXT: add x8, sp, #136
+; CHECK-NEXT: add x15, sp, #384
+; CHECK-NEXT: add x9, sp, #424
+; CHECK-NEXT: ld1 { v0.b }[7], [x8]
+; CHECK-NEXT: ld1 { v3.b }[2], [x9]
+; CHECK-NEXT: ld1 { v2.b }[5], [x15]
+; CHECK-NEXT: add x8, sp, #312
; CHECK-NEXT: mov v4.b[7], w7
-; CHECK-NEXT: ld1 { v17.b }[4], [x8]
+; CHECK-NEXT: add x9, sp, #256
+; CHECK-NEXT: add x10, sp, #200
+; CHECK-NEXT: ld1 { v7.b }[4], [x8]
+; CHECK-NEXT: ld1 { v5.b }[5], [x9]
+; CHECK-NEXT: add x14, sp, #72
+; CHECK-NEXT: ld1 { v1.b }[7], [x10]
+; CHECK-NEXT: add x10, sp, #432
+; CHECK-NEXT: add x8, sp, #392
+; CHECK-NEXT: ld1 { v6.b }[7], [x14]
+; CHECK-NEXT: ld1 { v3.b }[3], [x10]
+; CHECK-NEXT: ld1 { v2.b }[6], [x8]
; CHECK-NEXT: add x8, sp, #320
+; CHECK-NEXT: add x9, sp, #264
+; CHECK-NEXT: sshll v21.8h, v4.8b, #0
+; CHECK-NEXT: ldr b4, [sp, #208]
+; CHECK-NEXT: ld1 { v7.b }[5], [x8]
+; CHECK-NEXT: ld1 { v5.b }[6], [x9]
+; CHECK-NEXT: add x10, sp, #440
+; CHECK-NEXT: add x8, sp, #400
+; CHECK-NEXT: sshll v16.8h, v6.8b, #0
+; CHECK-NEXT: sshll v6.8h, v4.8b, #0
+; CHECK-NEXT: ld1 { v3.b }[4], [x10]
+; CHECK-NEXT: ld1 { v2.b }[7], [x8]
+; CHECK-NEXT: add x8, sp, #272
+; CHECK-NEXT: add x9, sp, #328
+; CHECK-NEXT: ldr b4, [sp, #608]
+; CHECK-NEXT: ld1 { v7.b }[6], [x9]
+; CHECK-NEXT: ld1 { v5.b }[7], [x8]
+; CHECK-NEXT: add x8, sp, #616
; CHECK-NEXT: add x10, sp, #448
-; CHECK-NEXT: ldr b6, [sp, #208]
-; CHECK-NEXT: ld1 { v5.b }[5], [x8]
-; CHECK-NEXT: add x8, sp, #624
-; CHECK-NEXT: ldr b7, [sp, #472]
-; CHECK-NEXT: ld1 { v22.b }[2], [x8]
-; CHECK-NEXT: ld1 { v17.b }[5], [x10]
-; CHECK-NEXT: add x10, sp, #328
-; CHECK-NEXT: sshll v20.8h, v4.8b, #0
-; CHECK-NEXT: ldr b4, [sp, #480]
+; CHECK-NEXT: ld1 { v4.b }[1], [x8]
+; CHECK-NEXT: ldr b18, [sp, #480]
+; CHECK-NEXT: ld1 { v3.b }[5], [x10]
+; CHECK-NEXT: add x9, sp, #336
+; CHECK-NEXT: ldr b17, [sp, #472]
+; CHECK-NEXT: add x8, sp, #488
+; CHECK-NEXT: ld1 { v7.b }[7], [x9]
+; CHECK-NEXT: add x9, sp, #624
+; CHECK-NEXT: ld1 { v18.b }[1], [x8]
+; CHECK-NEXT: sshll v22.8h, v5.8b, #0
; CHECK-NEXT: add x8, sp, #456
-; CHECK-NEXT: ld1 { v5.b }[6], [x10]
-; CHECK-NEXT: add x10, sp, #632
-; CHECK-NEXT: sshll v6.8h, v6.8b, #0
-; CHECK-NEXT: ld1 { v22.b }[3], [x10]
-; CHECK-NEXT: add x10, sp, #488
-; CHECK-NEXT: ld1 { v17.b }[6], [x8]
-; CHECK-NEXT: add x8, sp, #336
-; CHECK-NEXT: ld1 { v4.b }[1], [x10]
-; CHECK-NEXT: sshll v7.8h, v7.8b, #0
-; CHECK-NEXT: ld1 { v5.b }[7], [x8]
-; CHECK-NEXT: add x8, sp, #640
-; CHECK-NEXT: add x9, sp, #264
-; CHECK-NEXT: ld1 { v22.b }[4], [x8]
+; CHECK-NEXT: sshll v5.8h, v17.8b, #0
+; CHECK-NEXT: ld1 { v4.b }[2], [x9]
+; CHECK-NEXT: ld1 { v3.b }[6], [x8]
; CHECK-NEXT: add x8, sp, #496
-; CHECK-NEXT: ld1 { v16.b }[6], [x9]
-; CHECK-NEXT: ld1 { v4.b }[2], [x8]
-; CHECK-NEXT: add x8, sp, #648
-; CHECK-NEXT: smull v18.4s, v6.4h, v7.4h
-; CHECK-NEXT: ldr b7, [sp, #544]
-; CHECK-NEXT: add x9, sp, #272
-; CHECK-NEXT: movi v6.2d, #0000000000000000
-; CHECK-NEXT: ld1 { v22.b }[5], [x8]
+; CHECK-NEXT: sshll v17.8h, v7.8b, #0
+; CHECK-NEXT: add x10, sp, #632
+; CHECK-NEXT: ld1 { v18.b }[2], [x8]
+; CHECK-NEXT: add x9, sp, #464
; CHECK-NEXT: add x8, sp, #504
-; CHECK-NEXT: ld1 { v16.b }[7], [x9]
-; CHECK-NEXT: ld1 { v4.b }[3], [x8]
-; CHECK-NEXT: add x8, sp, #552
-; CHECK-NEXT: add x9, sp, #656
-; CHECK-NEXT: ld1 { v7.b }[1], [x8]
+; CHECK-NEXT: smull v19.4s, v6.4h, v5.4h
+; CHECK-NEXT: movi v5.2d, #0000000000000000
+; CHECK-NEXT: ld1 { v4.b }[3], [x10]
+; CHECK-NEXT: ld1 { v3.b }[7], [x9]
+; CHECK-NEXT: smull v6.4s, v16.4h, v17.4h
+; CHECK-NEXT: add x9, sp, #640
+; CHECK-NEXT: ld1 { v18.b }[3], [x8]
+; CHECK-NEXT: smull2 v16.4s, v16.8h, v17.8h
+; CHECK-NEXT: ldr b17, [sp, #672]
+; CHECK-NEXT: ld1 { v4.b }[4], [x9]
+; CHECK-NEXT: add x9, sp, #680
+; CHECK-NEXT: ldr b20, [sp, #544]
+; CHECK-NEXT: mov v5.s[0], v19.s[0]
; CHECK-NEXT: add x8, sp, #512
-; CHECK-NEXT: ldr b21, [sp, #672]
-; CHECK-NEXT: ld1 { v22.b }[6], [x9]
-; CHECK-NEXT: mov v6.s[0], v18.s[0]
-; CHECK-NEXT: add x9, sp, #664
-; CHECK-NEXT: ld1 { v4.b }[4], [x8]
-; CHECK-NEXT: add x8, sp, #560
-; CHECK-NEXT: sshll v23.8h, v16.8b, #0
-; CHECK-NEXT: ld1 { v7.b }[2], [x8]
-; CHECK-NEXT: add x8, sp, #520
-; CHECK-NEXT: movi v19.2d, #0000000000000000
-; CHECK-NEXT: ld1 { v22.b }[7], [x9]
-; CHECK-NEXT: add x9, sp, #528
-; CHECK-NEXT: add x10, sp, #464
-; CHECK-NEXT: ld1 { v4.b }[5], [x8]
-; CHECK-NEXT: add x8, sp, #568
-; CHECK-NEXT: smull2 v18.4s, v20.8h, v23.8h
-; CHECK-NEXT: ld1 { v7.b }[3], [x8]
-; CHECK-NEXT: add x8, sp, #680
-; CHECK-NEXT: smlal v6.4s, v20.4h, v23.4h
-; CHECK-NEXT: ld1 { v21.b }[1], [x8]
-; CHECK-NEXT: sshll v20.8h, v22.8b, #0
-; CHECK-NEXT: ldr b22, [sp, #736]
-; CHECK-NEXT: ld1 { v4.b }[6], [x9]
-; CHECK-NEXT: add x9, sp, #576
-; CHECK-NEXT: ldr b23, [sp, #1000]
-; CHECK-NEXT: ld1 { v7.b }[4], [x9]
-; CHECK-NEXT: add x9, sp, #688
-; CHECK-NEXT: sshll v24.8h, v22.8b, #0
-; CHECK-NEXT: ld1 { v21.b }[2], [x9]
+; CHECK-NEXT: ld1 { v17.b }[1], [x9]
+; CHECK-NEXT: add x11, sp, #552
+; CHECK-NEXT: add x10, sp, #648
+; CHECK-NEXT: ld1 { v18.b }[4], [x8]
+; CHECK-NEXT: ld1 { v20.b }[1], [x11]
+; CHECK-NEXT: ld1 { v4.b }[5], [x10]
+; CHECK-NEXT: add x10, sp, #688
+; CHECK-NEXT: add x9, sp, #520
+; CHECK-NEXT: ld1 { v17.b }[2], [x10]
+; CHECK-NEXT: add x10, sp, #560
+; CHECK-NEXT: smull2 v7.4s, v21.8h, v22.8h
+; CHECK-NEXT: ld1 { v18.b }[5], [x9]
+; CHECK-NEXT: smlal v5.4s, v21.4h, v22.4h
+; CHECK-NEXT: ld1 { v20.b }[2], [x10]
+; CHECK-NEXT: ldr b21, [sp, #736]
+; CHECK-NEXT: ldr b22, [sp, #1000]
+; CHECK-NEXT: add x8, sp, #656
; CHECK-NEXT: add x9, sp, #696
-; CHECK-NEXT: sshll v25.8h, v23.8b, #0
-; CHECK-NEXT: add x8, sp, #536
-; CHECK-NEXT: ldr b22, [sp, #872]
-; CHECK-NEXT: ldr b23, [sp, #936]
-; CHECK-NEXT: ld1 { v4.b }[7], [x8]
-; CHECK-NEXT: add x8, sp, #584
-; CHECK-NEXT: ld1 { v17.b }[7], [x10]
-; CHECK-NEXT: ld1 { v21.b }[3], [x9]
-; CHECK-NEXT: ld1 { v7.b }[5], [x8]
-; CHECK-NEXT: add x8, sp, #880
-; CHECK-NEXT: add x9, sp, #704
-; CHECK-NEXT: smull v25.4s, v24.4h, v25.4h
-; CHECK-NEXT: ldr b24, [sp, #744]
-; CHECK-NEXT: ld1 { v22.b }[1], [x8]
-; CHECK-NEXT: add x8, sp, #944
-; CHECK-NEXT: add x10, sp, #888
-; CHECK-NEXT: ld1 { v21.b }[4], [x9]
-; CHECK-NEXT: add x9, sp, #752
-; CHECK-NEXT: ld1 { v23.b }[1], [x8]
-; CHECK-NEXT: ld1 { v24.b }[1], [x9]
-; CHECK-NEXT: add x8, sp, #712
+; CHECK-NEXT: add x11, sp, #568
+; CHECK-NEXT: ld1 { v4.b }[6], [x8]
+; CHECK-NEXT: add x8, sp, #528
+; CHECK-NEXT: ld1 { v17.b }[3], [x9]
+; CHECK-NEXT: sshll v21.8h, v21.8b, #0
+; CHECK-NEXT: sshll v24.8h, v22.8b, #0
+; CHECK-NEXT: ld1 { v18.b }[6], [x8]
+; CHECK-NEXT: ld1 { v20.b }[3], [x11]
+; CHECK-NEXT: add x10, sp, #704
+; CHECK-NEXT: ldr b23, [sp, #808]
+; CHECK-NEXT: movi v19.2d, #0000000000000000
+; CHECK-NEXT: add x9, sp, #536
+; CHECK-NEXT: ld1 { v17.b }[4], [x10]
+; CHECK-NEXT: add x10, sp, #576
+; CHECK-NEXT: ldr b22, [sp, #744]
+; CHECK-NEXT: add x11, sp, #816
+; CHECK-NEXT: smull v24.4s, v21.4h, v24.4h
+; CHECK-NEXT: ld1 { v18.b }[7], [x9]
+; CHECK-NEXT: ld1 { v20.b }[4], [x10]
+; CHECK-NEXT: add x10, sp, #752
+; CHECK-NEXT: ld1 { v23.b }[1], [x11]
+; CHECK-NEXT: add x9, sp, #712
+; CHECK-NEXT: ld1 { v22.b }[1], [x10]
+; CHECK-NEXT: ld1 { v17.b }[5], [x9]
+; CHECK-NEXT: add x9, sp, #584
+; CHECK-NEXT: add x10, sp, #824
+; CHECK-NEXT: sshll v21.8h, v18.8b, #0
+; CHECK-NEXT: ld1 { v20.b }[5], [x9]
; CHECK-NEXT: add x9, sp, #760
-; CHECK-NEXT: ld1 { v22.b }[2], [x10]
-; CHECK-NEXT: add x10, sp, #952
-; CHECK-NEXT: mov v19.s[0], v25.s[0]
-; CHECK-NEXT: ldr b25, [sp, #808]
+; CHECK-NEXT: ldr b18, [sp, #936]
; CHECK-NEXT: ld1 { v23.b }[2], [x10]
-; CHECK-NEXT: ld1 { v21.b }[5], [x8]
-; CHECK-NEXT: ld1 { v24.b }[2], [x9]
-; CHECK-NEXT: add x8, sp, #816
-; CHECK-NEXT: add x9, sp, #896
-; CHECK-NEXT: ld1 { v25.b }[1], [x8]
-; CHECK-NEXT: add x8, sp, #960
-; CHECK-NEXT: ld1 { v22.b }[3], [x9]
-; CHECK-NEXT: add x9, sp, #768
-; CHECK-NEXT: ld1 { v23.b }[3], [x8]
-; CHECK-NEXT: add x10, sp, #904
-; CHECK-NEXT: ld1 { v24.b }[3], [x9]
-; CHECK-NEXT: add x9, sp, #824
-; CHECK-NEXT: add x8, sp, #720
-; CHECK-NEXT: ld1 { v25.b }[2], [x9]
-; CHECK-NEXT: add x9, sp, #968
-; CHECK-NEXT: ld1 { v22.b }[4], [x10]
-; CHECK-NEXT: add x10, sp, #776
-; CHECK-NEXT: ld1 { v23.b }[4], [x9]
-; CHECK-NEXT: ld1 { v21.b }[6], [x8]
-; CHECK-NEXT: ld1 { v24.b }[4], [x10]
-; CHECK-NEXT: add x8, sp, #832
-; CHECK-NEXT: add x9, sp, #912
-; CHECK-NEXT: ld1 { v25.b }[3], [x8]
-; CHECK-NEXT: add x8, sp, #976
-; CHECK-NEXT: ld1 { v22.b }[5], [x9]
-; CHECK-NEXT: add x9, sp, #784
-; CHECK-NEXT: ld1 { v23.b }[5], [x8]
-; CHECK-NEXT: add x10, sp, #920
-; CHECK-NEXT: ld1 { v24.b }[5], [x9]
-; CHECK-NEXT: add x9, sp, #840
-; CHECK-NEXT: add x8, sp, #728
-; CHECK-NEXT: ld1 { v25.b }[4], [x9]
-; CHECK-NEXT: add x9, sp, #984
-; CHECK-NEXT: ld1 { v22.b }[6], [x10]
-; CHECK-NEXT: add x10, sp, #792
-; CHECK-NEXT: ld1 { v23.b }[6], [x9]
-; CHECK-NEXT: ld1 { v21.b }[7], [x8]
-; CHECK-NEXT: ld1 { v24.b }[6], [x10]
-; CHECK-NEXT: add x8, sp, #848
-; CHECK-NEXT: add x9, sp, #928
-; CHECK-NEXT: ld1 { v25.b }[5], [x8]
-; CHECK-NEXT: add x12, sp, #72
-; CHECK-NEXT: add x8, sp, #992
-; CHECK-NEXT: ld1 { v22.b }[7], [x9]
-; CHECK-NEXT: add x9, sp, #800
-; CHECK-NEXT: ld1 { v3.b }[7], [x12]
-; CHECK-NEXT: ld1 { v23.b }[7], [x8]
-; CHECK-NEXT: add x8, sp, #592
-; CHECK-NEXT: ld1 { v24.b }[7], [x9]
-; CHECK-NEXT: add x9, sp, #856
-; CHECK-NEXT: ld1 { v7.b }[6], [x8]
-; CHECK-NEXT: add x11, sp, #200
-; CHECK-NEXT: ld1 { v25.b }[6], [x9]
-; CHECK-NEXT: sshll v3.8h, v3.8b, #0
-; CHECK-NEXT: sshll v5.8h, v5.8b, #0
-; CHECK-NEXT: sshll v4.8h, v4.8b, #0
-; CHECK-NEXT: sshll v21.8h, v21.8b, #0
+; CHECK-NEXT: mov v19.s[0], v24.s[0]
+; CHECK-NEXT: ldr b24, [sp, #872]
+; CHECK-NEXT: ld1 { v22.b }[2], [x9]
+; CHECK-NEXT: add x9, sp, #944
+; CHECK-NEXT: add x11, sp, #880
+; CHECK-NEXT: add x10, sp, #768
+; CHECK-NEXT: ld1 { v18.b }[1], [x9]
+; CHECK-NEXT: add x9, sp, #832
+; CHECK-NEXT: ld1 { v24.b }[1], [x11]
+; CHECK-NEXT: ld1 { v23.b }[3], [x9]
+; CHECK-NEXT: ld1 { v22.b }[3], [x10]
+; CHECK-NEXT: add x10, sp, #952
+; CHECK-NEXT: add x12, sp, #888
+; CHECK-NEXT: add x9, sp, #592
+; CHECK-NEXT: add x11, sp, #776
+; CHECK-NEXT: ld1 { v18.b }[2], [x10]
+; CHECK-NEXT: add x10, sp, #840
+; CHECK-NEXT: ld1 { v24.b }[2], [x12]
+; CHECK-NEXT: ld1 { v23.b }[4], [x10]
+; CHECK-NEXT: ld1 { v22.b }[4], [x11]
+; CHECK-NEXT: ld1 { v20.b }[6], [x9]
+; CHECK-NEXT: add x9, sp, #960
+; CHECK-NEXT: add x11, sp, #896
+; CHECK-NEXT: add x10, sp, #784
+; CHECK-NEXT: ld1 { v18.b }[3], [x9]
+; CHECK-NEXT: add x9, sp, #848
+; CHECK-NEXT: ld1 { v24.b }[3], [x11]
+; CHECK-NEXT: ld1 { v23.b }[5], [x9]
+; CHECK-NEXT: ld1 { v22.b }[5], [x10]
+; CHECK-NEXT: add x10, sp, #968
+; CHECK-NEXT: add x12, sp, #904
+; CHECK-NEXT: add x9, sp, #600
+; CHECK-NEXT: add x11, sp, #792
+; CHECK-NEXT: ld1 { v18.b }[4], [x10]
+; CHECK-NEXT: add x10, sp, #856
+; CHECK-NEXT: ld1 { v24.b }[4], [x12]
+; CHECK-NEXT: ld1 { v23.b }[6], [x10]
+; CHECK-NEXT: ld1 { v22.b }[6], [x11]
+; CHECK-NEXT: ld1 { v20.b }[7], [x9]
+; CHECK-NEXT: add x9, sp, #976
+; CHECK-NEXT: add x11, sp, #912
+; CHECK-NEXT: add x10, sp, #800
+; CHECK-NEXT: ld1 { v18.b }[5], [x9]
+; CHECK-NEXT: add x9, sp, #864
+; CHECK-NEXT: ld1 { v24.b }[5], [x11]
+; CHECK-NEXT: ld1 { v23.b }[7], [x9]
+; CHECK-NEXT: add x9, sp, #720
+; CHECK-NEXT: ld1 { v22.b }[7], [x10]
+; CHECK-NEXT: add x10, sp, #984
+; CHECK-NEXT: ld1 { v17.b }[6], [x9]
+; CHECK-NEXT: add x9, sp, #920
+; CHECK-NEXT: ld1 { v18.b }[6], [x10]
+; CHECK-NEXT: ld1 { v24.b }[6], [x9]
+; CHECK-NEXT: add x10, sp, #728
+; CHECK-NEXT: add x8, sp, #664
+; CHECK-NEXT: sshll v20.8h, v20.8b, #0
; CHECK-NEXT: sshll v22.8h, v22.8b, #0
; CHECK-NEXT: sshll v23.8h, v23.8b, #0
-; CHECK-NEXT: add x8, sp, #600
-; CHECK-NEXT: sshll v24.8h, v24.8b, #0
-; CHECK-NEXT: add x9, sp, #864
-; CHECK-NEXT: ld1 { v2.b }[7], [x11]
-; CHECK-NEXT: ld1 { v7.b }[7], [x8]
-; CHECK-NEXT: ld1 { v25.b }[7], [x9]
-; CHECK-NEXT: smull v16.4s, v3.4h, v5.4h
-; CHECK-NEXT: smull2 v3.4s, v3.8h, v5.8h
-; CHECK-NEXT: smull v5.4s, v21.4h, v23.4h
-; CHECK-NEXT: smull2 v21.4s, v21.8h, v23.8h
-; CHECK-NEXT: smull2 v23.4s, v20.8h, v22.8h
-; CHECK-NEXT: smlal v19.4s, v4.4h, v24.4h
-; CHECK-NEXT: sshll v2.8h, v2.8b, #0
-; CHECK-NEXT: sshll v17.8h, v17.8b, #0
+; CHECK-NEXT: add x9, sp, #992
+; CHECK-NEXT: ld1 { v17.b }[7], [x10]
+; CHECK-NEXT: add x10, sp, #928
+; CHECK-NEXT: ld1 { v18.b }[7], [x9]
+; CHECK-NEXT: ld1 { v4.b }[7], [x8]
+; CHECK-NEXT: ld1 { v24.b }[7], [x10]
+; CHECK-NEXT: smlal v19.4s, v21.4h, v22.4h
+; CHECK-NEXT: smull2 v21.4s, v21.8h, v22.8h
+; CHECK-NEXT: smull v22.4s, v20.4h, v23.4h
+; CHECK-NEXT: smull2 v20.4s, v20.8h, v23.8h
; CHECK-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-NEXT: sshll v7.8h, v7.8b, #0
-; CHECK-NEXT: sshll v25.8h, v25.8b, #0
-; CHECK-NEXT: smlal2 v3.4s, v2.8h, v17.8h
-; CHECK-NEXT: smlal v16.4s, v2.4h, v17.4h
-; CHECK-NEXT: smlal2 v23.4s, v4.8h, v24.8h
-; CHECK-NEXT: smlal2 v18.4s, v0.8h, v1.8h
-; CHECK-NEXT: smlal v6.4s, v0.4h, v1.4h
-; CHECK-NEXT: smlal v19.4s, v20.4h, v22.4h
-; CHECK-NEXT: smlal2 v21.4s, v7.8h, v25.8h
-; CHECK-NEXT: smlal v5.4s, v7.4h, v25.4h
-; CHECK-NEXT: add v0.4s, v18.4s, v3.4s
-; CHECK-NEXT: add v1.4s, v6.4s, v16.4s
-; CHECK-NEXT: add v2.4s, v23.4s, v21.4s
-; CHECK-NEXT: add v3.4s, v19.4s, v5.4s
+; CHECK-NEXT: sshll v3.8h, v3.8b, #0
+; CHECK-NEXT: sshll v2.8h, v2.8b, #0
+; CHECK-NEXT: sshll v17.8h, v17.8b, #0
+; CHECK-NEXT: sshll v18.8h, v18.8b, #0
+; CHECK-NEXT: sshll v4.8h, v4.8b, #0
+; CHECK-NEXT: sshll v23.8h, v24.8b, #0
+; CHECK-NEXT: smlal2 v16.4s, v1.8h, v3.8h
+; CHECK-NEXT: smlal v6.4s, v1.4h, v3.4h
+; CHECK-NEXT: smlal2 v7.4s, v0.8h, v2.8h
+; CHECK-NEXT: smlal v5.4s, v0.4h, v2.4h
+; CHECK-NEXT: smlal2 v20.4s, v17.8h, v18.8h
+; CHECK-NEXT: smlal v22.4s, v17.4h, v18.4h
+; CHECK-NEXT: smlal2 v21.4s, v4.8h, v23.8h
+; CHECK-NEXT: smlal v19.4s, v4.4h, v23.4h
+; CHECK-NEXT: add v0.4s, v7.4s, v16.4s
+; CHECK-NEXT: add v1.4s, v5.4s, v6.4s
+; CHECK-NEXT: add v2.4s, v21.4s, v20.4s
+; CHECK-NEXT: add v3.4s, v19.4s, v22.4s
; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-NEXT: add v1.4s, v3.4s, v2.4s
; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
@@ -2050,10 +2050,10 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %
; CHECK-NEXT: ld1 { v3.b }[2], [x10]
; CHECK-NEXT: ld1 { v5.b }[2], [x8]
; CHECK-NEXT: add x8, sp, #176
-; CHECK-NEXT: ldr b6, [sp, #544]
+; CHECK-NEXT: ldr b6, [sp, #672]
; CHECK-NEXT: ld1 { v0.b }[4], [x12]
-; CHECK-NEXT: add x14, sp, #552
-; CHECK-NEXT: ldr b7, [sp, #672]
+; CHECK-NEXT: add x14, sp, #680
+; CHECK-NEXT: ldr b7, [sp, #544]
; CHECK-NEXT: ld1 { v2.b }[4], [x8]
; CHECK-NEXT: add x13, sp, #40
; CHECK-NEXT: ld1 { v6.b }[1], [x14]
@@ -2061,7 +2061,7 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %
; CHECK-NEXT: add x11, sp, #128
; CHECK-NEXT: ld1 { v3.b }[3], [x13]
; CHECK-NEXT: ld1 { v0.b }[5], [x9]
-; CHECK-NEXT: add x9, sp, #680
+; CHECK-NEXT: add x9, sp, #552
; CHECK-NEXT: add x13, sp, #184
; CHECK-NEXT: ld1 { v7.b }[1], [x9]
; CHECK-NEXT: ld1 { v2.b }[5], [x13]
@@ -2070,26 +2070,26 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %
; CHECK-NEXT: ld1 { v4.b }[2], [x13]
; CHECK-NEXT: add x10, sp, #136
; CHECK-NEXT: ld1 { v0.b }[6], [x11]
-; CHECK-NEXT: add x11, sp, #560
+; CHECK-NEXT: add x11, sp, #688
; CHECK-NEXT: ld1 { v5.b }[3], [x15]
; CHECK-NEXT: ld1 { v6.b }[2], [x11]
-; CHECK-NEXT: add x11, sp, #688
+; CHECK-NEXT: add x11, sp, #560
; CHECK-NEXT: mov v1.b[3], w3
; CHECK-NEXT: ld1 { v7.b }[2], [x11]
; CHECK-NEXT: add x9, sp, #632
; CHECK-NEXT: add x11, sp, #512
; CHECK-NEXT: ld1 { v0.b }[7], [x10]
; CHECK-NEXT: ld1 { v4.b }[3], [x9]
-; CHECK-NEXT: add x9, sp, #568
-; CHECK-NEXT: add x10, sp, #696
+; CHECK-NEXT: add x9, sp, #696
+; CHECK-NEXT: add x10, sp, #568
; CHECK-NEXT: ld1 { v6.b }[3], [x9]
; CHECK-NEXT: ld1 { v5.b }[4], [x11]
; CHECK-NEXT: ld1 { v7.b }[3], [x10]
; CHECK-NEXT: add x9, sp, #640
; CHECK-NEXT: mov v1.b[4], w4
; CHECK-NEXT: ld1 { v4.b }[4], [x9]
-; CHECK-NEXT: add x9, sp, #576
-; CHECK-NEXT: add x10, sp, #704
+; CHECK-NEXT: add x9, sp, #704
+; CHECK-NEXT: add x10, sp, #576
; CHECK-NEXT: add x11, sp, #520
; CHECK-NEXT: ld1 { v6.b }[4], [x9]
; CHECK-NEXT: ldr b18, [sp, #736]
@@ -2101,8 +2101,8 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %
; CHECK-NEXT: add x9, sp, #648
; CHECK-NEXT: ld1 { v3.b }[4], [x8]
; CHECK-NEXT: add x10, sp, #528
-; CHECK-NEXT: add x11, sp, #584
-; CHECK-NEXT: add x12, sp, #712
+; CHECK-NEXT: add x11, sp, #712
+; CHECK-NEXT: add x12, sp, #584
; CHECK-NEXT: sshll v18.8h, v18.8b, #0
; CHECK-NEXT: mov v1.b[5], w5
; CHECK-NEXT: ld1 { v6.b }[5], [x11]
@@ -2114,8 +2114,8 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %
; CHECK-NEXT: ld1 { v3.b }[5], [x14]
; CHECK-NEXT: add x9, sp, #656
; CHECK-NEXT: add x10, sp, #536
-; CHECK-NEXT: add x11, sp, #592
-; CHECK-NEXT: add x12, sp, #720
+; CHECK-NEXT: add x11, sp, #720
+; CHECK-NEXT: add x12, sp, #592
; CHECK-NEXT: sshll v18.4s, v18.4h, #0
; CHECK-NEXT: ldr b16, [sp, #208]
; CHECK-NEXT: ld1 { v6.b }[6], [x11]
@@ -2127,8 +2127,8 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %
; CHECK-NEXT: sshll v16.8h, v16.8b, #0
; CHECK-NEXT: ld1 { v3.b }[6], [x8]
; CHECK-NEXT: add x8, sp, #664
-; CHECK-NEXT: add x9, sp, #600
-; CHECK-NEXT: add x10, sp, #728
+; CHECK-NEXT: add x9, sp, #728
+; CHECK-NEXT: add x10, sp, #600
; CHECK-NEXT: mov v17.s[0], v18.s[0]
; CHECK-NEXT: ld1 { v6.b }[7], [x9]
; CHECK-NEXT: ld1 { v7.b }[7], [x10]
@@ -2151,7 +2151,7 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %
; CHECK-NEXT: sshll v2.8h, v2.8b, #0
; CHECK-NEXT: sshll v3.8h, v3.8b, #0
; CHECK-NEXT: saddl2 v16.4s, v7.8h, v6.8h
-; CHECK-NEXT: saddl2 v5.4s, v4.8h, v5.8h
+; CHECK-NEXT: saddl2 v5.4s, v5.8h, v4.8h
; CHECK-NEXT: saddl v6.4s, v7.4h, v6.4h
; CHECK-NEXT: saddw v4.4s, v17.4s, v4.4h
; CHECK-NEXT: saddl2 v17.4s, v1.8h, v0.8h
diff --git a/llvm/test/CodeGen/AArch64/pr58431.ll b/llvm/test/CodeGen/AArch64/pr58431.ll
index dcd9759..e87d8f7 100644
--- a/llvm/test/CodeGen/AArch64/pr58431.ll
+++ b/llvm/test/CodeGen/AArch64/pr58431.ll
@@ -4,8 +4,8 @@
define i32 @f(i64 %0) {
; CHECK-LABEL: f:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #10
-; CHECK-NEXT: mov w9, w0
+; CHECK-NEXT: mov w8, #10 // =0xa
+; CHECK-NEXT: and x9, x0, #0xffffffff
; CHECK-NEXT: udiv x10, x9, x8
; CHECK-NEXT: msub x0, x10, x8, x9
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
diff --git a/llvm/test/CodeGen/AArch64/selectopt-not.ll b/llvm/test/CodeGen/AArch64/selectopt-not.ll
index 7a949d1..a7939d6 100644
--- a/llvm/test/CodeGen/AArch64/selectopt-not.ll
+++ b/llvm/test/CodeGen/AArch64/selectopt-not.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -select-optimize -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -S < %s | FileCheck %s
+; RUN: opt -select-optimize -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-STANDARD
+; RUN: opt -select-optimize -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -S -disable-loop-level-heuristics < %s | FileCheck %s --check-prefixes=CHECK,CHECK-FORCED
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "aarch64"
@@ -29,10 +30,10 @@ define i32 @minloc1(ptr nocapture readonly %0, ptr nocapture readonly %1, ptr no
; CHECK-NEXT: [[TMP21:%.*]] = sub i64 0, [[TMP7]]
; CHECK-NEXT: br label [[DOTPREHEADER35:%.*]]
; CHECK: .preheader35:
-; CHECK-NEXT: [[TMP22:%.*]] = phi i32 [ 2147483647, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP30:%.*]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[TMP23:%.*]] = phi i64 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[IV_N:%.*]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[DOT045:%.*]] = phi i1 [ false, [[DOTPREHEADER35_LR_PH]] ], [ [[DOT2:%.*]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[DOTLCSSA364144:%.*]] = phi i32 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP29:%.*]], [[DOTPREHEADER35]] ]
+; CHECK-NEXT: [[TMP22:%.*]] = phi i32 [ 2147483647, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP30:%.*]], [[SELECT_END:%.*]] ]
+; CHECK-NEXT: [[TMP23:%.*]] = phi i64 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[IV_N:%.*]], [[SELECT_END]] ]
+; CHECK-NEXT: [[DOT045:%.*]] = phi i1 [ false, [[DOTPREHEADER35_LR_PH]] ], [ [[DOT2:%.*]], [[SELECT_END]] ]
+; CHECK-NEXT: [[DOTLCSSA364144:%.*]] = phi i32 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP29:%.*]], [[SELECT_END]] ]
; CHECK-NEXT: [[TMP24:%.*]] = mul nsw i64 [[TMP23]], [[TMP11]]
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP24]]
; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4
@@ -40,15 +41,20 @@ define i32 @minloc1(ptr nocapture readonly %0, ptr nocapture readonly %1, ptr no
; CHECK-NEXT: [[TMP28:%.*]] = icmp sge i32 [[TMP26]], [[TMP22]]
; CHECK-NEXT: [[DOTNOT33:%.*]] = and i1 [[DOT045]], [[TMP28]]
; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[TMP27]], i1 true, i1 [[DOTNOT33]]
-; CHECK-NEXT: [[TMP29]] = select i1 [[OR_COND]], i32 [[DOTLCSSA364144]], i32 1
+; CHECK-NEXT: [[OR_COND_FROZEN:%.*]] = freeze i1 [[OR_COND]]
+; CHECK-NEXT: br i1 [[OR_COND_FROZEN]], label [[SELECT_END]], label [[SELECT_FALSE:%.*]]
+; CHECK: select.false:
+; CHECK-NEXT: br label [[SELECT_END]]
+; CHECK: select.end:
+; CHECK-NEXT: [[TMP29]] = phi i32 [ [[DOTLCSSA364144]], [[DOTPREHEADER35]] ], [ 1, [[SELECT_FALSE]] ]
+; CHECK-NEXT: [[DOT2]] = phi i1 [ [[DOT045]], [[DOTPREHEADER35]] ], [ true, [[SELECT_FALSE]] ]
+; CHECK-NEXT: [[TMP30]] = phi i32 [ [[TMP22]], [[DOTPREHEADER35]] ], [ [[TMP20]], [[SELECT_FALSE]] ]
; CHECK-NEXT: [[NOT_OR_COND:%.*]] = xor i1 [[OR_COND]], true
-; CHECK-NEXT: [[DOT2]] = select i1 [[NOT_OR_COND]], i1 true, i1 [[DOT045]]
-; CHECK-NEXT: [[TMP30]] = select i1 [[OR_COND]], i32 [[TMP22]], i32 [[TMP20]]
; CHECK-NEXT: [[IV_N]] = add nuw nsw i64 [[TMP23]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_N]], [[TMP9]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[DOTPREHEADER]], label [[DOTPREHEADER35]]
; CHECK: .preheader:
-; CHECK-NEXT: [[DOTLCSSA3641_LCSSA:%.*]] = phi i32 [ 0, [[TMP3:%.*]] ], [ [[TMP29]], [[DOTPREHEADER35]] ]
+; CHECK-NEXT: [[DOTLCSSA3641_LCSSA:%.*]] = phi i32 [ 0, [[TMP3:%.*]] ], [ [[TMP29]], [[SELECT_END]] ]
; CHECK-NEXT: ret i32 [[DOTLCSSA3641_LCSSA]]
;
%4 = getelementptr i8, ptr %0, i64 40
@@ -101,53 +107,106 @@ define i32 @minloc1(ptr nocapture readonly %0, ptr nocapture readonly %1, ptr no
}
define i32 @minloc1_otherunusednot(ptr nocapture readonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) {
-; CHECK-LABEL: @minloc1_otherunusednot(
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP0:%.*]], i64 40
-; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP4]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP0]], i64 64
-; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 8
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP0]], i64 80
-; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP8]], align 8
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP0]], i64 88
-; CHECK-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP10]], align 8
-; CHECK-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TMP0]], align 8
-; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP1:%.*]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
-; CHECK-NEXT: [[TMP15:%.*]] = add nsw i64 [[TMP14]], -1
-; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], [[TMP5]]
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP12]], i64 [[TMP16]]
-; CHECK-NEXT: [[TMP18:%.*]] = shl i64 [[TMP7]], 3
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP17]], i64 [[TMP18]]
-; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP2:%.*]], align 4
-; CHECK-NEXT: [[DOTNOT:%.*]] = icmp slt i64 [[TMP9]], 1
-; CHECK-NEXT: br i1 [[DOTNOT]], label [[DOTPREHEADER:%.*]], label [[DOTPREHEADER35_LR_PH:%.*]]
-; CHECK: .preheader35.lr.ph:
-; CHECK-NEXT: [[TMP21:%.*]] = sub i64 0, [[TMP7]]
-; CHECK-NEXT: br label [[DOTPREHEADER35:%.*]]
-; CHECK: .preheader35:
-; CHECK-NEXT: [[TMP22:%.*]] = phi i32 [ 2147483647, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP30:%.*]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[TMP23:%.*]] = phi i64 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[IV_N:%.*]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[DOT045:%.*]] = phi i1 [ false, [[DOTPREHEADER35_LR_PH]] ], [ [[DOT2:%.*]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[DOTLCSSA364144:%.*]] = phi i32 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP29:%.*]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[TMP24:%.*]] = mul nsw i64 [[TMP23]], [[TMP11]]
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP24]]
-; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4
-; CHECK-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], [[TMP20]]
-; CHECK-NEXT: [[TMP28:%.*]] = icmp sge i32 [[TMP26]], [[TMP22]]
-; CHECK-NEXT: [[DOTNOT33:%.*]] = and i1 [[DOT045]], [[TMP28]]
-; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[TMP27]], i1 true, i1 [[DOTNOT33]]
-; CHECK-NEXT: [[TMP29]] = select i1 [[OR_COND]], i32 [[DOTLCSSA364144]], i32 1
-; CHECK-NEXT: [[DOT2]] = select i1 [[OR_COND]], i1 [[DOT045]], i1 true
-; CHECK-NEXT: [[NOT_OR_COND:%.*]] = xor i1 [[OR_COND]], true
-; CHECK-NEXT: [[TMP30]] = select i1 [[OR_COND]], i32 [[TMP22]], i32 [[TMP20]]
-; CHECK-NEXT: [[IV_N]] = add nuw nsw i64 [[TMP23]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_N]], [[TMP9]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[DOTPREHEADER]], label [[DOTPREHEADER35]]
-; CHECK: .preheader:
-; CHECK-NEXT: [[DOTLCSSA3641_LCSSA:%.*]] = phi i32 [ 0, [[TMP3:%.*]] ], [ [[TMP29]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[P:%.*]] = phi i1 [ false, [[TMP3]] ], [ [[NOT_OR_COND]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[Q:%.*]] = select i1 [[P]], i32 [[DOTLCSSA3641_LCSSA]], i32 1
-; CHECK-NEXT: ret i32 [[Q]]
+; CHECK-STANDARD-LABEL: @minloc1_otherunusednot(
+; CHECK-STANDARD-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP0:%.*]], i64 40
+; CHECK-STANDARD-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP4]], align 8
+; CHECK-STANDARD-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP0]], i64 64
+; CHECK-STANDARD-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 8
+; CHECK-STANDARD-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP0]], i64 80
+; CHECK-STANDARD-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP8]], align 8
+; CHECK-STANDARD-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP0]], i64 88
+; CHECK-STANDARD-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP10]], align 8
+; CHECK-STANDARD-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TMP0]], align 8
+; CHECK-STANDARD-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP1:%.*]], align 4
+; CHECK-STANDARD-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
+; CHECK-STANDARD-NEXT: [[TMP15:%.*]] = add nsw i64 [[TMP14]], -1
+; CHECK-STANDARD-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], [[TMP5]]
+; CHECK-STANDARD-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP12]], i64 [[TMP16]]
+; CHECK-STANDARD-NEXT: [[TMP18:%.*]] = shl i64 [[TMP7]], 3
+; CHECK-STANDARD-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP17]], i64 [[TMP18]]
+; CHECK-STANDARD-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP2:%.*]], align 4
+; CHECK-STANDARD-NEXT: [[DOTNOT:%.*]] = icmp slt i64 [[TMP9]], 1
+; CHECK-STANDARD-NEXT: br i1 [[DOTNOT]], label [[DOTPREHEADER:%.*]], label [[DOTPREHEADER35_LR_PH:%.*]]
+; CHECK-STANDARD: .preheader35.lr.ph:
+; CHECK-STANDARD-NEXT: [[TMP21:%.*]] = sub i64 0, [[TMP7]]
+; CHECK-STANDARD-NEXT: br label [[DOTPREHEADER35:%.*]]
+; CHECK-STANDARD: .preheader35:
+; CHECK-STANDARD-NEXT: [[TMP22:%.*]] = phi i32 [ 2147483647, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP30:%.*]], [[DOTPREHEADER35]] ]
+; CHECK-STANDARD-NEXT: [[TMP23:%.*]] = phi i64 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[IV_N:%.*]], [[DOTPREHEADER35]] ]
+; CHECK-STANDARD-NEXT: [[DOT045:%.*]] = phi i1 [ false, [[DOTPREHEADER35_LR_PH]] ], [ [[DOT2:%.*]], [[DOTPREHEADER35]] ]
+; CHECK-STANDARD-NEXT: [[DOTLCSSA364144:%.*]] = phi i32 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP29:%.*]], [[DOTPREHEADER35]] ]
+; CHECK-STANDARD-NEXT: [[TMP24:%.*]] = mul nsw i64 [[TMP23]], [[TMP11]]
+; CHECK-STANDARD-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP24]]
+; CHECK-STANDARD-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4
+; CHECK-STANDARD-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], [[TMP20]]
+; CHECK-STANDARD-NEXT: [[TMP28:%.*]] = icmp sge i32 [[TMP26]], [[TMP22]]
+; CHECK-STANDARD-NEXT: [[DOTNOT33:%.*]] = and i1 [[DOT045]], [[TMP28]]
+; CHECK-STANDARD-NEXT: [[OR_COND:%.*]] = select i1 [[TMP27]], i1 true, i1 [[DOTNOT33]]
+; CHECK-STANDARD-NEXT: [[TMP29]] = select i1 [[OR_COND]], i32 [[DOTLCSSA364144]], i32 1
+; CHECK-STANDARD-NEXT: [[DOT2]] = select i1 [[OR_COND]], i1 [[DOT045]], i1 true
+; CHECK-STANDARD-NEXT: [[NOT_OR_COND:%.*]] = xor i1 [[OR_COND]], true
+; CHECK-STANDARD-NEXT: [[TMP30]] = select i1 [[OR_COND]], i32 [[TMP22]], i32 [[TMP20]]
+; CHECK-STANDARD-NEXT: [[IV_N]] = add nuw nsw i64 [[TMP23]], 1
+; CHECK-STANDARD-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_N]], [[TMP9]]
+; CHECK-STANDARD-NEXT: br i1 [[EXITCOND_NOT]], label [[DOTPREHEADER]], label [[DOTPREHEADER35]]
+; CHECK-STANDARD: .preheader:
+; CHECK-STANDARD-NEXT: [[DOTLCSSA3641_LCSSA:%.*]] = phi i32 [ 0, [[TMP3:%.*]] ], [ [[TMP29]], [[DOTPREHEADER35]] ]
+; CHECK-STANDARD-NEXT: [[P:%.*]] = phi i1 [ false, [[TMP3]] ], [ [[NOT_OR_COND]], [[DOTPREHEADER35]] ]
+; CHECK-STANDARD-NEXT: [[Q:%.*]] = select i1 [[P]], i32 [[DOTLCSSA3641_LCSSA]], i32 1
+; CHECK-STANDARD-NEXT: ret i32 [[Q]]
+;
+; CHECK-FORCED-LABEL: @minloc1_otherunusednot(
+; CHECK-FORCED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP0:%.*]], i64 40
+; CHECK-FORCED-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP4]], align 8
+; CHECK-FORCED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP0]], i64 64
+; CHECK-FORCED-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 8
+; CHECK-FORCED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP0]], i64 80
+; CHECK-FORCED-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP8]], align 8
+; CHECK-FORCED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP0]], i64 88
+; CHECK-FORCED-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP10]], align 8
+; CHECK-FORCED-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TMP0]], align 8
+; CHECK-FORCED-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP1:%.*]], align 4
+; CHECK-FORCED-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
+; CHECK-FORCED-NEXT: [[TMP15:%.*]] = add nsw i64 [[TMP14]], -1
+; CHECK-FORCED-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], [[TMP5]]
+; CHECK-FORCED-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP12]], i64 [[TMP16]]
+; CHECK-FORCED-NEXT: [[TMP18:%.*]] = shl i64 [[TMP7]], 3
+; CHECK-FORCED-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP17]], i64 [[TMP18]]
+; CHECK-FORCED-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP2:%.*]], align 4
+; CHECK-FORCED-NEXT: [[DOTNOT:%.*]] = icmp slt i64 [[TMP9]], 1
+; CHECK-FORCED-NEXT: br i1 [[DOTNOT]], label [[DOTPREHEADER:%.*]], label [[DOTPREHEADER35_LR_PH:%.*]]
+; CHECK-FORCED: .preheader35.lr.ph:
+; CHECK-FORCED-NEXT: [[TMP21:%.*]] = sub i64 0, [[TMP7]]
+; CHECK-FORCED-NEXT: br label [[DOTPREHEADER35:%.*]]
+; CHECK-FORCED: .preheader35:
+; CHECK-FORCED-NEXT: [[TMP22:%.*]] = phi i32 [ 2147483647, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP30:%.*]], [[SELECT_END:%.*]] ]
+; CHECK-FORCED-NEXT: [[TMP23:%.*]] = phi i64 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[IV_N:%.*]], [[SELECT_END]] ]
+; CHECK-FORCED-NEXT: [[DOT045:%.*]] = phi i1 [ false, [[DOTPREHEADER35_LR_PH]] ], [ [[DOT2:%.*]], [[SELECT_END]] ]
+; CHECK-FORCED-NEXT: [[DOTLCSSA364144:%.*]] = phi i32 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP29:%.*]], [[SELECT_END]] ]
+; CHECK-FORCED-NEXT: [[TMP24:%.*]] = mul nsw i64 [[TMP23]], [[TMP11]]
+; CHECK-FORCED-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP24]]
+; CHECK-FORCED-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4
+; CHECK-FORCED-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], [[TMP20]]
+; CHECK-FORCED-NEXT: [[TMP28:%.*]] = icmp sge i32 [[TMP26]], [[TMP22]]
+; CHECK-FORCED-NEXT: [[DOTNOT33:%.*]] = and i1 [[DOT045]], [[TMP28]]
+; CHECK-FORCED-NEXT: [[OR_COND:%.*]] = select i1 [[TMP27]], i1 true, i1 [[DOTNOT33]]
+; CHECK-FORCED-NEXT: [[OR_COND_FROZEN:%.*]] = freeze i1 [[OR_COND]]
+; CHECK-FORCED-NEXT: br i1 [[OR_COND_FROZEN]], label [[SELECT_END]], label [[SELECT_FALSE:%.*]]
+; CHECK-FORCED: select.false:
+; CHECK-FORCED-NEXT: br label [[SELECT_END]]
+; CHECK-FORCED: select.end:
+; CHECK-FORCED-NEXT: [[TMP29]] = phi i32 [ [[DOTLCSSA364144]], [[DOTPREHEADER35]] ], [ 1, [[SELECT_FALSE]] ]
+; CHECK-FORCED-NEXT: [[DOT2]] = phi i1 [ [[DOT045]], [[DOTPREHEADER35]] ], [ true, [[SELECT_FALSE]] ]
+; CHECK-FORCED-NEXT: [[TMP30]] = phi i32 [ [[TMP22]], [[DOTPREHEADER35]] ], [ [[TMP20]], [[SELECT_FALSE]] ]
+; CHECK-FORCED-NEXT: [[NOT_OR_COND:%.*]] = xor i1 [[OR_COND]], true
+; CHECK-FORCED-NEXT: [[IV_N]] = add nuw nsw i64 [[TMP23]], 1
+; CHECK-FORCED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_N]], [[TMP9]]
+; CHECK-FORCED-NEXT: br i1 [[EXITCOND_NOT]], label [[DOTPREHEADER]], label [[DOTPREHEADER35]]
+; CHECK-FORCED: .preheader:
+; CHECK-FORCED-NEXT: [[DOTLCSSA3641_LCSSA:%.*]] = phi i32 [ 0, [[TMP3:%.*]] ], [ [[TMP29]], [[SELECT_END]] ]
+; CHECK-FORCED-NEXT: [[P:%.*]] = phi i1 [ false, [[TMP3]] ], [ [[NOT_OR_COND]], [[SELECT_END]] ]
+; CHECK-FORCED-NEXT: [[Q:%.*]] = select i1 [[P]], i32 [[DOTLCSSA3641_LCSSA]], i32 1
+; CHECK-FORCED-NEXT: ret i32 [[Q]]
;
%4 = getelementptr i8, ptr %0, i64 40
%5 = load i64, ptr %4, align 8
@@ -225,10 +284,10 @@ define i32 @minloc1_twonot(ptr nocapture readonly %0, ptr nocapture readonly %1,
; CHECK-NEXT: [[TMP21:%.*]] = sub i64 0, [[TMP7]]
; CHECK-NEXT: br label [[DOTPREHEADER35:%.*]]
; CHECK: .preheader35:
-; CHECK-NEXT: [[TMP22:%.*]] = phi i32 [ 2147483647, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP30:%.*]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[TMP23:%.*]] = phi i64 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[IV_N:%.*]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[DOT045:%.*]] = phi i1 [ false, [[DOTPREHEADER35_LR_PH]] ], [ [[DOT3:%.*]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[DOTLCSSA364144:%.*]] = phi i32 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP29:%.*]], [[DOTPREHEADER35]] ]
+; CHECK-NEXT: [[TMP22:%.*]] = phi i32 [ 2147483647, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP30:%.*]], [[SELECT_END:%.*]] ]
+; CHECK-NEXT: [[TMP23:%.*]] = phi i64 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[IV_N:%.*]], [[SELECT_END]] ]
+; CHECK-NEXT: [[DOT045:%.*]] = phi i1 [ false, [[DOTPREHEADER35_LR_PH]] ], [ [[DOT3:%.*]], [[SELECT_END]] ]
+; CHECK-NEXT: [[DOTLCSSA364144:%.*]] = phi i32 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP29:%.*]], [[SELECT_END]] ]
; CHECK-NEXT: [[TMP24:%.*]] = mul nsw i64 [[TMP23]], [[TMP11]]
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP24]]
; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4
@@ -236,16 +295,21 @@ define i32 @minloc1_twonot(ptr nocapture readonly %0, ptr nocapture readonly %1,
; CHECK-NEXT: [[TMP28:%.*]] = icmp sge i32 [[TMP26]], [[TMP22]]
; CHECK-NEXT: [[DOTNOT33:%.*]] = and i1 [[DOT045]], [[TMP28]]
; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[TMP27]], i1 true, i1 [[DOTNOT33]]
-; CHECK-NEXT: [[TMP29]] = select i1 [[OR_COND]], i32 [[DOTLCSSA364144]], i32 1
+; CHECK-NEXT: [[OR_COND_FROZEN:%.*]] = freeze i1 [[OR_COND]]
+; CHECK-NEXT: br i1 [[OR_COND_FROZEN]], label [[SELECT_END]], label [[SELECT_FALSE:%.*]]
+; CHECK: select.false:
+; CHECK-NEXT: br label [[SELECT_END]]
+; CHECK: select.end:
+; CHECK-NEXT: [[TMP29]] = phi i32 [ [[DOTLCSSA364144]], [[DOTPREHEADER35]] ], [ 1, [[SELECT_FALSE]] ]
+; CHECK-NEXT: [[DOT2:%.*]] = phi i1 [ [[DOT045]], [[DOTPREHEADER35]] ], [ true, [[SELECT_FALSE]] ]
+; CHECK-NEXT: [[DOT3]] = phi i1 [ [[DOT045]], [[DOTPREHEADER35]] ], [ true, [[SELECT_FALSE]] ]
+; CHECK-NEXT: [[TMP30]] = phi i32 [ [[TMP22]], [[DOTPREHEADER35]] ], [ [[TMP20]], [[SELECT_FALSE]] ]
; CHECK-NEXT: [[NOT_OR_COND:%.*]] = xor i1 [[OR_COND]], true
-; CHECK-NEXT: [[DOT2:%.*]] = select i1 [[NOT_OR_COND]], i1 true, i1 [[DOT045]]
-; CHECK-NEXT: [[DOT3]] = select i1 [[NOT_OR_COND]], i1 true, i1 [[DOT2]]
-; CHECK-NEXT: [[TMP30]] = select i1 [[OR_COND]], i32 [[TMP22]], i32 [[TMP20]]
; CHECK-NEXT: [[IV_N]] = add nuw nsw i64 [[TMP23]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_N]], [[TMP9]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[DOTPREHEADER]], label [[DOTPREHEADER35]]
; CHECK: .preheader:
-; CHECK-NEXT: [[DOTLCSSA3641_LCSSA:%.*]] = phi i32 [ 0, [[TMP3:%.*]] ], [ [[TMP29]], [[DOTPREHEADER35]] ]
+; CHECK-NEXT: [[DOTLCSSA3641_LCSSA:%.*]] = phi i32 [ 0, [[TMP3:%.*]] ], [ [[TMP29]], [[SELECT_END]] ]
; CHECK-NEXT: ret i32 [[DOTLCSSA3641_LCSSA]]
;
%4 = getelementptr i8, ptr %0, i64 40
@@ -323,10 +387,10 @@ define i32 @minloc1_onenotdependent(ptr nocapture readonly %0, ptr nocapture rea
; CHECK-NEXT: [[TMP21:%.*]] = sub i64 0, [[TMP7]]
; CHECK-NEXT: br label [[DOTPREHEADER35:%.*]]
; CHECK: .preheader35:
-; CHECK-NEXT: [[TMP22:%.*]] = phi i32 [ 2147483647, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP30:%.*]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[TMP23:%.*]] = phi i64 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[IV_N:%.*]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[DOT045:%.*]] = phi i1 [ false, [[DOTPREHEADER35_LR_PH]] ], [ [[DOT3:%.*]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[DOTLCSSA364144:%.*]] = phi i32 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP29:%.*]], [[DOTPREHEADER35]] ]
+; CHECK-NEXT: [[TMP22:%.*]] = phi i32 [ 2147483647, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP30:%.*]], [[SELECT_END:%.*]] ]
+; CHECK-NEXT: [[TMP23:%.*]] = phi i64 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[IV_N:%.*]], [[SELECT_END]] ]
+; CHECK-NEXT: [[DOT045:%.*]] = phi i1 [ false, [[DOTPREHEADER35_LR_PH]] ], [ [[DOT3:%.*]], [[SELECT_END]] ]
+; CHECK-NEXT: [[DOTLCSSA364144:%.*]] = phi i32 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP29:%.*]], [[SELECT_END]] ]
; CHECK-NEXT: [[TMP24:%.*]] = mul nsw i64 [[TMP23]], [[TMP11]]
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP24]]
; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4
@@ -334,16 +398,21 @@ define i32 @minloc1_onenotdependent(ptr nocapture readonly %0, ptr nocapture rea
; CHECK-NEXT: [[TMP28:%.*]] = icmp sge i32 [[TMP26]], [[TMP22]]
; CHECK-NEXT: [[DOTNOT33:%.*]] = and i1 [[DOT045]], [[TMP28]]
; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[TMP27]], i1 true, i1 [[DOTNOT33]]
-; CHECK-NEXT: [[TMP29]] = select i1 [[OR_COND]], i32 [[DOTLCSSA364144]], i32 1
+; CHECK-NEXT: [[OR_COND_FROZEN:%.*]] = freeze i1 [[OR_COND]]
+; CHECK-NEXT: br i1 [[OR_COND_FROZEN]], label [[SELECT_END]], label [[SELECT_FALSE:%.*]]
+; CHECK: select.false:
+; CHECK-NEXT: br label [[SELECT_END]]
+; CHECK: select.end:
+; CHECK-NEXT: [[TMP29]] = phi i32 [ [[DOTLCSSA364144]], [[DOTPREHEADER35]] ], [ 1, [[SELECT_FALSE]] ]
+; CHECK-NEXT: [[DOT2:%.*]] = phi i1 [ true, [[DOTPREHEADER35]] ], [ [[DOT045]], [[SELECT_FALSE]] ]
+; CHECK-NEXT: [[DOT3]] = phi i1 [ true, [[DOTPREHEADER35]] ], [ true, [[SELECT_FALSE]] ]
+; CHECK-NEXT: [[TMP30]] = phi i32 [ [[TMP22]], [[DOTPREHEADER35]] ], [ [[TMP20]], [[SELECT_FALSE]] ]
; CHECK-NEXT: [[NOT_OR_COND:%.*]] = xor i1 [[OR_COND]], true
-; CHECK-NEXT: [[DOT2:%.*]] = select i1 [[OR_COND]], i1 true, i1 [[DOT045]]
-; CHECK-NEXT: [[DOT3]] = select i1 [[NOT_OR_COND]], i1 true, i1 [[DOT2]]
-; CHECK-NEXT: [[TMP30]] = select i1 [[OR_COND]], i32 [[TMP22]], i32 [[TMP20]]
; CHECK-NEXT: [[IV_N]] = add nuw nsw i64 [[TMP23]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_N]], [[TMP9]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[DOTPREHEADER]], label [[DOTPREHEADER35]]
; CHECK: .preheader:
-; CHECK-NEXT: [[DOTLCSSA3641_LCSSA:%.*]] = phi i32 [ 0, [[TMP3:%.*]] ], [ [[TMP29]], [[DOTPREHEADER35]] ]
+; CHECK-NEXT: [[DOTLCSSA3641_LCSSA:%.*]] = phi i32 [ 0, [[TMP3:%.*]] ], [ [[TMP29]], [[SELECT_END]] ]
; CHECK-NEXT: ret i32 [[DOTLCSSA3641_LCSSA]]
;
%4 = getelementptr i8, ptr %0, i64 40
@@ -429,10 +498,10 @@ define i32 @minloc9(ptr nocapture readonly %0, ptr nocapture readonly %1, ptr no
; CHECK-NEXT: [[DOTNEG55:%.*]] = mul i64 [[TMP7]], -8
; CHECK-NEXT: br label [[DOTPREHEADER35:%.*]]
; CHECK: .preheader35:
-; CHECK-NEXT: [[TMP22:%.*]] = phi i32 [ 2147483647, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP78:%.*]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[TMP23:%.*]] = phi i64 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP79:%.*]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[DOT045:%.*]] = phi i1 [ false, [[DOTPREHEADER35_LR_PH]] ], [ [[DOT2_8:%.*]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[DOTLCSSA364144:%.*]] = phi i32 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP77:%.*]], [[DOTPREHEADER35]] ]
+; CHECK-NEXT: [[TMP22:%.*]] = phi i32 [ 2147483647, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP78:%.*]], [[SELECT_END15:%.*]] ]
+; CHECK-NEXT: [[TMP23:%.*]] = phi i64 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP79:%.*]], [[SELECT_END15]] ]
+; CHECK-NEXT: [[DOT045:%.*]] = phi i1 [ false, [[DOTPREHEADER35_LR_PH]] ], [ [[DOT2_8:%.*]], [[SELECT_END15]] ]
+; CHECK-NEXT: [[DOTLCSSA364144:%.*]] = phi i32 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP77:%.*]], [[SELECT_END15]] ]
; CHECK-NEXT: [[TMP24:%.*]] = mul nsw i64 [[TMP23]], [[TMP11]]
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP24]]
; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4
@@ -440,95 +509,140 @@ define i32 @minloc9(ptr nocapture readonly %0, ptr nocapture readonly %1, ptr no
; CHECK-NEXT: [[TMP28:%.*]] = icmp sge i32 [[TMP26]], [[TMP22]]
; CHECK-NEXT: [[DOTNOT33:%.*]] = and i1 [[DOT045]], [[TMP28]]
; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[TMP27]], i1 true, i1 [[DOTNOT33]]
-; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[OR_COND]], i32 [[DOTLCSSA364144]], i32 1
+; CHECK-NEXT: [[OR_COND_FROZEN:%.*]] = freeze i1 [[OR_COND]]
+; CHECK-NEXT: br i1 [[OR_COND_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]]
+; CHECK: select.false:
+; CHECK-NEXT: br label [[SELECT_END]]
+; CHECK: select.end:
+; CHECK-NEXT: [[TMP29:%.*]] = phi i32 [ [[DOTLCSSA364144]], [[DOTPREHEADER35]] ], [ 1, [[SELECT_FALSE]] ]
+; CHECK-NEXT: [[DOT2:%.*]] = phi i1 [ [[DOT045]], [[DOTPREHEADER35]] ], [ true, [[SELECT_FALSE]] ]
+; CHECK-NEXT: [[TMP30:%.*]] = phi i32 [ [[TMP22]], [[DOTPREHEADER35]] ], [ [[TMP20]], [[SELECT_FALSE]] ]
; CHECK-NEXT: [[NOT_OR_COND:%.*]] = xor i1 [[OR_COND]], true
-; CHECK-NEXT: [[DOT2:%.*]] = select i1 [[NOT_OR_COND]], i1 true, i1 [[DOT045]]
-; CHECK-NEXT: [[TMP30:%.*]] = select i1 [[OR_COND]], i32 [[TMP22]], i32 [[TMP20]]
; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[TMP21]]
; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4
; CHECK-NEXT: [[TMP33:%.*]] = icmp ne i32 [[TMP32]], [[TMP20]]
; CHECK-NEXT: [[TMP34:%.*]] = icmp sge i32 [[TMP32]], [[TMP30]]
; CHECK-NEXT: [[DOTNOT33_1:%.*]] = and i1 [[DOT2]], [[TMP34]]
; CHECK-NEXT: [[OR_COND_1:%.*]] = select i1 [[TMP33]], i1 true, i1 [[DOTNOT33_1]]
-; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[OR_COND_1]], i32 [[TMP29]], i32 2
+; CHECK-NEXT: [[OR_COND_1_FROZEN:%.*]] = freeze i1 [[OR_COND_1]]
+; CHECK-NEXT: br i1 [[OR_COND_1_FROZEN]], label [[SELECT_END1:%.*]], label [[SELECT_FALSE2:%.*]]
+; CHECK: select.false2:
+; CHECK-NEXT: br label [[SELECT_END1]]
+; CHECK: select.end1:
+; CHECK-NEXT: [[TMP35:%.*]] = phi i32 [ [[TMP29]], [[SELECT_END]] ], [ 2, [[SELECT_FALSE2]] ]
+; CHECK-NEXT: [[DOT2_1:%.*]] = phi i1 [ [[DOT2]], [[SELECT_END]] ], [ true, [[SELECT_FALSE2]] ]
+; CHECK-NEXT: [[TMP36:%.*]] = phi i32 [ [[TMP30]], [[SELECT_END]] ], [ [[TMP20]], [[SELECT_FALSE2]] ]
; CHECK-NEXT: [[NOT_OR_COND_1:%.*]] = xor i1 [[OR_COND_1]], true
-; CHECK-NEXT: [[DOT2_1:%.*]] = select i1 [[NOT_OR_COND_1]], i1 true, i1 [[DOT2]]
-; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[OR_COND_1]], i32 [[TMP30]], i32 [[TMP20]]
; CHECK-NEXT: [[TMP37:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[DOTNEG]]
; CHECK-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP37]], align 4
; CHECK-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], [[TMP20]]
; CHECK-NEXT: [[TMP40:%.*]] = icmp sge i32 [[TMP38]], [[TMP36]]
; CHECK-NEXT: [[DOTNOT33_2:%.*]] = and i1 [[DOT2_1]], [[TMP40]]
; CHECK-NEXT: [[OR_COND_2:%.*]] = select i1 [[TMP39]], i1 true, i1 [[DOTNOT33_2]]
-; CHECK-NEXT: [[TMP41:%.*]] = select i1 [[OR_COND_2]], i32 [[TMP35]], i32 3
+; CHECK-NEXT: [[OR_COND_2_FROZEN:%.*]] = freeze i1 [[OR_COND_2]]
+; CHECK-NEXT: br i1 [[OR_COND_2_FROZEN]], label [[SELECT_END3:%.*]], label [[SELECT_FALSE4:%.*]]
+; CHECK: select.false4:
+; CHECK-NEXT: br label [[SELECT_END3]]
+; CHECK: select.end3:
+; CHECK-NEXT: [[TMP41:%.*]] = phi i32 [ [[TMP35]], [[SELECT_END1]] ], [ 3, [[SELECT_FALSE4]] ]
+; CHECK-NEXT: [[DOT2_2:%.*]] = phi i1 [ [[DOT2_1]], [[SELECT_END1]] ], [ true, [[SELECT_FALSE4]] ]
+; CHECK-NEXT: [[TMP42:%.*]] = phi i32 [ [[TMP36]], [[SELECT_END1]] ], [ [[TMP20]], [[SELECT_FALSE4]] ]
; CHECK-NEXT: [[NOT_OR_COND_2:%.*]] = xor i1 [[OR_COND_2]], true
-; CHECK-NEXT: [[DOT2_2:%.*]] = select i1 [[NOT_OR_COND_2]], i1 true, i1 [[DOT2_1]]
-; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[OR_COND_2]], i32 [[TMP36]], i32 [[TMP20]]
; CHECK-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[DOTNEG50]]
; CHECK-NEXT: [[TMP44:%.*]] = load i32, ptr [[TMP43]], align 4
; CHECK-NEXT: [[TMP45:%.*]] = icmp ne i32 [[TMP44]], [[TMP20]]
; CHECK-NEXT: [[TMP46:%.*]] = icmp sge i32 [[TMP44]], [[TMP42]]
; CHECK-NEXT: [[DOTNOT33_3:%.*]] = and i1 [[DOT2_2]], [[TMP46]]
; CHECK-NEXT: [[OR_COND_3:%.*]] = select i1 [[TMP45]], i1 true, i1 [[DOTNOT33_3]]
-; CHECK-NEXT: [[TMP47:%.*]] = select i1 [[OR_COND_3]], i32 [[TMP41]], i32 4
+; CHECK-NEXT: [[OR_COND_3_FROZEN:%.*]] = freeze i1 [[OR_COND_3]]
+; CHECK-NEXT: br i1 [[OR_COND_3_FROZEN]], label [[SELECT_END5:%.*]], label [[SELECT_FALSE6:%.*]]
+; CHECK: select.false6:
+; CHECK-NEXT: br label [[SELECT_END5]]
+; CHECK: select.end5:
+; CHECK-NEXT: [[TMP47:%.*]] = phi i32 [ [[TMP41]], [[SELECT_END3]] ], [ 4, [[SELECT_FALSE6]] ]
+; CHECK-NEXT: [[DOT2_3:%.*]] = phi i1 [ [[DOT2_2]], [[SELECT_END3]] ], [ true, [[SELECT_FALSE6]] ]
+; CHECK-NEXT: [[TMP48:%.*]] = phi i32 [ [[TMP42]], [[SELECT_END3]] ], [ [[TMP20]], [[SELECT_FALSE6]] ]
; CHECK-NEXT: [[NOT_OR_COND_3:%.*]] = xor i1 [[OR_COND_3]], true
-; CHECK-NEXT: [[DOT2_3:%.*]] = select i1 [[NOT_OR_COND_3]], i1 true, i1 [[DOT2_2]]
-; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[OR_COND_3]], i32 [[TMP42]], i32 [[TMP20]]
; CHECK-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[DOTNEG51]]
; CHECK-NEXT: [[TMP50:%.*]] = load i32, ptr [[TMP49]], align 4
; CHECK-NEXT: [[TMP51:%.*]] = icmp ne i32 [[TMP50]], [[TMP20]]
; CHECK-NEXT: [[TMP52:%.*]] = icmp sge i32 [[TMP50]], [[TMP48]]
; CHECK-NEXT: [[DOTNOT33_4:%.*]] = and i1 [[DOT2_3]], [[TMP52]]
; CHECK-NEXT: [[OR_COND_4:%.*]] = select i1 [[TMP51]], i1 true, i1 [[DOTNOT33_4]]
-; CHECK-NEXT: [[TMP53:%.*]] = select i1 [[OR_COND_4]], i32 [[TMP47]], i32 5
+; CHECK-NEXT: [[OR_COND_4_FROZEN:%.*]] = freeze i1 [[OR_COND_4]]
+; CHECK-NEXT: br i1 [[OR_COND_4_FROZEN]], label [[SELECT_END7:%.*]], label [[SELECT_FALSE8:%.*]]
+; CHECK: select.false8:
+; CHECK-NEXT: br label [[SELECT_END7]]
+; CHECK: select.end7:
+; CHECK-NEXT: [[TMP53:%.*]] = phi i32 [ [[TMP47]], [[SELECT_END5]] ], [ 5, [[SELECT_FALSE8]] ]
+; CHECK-NEXT: [[DOT2_4:%.*]] = phi i1 [ [[DOT2_3]], [[SELECT_END5]] ], [ true, [[SELECT_FALSE8]] ]
+; CHECK-NEXT: [[TMP54:%.*]] = phi i32 [ [[TMP48]], [[SELECT_END5]] ], [ [[TMP20]], [[SELECT_FALSE8]] ]
; CHECK-NEXT: [[NOT_OR_COND_4:%.*]] = xor i1 [[OR_COND_4]], true
-; CHECK-NEXT: [[DOT2_4:%.*]] = select i1 [[NOT_OR_COND_4]], i1 true, i1 [[DOT2_3]]
-; CHECK-NEXT: [[TMP54:%.*]] = select i1 [[OR_COND_4]], i32 [[TMP48]], i32 [[TMP20]]
; CHECK-NEXT: [[TMP55:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[DOTNEG52]]
; CHECK-NEXT: [[TMP56:%.*]] = load i32, ptr [[TMP55]], align 4
; CHECK-NEXT: [[TMP57:%.*]] = icmp ne i32 [[TMP56]], [[TMP20]]
; CHECK-NEXT: [[TMP58:%.*]] = icmp sge i32 [[TMP56]], [[TMP54]]
; CHECK-NEXT: [[DOTNOT33_5:%.*]] = and i1 [[DOT2_4]], [[TMP58]]
; CHECK-NEXT: [[OR_COND_5:%.*]] = select i1 [[TMP57]], i1 true, i1 [[DOTNOT33_5]]
-; CHECK-NEXT: [[TMP59:%.*]] = select i1 [[OR_COND_5]], i32 [[TMP53]], i32 6
+; CHECK-NEXT: [[OR_COND_5_FROZEN:%.*]] = freeze i1 [[OR_COND_5]]
+; CHECK-NEXT: br i1 [[OR_COND_5_FROZEN]], label [[SELECT_END9:%.*]], label [[SELECT_FALSE10:%.*]]
+; CHECK: select.false10:
+; CHECK-NEXT: br label [[SELECT_END9]]
+; CHECK: select.end9:
+; CHECK-NEXT: [[TMP59:%.*]] = phi i32 [ [[TMP53]], [[SELECT_END7]] ], [ 6, [[SELECT_FALSE10]] ]
+; CHECK-NEXT: [[DOT2_5:%.*]] = phi i1 [ [[DOT2_4]], [[SELECT_END7]] ], [ true, [[SELECT_FALSE10]] ]
+; CHECK-NEXT: [[TMP60:%.*]] = phi i32 [ [[TMP54]], [[SELECT_END7]] ], [ [[TMP20]], [[SELECT_FALSE10]] ]
; CHECK-NEXT: [[NOT_OR_COND_5:%.*]] = xor i1 [[OR_COND_5]], true
-; CHECK-NEXT: [[DOT2_5:%.*]] = select i1 [[NOT_OR_COND_5]], i1 true, i1 [[DOT2_4]]
-; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[OR_COND_5]], i32 [[TMP54]], i32 [[TMP20]]
; CHECK-NEXT: [[TMP61:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[DOTNEG53]]
; CHECK-NEXT: [[TMP62:%.*]] = load i32, ptr [[TMP61]], align 4
; CHECK-NEXT: [[TMP63:%.*]] = icmp ne i32 [[TMP62]], [[TMP20]]
; CHECK-NEXT: [[TMP64:%.*]] = icmp sge i32 [[TMP62]], [[TMP60]]
; CHECK-NEXT: [[DOTNOT33_6:%.*]] = and i1 [[DOT2_5]], [[TMP64]]
; CHECK-NEXT: [[OR_COND_6:%.*]] = select i1 [[TMP63]], i1 true, i1 [[DOTNOT33_6]]
-; CHECK-NEXT: [[TMP65:%.*]] = select i1 [[OR_COND_6]], i32 [[TMP59]], i32 7
+; CHECK-NEXT: [[OR_COND_6_FROZEN:%.*]] = freeze i1 [[OR_COND_6]]
+; CHECK-NEXT: br i1 [[OR_COND_6_FROZEN]], label [[SELECT_END11:%.*]], label [[SELECT_FALSE12:%.*]]
+; CHECK: select.false12:
+; CHECK-NEXT: br label [[SELECT_END11]]
+; CHECK: select.end11:
+; CHECK-NEXT: [[TMP65:%.*]] = phi i32 [ [[TMP59]], [[SELECT_END9]] ], [ 7, [[SELECT_FALSE12]] ]
+; CHECK-NEXT: [[DOT2_6:%.*]] = phi i1 [ [[DOT2_5]], [[SELECT_END9]] ], [ true, [[SELECT_FALSE12]] ]
+; CHECK-NEXT: [[TMP66:%.*]] = phi i32 [ [[TMP60]], [[SELECT_END9]] ], [ [[TMP20]], [[SELECT_FALSE12]] ]
; CHECK-NEXT: [[NOT_OR_COND_6:%.*]] = xor i1 [[OR_COND_6]], true
-; CHECK-NEXT: [[DOT2_6:%.*]] = select i1 [[NOT_OR_COND_6]], i1 true, i1 [[DOT2_5]]
-; CHECK-NEXT: [[TMP66:%.*]] = select i1 [[OR_COND_6]], i32 [[TMP60]], i32 [[TMP20]]
; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[DOTNEG54]]
; CHECK-NEXT: [[TMP68:%.*]] = load i32, ptr [[TMP67]], align 4
; CHECK-NEXT: [[TMP69:%.*]] = icmp ne i32 [[TMP68]], [[TMP20]]
; CHECK-NEXT: [[TMP70:%.*]] = icmp sge i32 [[TMP68]], [[TMP66]]
; CHECK-NEXT: [[DOTNOT33_7:%.*]] = and i1 [[DOT2_6]], [[TMP70]]
; CHECK-NEXT: [[OR_COND_7:%.*]] = select i1 [[TMP69]], i1 true, i1 [[DOTNOT33_7]]
-; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[OR_COND_7]], i32 [[TMP65]], i32 8
+; CHECK-NEXT: [[OR_COND_7_FROZEN:%.*]] = freeze i1 [[OR_COND_7]]
+; CHECK-NEXT: br i1 [[OR_COND_7_FROZEN]], label [[SELECT_END13:%.*]], label [[SELECT_FALSE14:%.*]]
+; CHECK: select.false14:
+; CHECK-NEXT: br label [[SELECT_END13]]
+; CHECK: select.end13:
+; CHECK-NEXT: [[TMP71:%.*]] = phi i32 [ [[TMP65]], [[SELECT_END11]] ], [ 8, [[SELECT_FALSE14]] ]
+; CHECK-NEXT: [[DOT2_7:%.*]] = phi i1 [ [[DOT2_6]], [[SELECT_END11]] ], [ true, [[SELECT_FALSE14]] ]
+; CHECK-NEXT: [[TMP72:%.*]] = phi i32 [ [[TMP66]], [[SELECT_END11]] ], [ [[TMP20]], [[SELECT_FALSE14]] ]
; CHECK-NEXT: [[NOT_OR_COND_7:%.*]] = xor i1 [[OR_COND_7]], true
-; CHECK-NEXT: [[DOT2_7:%.*]] = select i1 [[NOT_OR_COND_7]], i1 true, i1 [[DOT2_6]]
-; CHECK-NEXT: [[TMP72:%.*]] = select i1 [[OR_COND_7]], i32 [[TMP66]], i32 [[TMP20]]
; CHECK-NEXT: [[TMP73:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[DOTNEG55]]
; CHECK-NEXT: [[TMP74:%.*]] = load i32, ptr [[TMP73]], align 4
; CHECK-NEXT: [[TMP75:%.*]] = icmp ne i32 [[TMP74]], [[TMP20]]
; CHECK-NEXT: [[TMP76:%.*]] = icmp sge i32 [[TMP74]], [[TMP72]]
; CHECK-NEXT: [[DOTNOT33_8:%.*]] = and i1 [[DOT2_7]], [[TMP76]]
; CHECK-NEXT: [[OR_COND_8:%.*]] = select i1 [[TMP75]], i1 true, i1 [[DOTNOT33_8]]
-; CHECK-NEXT: [[TMP77]] = select i1 [[OR_COND_8]], i32 [[TMP71]], i32 9
+; CHECK-NEXT: [[OR_COND_8_FROZEN:%.*]] = freeze i1 [[OR_COND_8]]
+; CHECK-NEXT: br i1 [[OR_COND_8_FROZEN]], label [[SELECT_END15]], label [[SELECT_FALSE16:%.*]]
+; CHECK: select.false16:
+; CHECK-NEXT: br label [[SELECT_END15]]
+; CHECK: select.end15:
+; CHECK-NEXT: [[TMP77]] = phi i32 [ [[TMP71]], [[SELECT_END13]] ], [ 9, [[SELECT_FALSE16]] ]
+; CHECK-NEXT: [[DOT2_8]] = phi i1 [ [[DOT2_7]], [[SELECT_END13]] ], [ true, [[SELECT_FALSE16]] ]
+; CHECK-NEXT: [[TMP78]] = phi i32 [ [[TMP72]], [[SELECT_END13]] ], [ [[TMP20]], [[SELECT_FALSE16]] ]
; CHECK-NEXT: [[NOT_OR_COND_8:%.*]] = xor i1 [[OR_COND_8]], true
-; CHECK-NEXT: [[DOT2_8]] = select i1 [[NOT_OR_COND_8]], i1 true, i1 [[DOT2_7]]
-; CHECK-NEXT: [[TMP78]] = select i1 [[OR_COND_8]], i32 [[TMP72]], i32 [[TMP20]]
; CHECK-NEXT: [[TMP79]] = add nuw nsw i64 [[TMP23]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[TMP79]], [[TMP9]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[DOTPREHEADER]], label [[DOTPREHEADER35]]
; CHECK: .preheader:
-; CHECK-NEXT: [[DOTLCSSA3641_LCSSA:%.*]] = phi i32 [ 0, [[TMP3:%.*]] ], [ [[TMP77]], [[DOTPREHEADER35]] ]
+; CHECK-NEXT: [[DOTLCSSA3641_LCSSA:%.*]] = phi i32 [ 0, [[TMP3:%.*]] ], [ [[TMP77]], [[SELECT_END15]] ]
; CHECK-NEXT: ret i32 [[DOTLCSSA3641_LCSSA]]
;
%4 = getelementptr i8, ptr %0, i64 40
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
index 8cb8b1c..8ce24ce 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
@@ -72,6 +72,203 @@ define aarch64_sve_vector_pcs <vscale x 16 x i1> @caller_with_many_svepred_arg(<
ret <vscale x 16 x i1> %ret
}
+; Test that arg2 is passed through x0, i.e., x0 = &%arg2, and the return value is loaded from x0:
+; P0 = ldr [x0]
+define aarch64_sve_vector_pcs <vscale x 16 x i1> @callee_with_svepred_arg_4xv16i1_1xv16i1([4 x <vscale x 16 x i1>] %arg1, [1 x <vscale x 16 x i1>] %arg2) {
+; CHECK: name: callee_with_svepred_arg_4xv16i1_1xv16i1
+; CHECK: [[BASE:%[0-9]+]]:gpr64common = COPY $x0
+; CHECK: [[PRED0:%[0-9]+]]:ppr = LDR_PXI [[BASE]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: $p0 = COPY [[PRED0]]
+; CHECK: RET_ReallyLR implicit $p0
+ %res = extractvalue [1 x <vscale x 16 x i1>] %arg2, 0
+ ret <vscale x 16 x i1> %res
+}
+
+; Test that arg1 is stored to the stack from p0, and the stack location is passed through x0 to set up the call:
+; str P0, [stack_loc_for_args]
+; x0 = stack_loc_for_args
+define aarch64_sve_vector_pcs <vscale x 16 x i1> @caller_with_svepred_arg_1xv16i1_4xv16i1([1 x <vscale x 16 x i1>] %arg1, [4 x <vscale x 16 x i1>] %arg2) {
+; CHECK: name: caller_with_svepred_arg_1xv16i1_4xv16i1
+; CHECK: stack:
+; CHECK: - { id: 0, name: '', type: default, offset: 0, size: 2, alignment: 2,
+; CHECK-NEXT: stack-id: scalable-vector,
+; CHECK: [[PRED0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+; CHECK: STR_PXI [[PRED0]], %stack.0, 0 :: (store (<vscale x 1 x s16>) into %stack.0)
+; CHECK: [[STACK:%[0-9]+]]:gpr64sp = ADDXri %stack.0, 0, 0
+; CHECK: $x0 = COPY [[STACK]]
+; CHECK: BL @callee_with_svepred_arg_4xv16i1_1xv16i1, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $p0, implicit $p1, implicit $p2, implicit $p3, implicit $x0, implicit-def $sp, implicit-def $p0
+; CHECK: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+ %res = call <vscale x 16 x i1> @callee_with_svepred_arg_4xv16i1_1xv16i1([4 x <vscale x 16 x i1>] %arg2, [1 x <vscale x 16 x i1>] %arg1)
+ ret <vscale x 16 x i1> %res
+}
+
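+; Note on the 'stack:' entries in these tests: with stack-id 'scalable-vector' the size is in
+; bytes scaled by vscale at runtime, so a single <vscale x 16 x i1> predicate spill is
+; 'size: 2' (2 x vscale bytes) and the [4 x <vscale x 16 x i1>] spills in the tests below are
+; 'size: 8' (4 predicates x 2 bytes).
+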
+; Test that arg2 is passed through x0, i.e., x0 = &%arg2; and return values are loaded from x0:
+; P0 = ldr [x0]
+; P1 = ldr [x0 + sizeof(Px)]
+; P2 = ldr [x0 + 2*sizeof(Px)]
+; P3 = ldr [x0 + 3*sizeof(Px)]
+define aarch64_sve_vector_pcs [4 x <vscale x 16 x i1>] @callee_with_svepred_arg_4xv16i1_4xv16i1([4 x <vscale x 16 x i1>] %arg1, [4 x <vscale x 16 x i1>] %arg2) {
+; CHECK: name: callee_with_svepred_arg_4xv16i1_4xv16i1
+; CHECK: [[BASE:%[0-9]+]]:gpr64common = COPY $x0
+; CHECK: [[OFFSET1:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 1, implicit $vg
+; CHECK: [[ADDR1:%[0-9]+]]:gpr64common = nuw ADDXrr [[BASE]], killed [[OFFSET1]]
+; CHECK: [[PRED1:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR1]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: [[OFFSET2:%[0-9]+]]:gpr64 = CNTW_XPiI 31, 1, implicit $vg
+; CHECK: [[ADDR2:%[0-9]+]]:gpr64common = ADDXrr [[BASE]], killed [[OFFSET2]]
+; CHECK: [[PRED2:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR2]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: [[OFFSET3:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 3, implicit $vg
+; CHECK: [[ADDR3:%[0-9]+]]:gpr64common = ADDXrr [[BASE]], killed [[OFFSET3]]
+; CHECK: [[PRED3:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR3]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: [[PRED0:%[0-9]+]]:ppr = LDR_PXI [[BASE]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: $p0 = COPY [[PRED0]]
+; CHECK: $p1 = COPY [[PRED1]]
+; CHECK: $p2 = COPY [[PRED2]]
+; CHECK: $p3 = COPY [[PRED3]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
+ ret [4 x <vscale x 16 x i1>] %arg2
+}
+
+; Test that arg1 is stored to the stack from p0~p3, and the stack location is passed through x0 to set up the call:
+; str P0, [stack_loc_for_args]
+; str P1, [stack_loc_for_args + sizeof(Px)]
+; str P2, [stack_loc_for_args + 2*sizeof(Px)]
+; str P3, [stack_loc_for_args + 3*sizeof(Px)]
+; x0 = stack_loc_for_args
+define [4 x <vscale x 16 x i1>] @caller_with_svepred_arg_4xv16i1_4xv16i1([4 x <vscale x 16 x i1>] %arg1, [4 x <vscale x 16 x i1>] %arg2) {
+; CHECK: name: caller_with_svepred_arg_4xv16i1_4xv16i1
+; CHECK: stack:
+; CHECK: - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 2,
+; CHECK-NEXT: stack-id: scalable-vector,
+; CHECK: [[PRED3:%[0-9]+]]:ppr = COPY $p3
+; CHECK: [[PRED2:%[0-9]+]]:ppr = COPY $p2
+; CHECK: [[PRED1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: [[PRED0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: [[OFFSET1:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 1, implicit $vg
+; CHECK: [[OFFSET2:%[0-9]+]]:gpr64 = CNTW_XPiI 31, 1, implicit $vg
+; CHECK: [[OFFSET3:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 3, implicit $vg
+; CHECK: [[STACK:%[0-9]+]]:gpr64common = ADDXri %stack.0, 0, 0
+; CHECK: [[ADDR3:%[0-9]+]]:gpr64common = ADDXrr [[STACK]], [[OFFSET3]]
+; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+; CHECK: STR_PXI [[PRED3]], killed [[ADDR3]], 0 :: (store (<vscale x 1 x s16>))
+; CHECK: [[ADDR2:%[0-9]+]]:gpr64common = ADDXrr [[STACK]], [[OFFSET2]]
+; CHECK: STR_PXI [[PRED2]], killed [[ADDR2]], 0 :: (store (<vscale x 1 x s16>))
+; CHECK: [[ADDR1:%[0-9]+]]:gpr64common = nuw ADDXrr [[STACK]], [[OFFSET1]]
+; CHECK: STR_PXI [[PRED1]], killed [[ADDR1]], 0 :: (store (<vscale x 1 x s16>))
+; CHECK: STR_PXI [[PRED0]], %stack.0, 0 :: (store (<vscale x 1 x s16>) into %stack.0)
+; CHECK: $x0 = COPY [[STACK]]
+; CHECK: BL @callee_with_svepred_arg_4xv16i1_4xv16i1, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $p0, implicit $p1, implicit $p2, implicit $p3, implicit $x0, implicit-def $sp, implicit-def $p0, implicit-def $p1, implicit-def $p2, implicit-def $p3
+; CHECK: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+ %res = call [4 x <vscale x 16 x i1>] @callee_with_svepred_arg_4xv16i1_4xv16i1([4 x <vscale x 16 x i1>] %arg2, [4 x <vscale x 16 x i1>] %arg1)
+ ret [4 x <vscale x 16 x i1>] %res
+}
+
+; Test that arg2 is passed through x0, i.e., x0 = &%arg2; and return values are loaded from x0:
+; P0 = ldr [x0]
+; P1 = ldr [x0 + sizeof(Px)]
+; P2 = ldr [x0 + 2*sizeof(Px)]
+; P3 = ldr [x0 + 3*sizeof(Px)]
+define aarch64_sve_vector_pcs [2 x <vscale x 32 x i1>] @callee_with_svepred_arg_1xv16i1_2xv32i1([1 x <vscale x 16 x i1>] %arg1, [2 x <vscale x 32 x i1>] %arg2) {
+; CHECK: name: callee_with_svepred_arg_1xv16i1_2xv32i1
+; CHECK: [[BASE:%[0-9]+]]:gpr64common = COPY $x0
+; CHECK: [[OFFSET1:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 1, implicit $vg
+; CHECK: [[ADDR1:%[0-9]+]]:gpr64common = nuw ADDXrr [[BASE]], killed [[OFFSET1]]
+; CHECK: [[PRED1:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR1]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: [[OFFSET2:%[0-9]+]]:gpr64 = CNTW_XPiI 31, 1, implicit $vg
+; CHECK: [[ADDR2:%[0-9]+]]:gpr64common = ADDXrr [[BASE]], killed [[OFFSET2]]
+; CHECK: [[PRED2:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR2]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: [[OFFSET3:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 3, implicit $vg
+; CHECK: [[ADDR3:%[0-9]+]]:gpr64common = ADDXrr [[BASE]], killed [[OFFSET3]]
+; CHECK: [[PRED3:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR3]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: [[PRED0:%[0-9]+]]:ppr = LDR_PXI [[BASE]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: $p0 = COPY [[PRED0]]
+; CHECK: $p1 = COPY [[PRED1]]
+; CHECK: $p2 = COPY [[PRED2]]
+; CHECK: $p3 = COPY [[PRED3]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
+ ret [2 x <vscale x 32 x i1>] %arg2
+}
+
+; Test that arg1 is stored to the stack from p0~p3, and the stack location is passed through x0 to set up the call:
+; str P0, [stack_loc_for_args]
+; str P1, [stack_loc_for_args + sizeof(Px)]
+; str P2, [stack_loc_for_args + 2*sizeof(Px)]
+; str P3, [stack_loc_for_args + 3*sizeof(Px)]
+; x0 = stack_loc_for_args
+define [2 x <vscale x 32 x i1>] @caller_with_svepred_arg_2xv32i1_1xv16i1([2 x <vscale x 32 x i1>] %arg1, [1 x <vscale x 16 x i1>] %arg2) {
+; CHECK: name: caller_with_svepred_arg_2xv32i1_1xv16i1
+; CHECK: stack:
+; CHECK: - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 2,
+; CHECK-NEXT: stack-id: scalable-vector,
+; CHECK: [[PRED3:%[0-9]+]]:ppr = COPY $p3
+; CHECK: [[PRED2:%[0-9]+]]:ppr = COPY $p2
+; CHECK: [[PRED1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: [[PRED0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: [[OFFSET3:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 3, implicit $vg
+; CHECK: [[STACK:%[0-9]+]]:gpr64common = ADDXri %stack.0, 0, 0
+; CHECK: [[ADDR3:%[0-9]+]]:gpr64common = ADDXrr [[STACK]], killed [[OFFSET3]]
+; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+; CHECK: STR_PXI [[PRED3]], killed [[ADDR3]], 0 :: (store (<vscale x 1 x s16>))
+; CHECK: [[OFFSET2:%[0-9]+]]:gpr64 = CNTW_XPiI 31, 1, implicit $vg
+; CHECK: [[ADDR2:%[0-9]+]]:gpr64common = ADDXrr [[STACK]], killed [[OFFSET2]]
+; CHECK: STR_PXI [[PRED2]], killed [[ADDR2]], 0 :: (store (<vscale x 1 x s16>))
+; CHECK: [[OFFSET1:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 1, implicit $vg
+; CHECK: [[ADDR1:%[0-9]+]]:gpr64common = nuw ADDXrr [[STACK]], killed [[OFFSET1]]
+; CHECK: STR_PXI [[PRED1]], killed [[ADDR1]], 0 :: (store (<vscale x 1 x s16>))
+; CHECK: STR_PXI [[PRED0]], %stack.0, 0 :: (store (<vscale x 1 x s16>) into %stack.0)
+; CHECK: $x0 = COPY [[STACK]]
+; CHECK: BL @callee_with_svepred_arg_1xv16i1_2xv32i1, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $p0, implicit $x0, implicit-def $sp, implicit-def $p0, implicit-def $p1, implicit-def $p2, implicit-def $p3
+; CHECK: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+ %res = call [2 x <vscale x 32 x i1>] @callee_with_svepred_arg_1xv16i1_2xv32i1([1 x <vscale x 16 x i1>] %arg2, [2 x <vscale x 32 x i1>] %arg1)
+ ret [2 x <vscale x 32 x i1>] %res
+}
+
+; Test that arg1 and arg3 are passed via P0~P3, while arg2 is passed indirectly through an address on the stack in x0
+define aarch64_sve_vector_pcs [4 x <vscale x 16 x i1>] @callee_with_svepred_arg_2xv16i1_4xv16i1_2xv16i1([2 x <vscale x 16 x i1>] %arg1, [4 x <vscale x 16 x i1>] %arg2, [2 x <vscale x 16 x i1>] %arg3) nounwind {
+; CHECK: name: callee_with_svepred_arg_2xv16i1_4xv16i1_2xv16i1
+; CHECK: [[P3:%[0-9]+]]:ppr = COPY $p3
+; CHECK: [[P2:%[0-9]+]]:ppr = COPY $p2
+; CHECK: [[X0:%[0-9]+]]:gpr64common = COPY $x0
+; CHECK: [[P1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: [[P0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: [[OFFSET3:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 3, implicit $vg
+; CHECK: [[ADDR3:%[0-9]+]]:gpr64common = ADDXrr [[X0]], killed [[OFFSET3]]
+; CHECK: [[P7:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR3]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: [[OFFSET2:%[0-9]+]]:gpr64 = CNTW_XPiI 31, 1, implicit $vg
+; CHECK: [[ADDR2:%[0-9]+]]:gpr64common = ADDXrr [[X0]], killed [[OFFSET2]]
+; CHECK: [[P6:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR2]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: [[OFFSET1:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 1, implicit $vg
+; CHECK: [[ADDR1:%[0-9]+]]:gpr64common = nuw ADDXrr [[X0]], killed [[OFFSET1]]
+; CHECK: [[P5:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR1]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: [[P4:%[0-9]+]]:ppr = LDR_PXI [[X0]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: [[RES0:%[0-9]+]]:ppr = AND_PPzPP [[P0]], [[P0]], killed [[P4]]
+; CHECK: [[RES1:%[0-9]+]]:ppr = AND_PPzPP [[P1]], [[P1]], killed [[P5]]
+; CHECK: [[RES2:%[0-9]+]]:ppr = AND_PPzPP [[P2]], [[P2]], killed [[P6]]
+; CHECK: [[RES3:%[0-9]+]]:ppr = AND_PPzPP [[P3]], [[P3]], killed [[P7]]
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: $p2 = COPY [[RES2]]
+; CHECK: $p3 = COPY [[RES3]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
+ %p0 = extractvalue [2 x <vscale x 16 x i1>] %arg1, 0
+ %p1 = extractvalue [2 x <vscale x 16 x i1>] %arg1, 1
+ %p2 = extractvalue [2 x <vscale x 16 x i1>] %arg3, 0
+ %p3 = extractvalue [2 x <vscale x 16 x i1>] %arg3, 1
+ %p4 = extractvalue [4 x <vscale x 16 x i1>] %arg2, 0
+ %p5 = extractvalue [4 x <vscale x 16 x i1>] %arg2, 1
+ %p6 = extractvalue [4 x <vscale x 16 x i1>] %arg2, 2
+ %p7 = extractvalue [4 x <vscale x 16 x i1>] %arg2, 3
+ %r0 = and <vscale x 16 x i1> %p0, %p4
+ %r1 = and <vscale x 16 x i1> %p1, %p5
+ %r2 = and <vscale x 16 x i1> %p2, %p6
+ %r3 = and <vscale x 16 x i1> %p3, %p7
+ %1 = insertvalue [4 x <vscale x 16 x i1>] undef, <vscale x 16 x i1> %r0, 0
+ %2 = insertvalue [4 x <vscale x 16 x i1>] %1, <vscale x 16 x i1> %r1, 1
+ %3 = insertvalue [4 x <vscale x 16 x i1>] %2, <vscale x 16 x i1> %r2, 2
+ %4 = insertvalue [4 x <vscale x 16 x i1>] %3, <vscale x 16 x i1> %r3, 3
+ ret [4 x <vscale x 16 x i1>] %4
+}
+
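+; Note on the CNT{D,W}_XPiI offsets in the tests above (assuming the usual 2-byte-per-vscale
+; predicate size): each <vscale x 16 x i1> element of an indirectly passed aggregate occupies
+; 2 x vscale bytes, so the byte offsets of elements 1..3 are
+;   CNTD all, mul #1  ->  2 x vscale
+;   CNTW all, mul #1  ->  4 x vscale
+;   CNTD all, mul #3  ->  6 x vscale
+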
; Test that z8 and z9, passed by reference, are loaded from a location that is passed on the stack.
; i.e. x0 = %x0
; :
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention.ll
index 0a45244..bfb7505 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention.ll
@@ -128,6 +128,52 @@ define <vscale x 4 x i1> @sve_signature_pred(<vscale x 4 x i1> %arg1, <vscale x
ret <vscale x 4 x i1> %arg2
}
+; Test that a scalable predicate argument of [1 x <vscale x 4 x i1>] type is properly assigned to a P register.
+; CHECK-LABEL: name: sve_signature_pred_1xv4i1
+; CHECK: [[RES:%[0-9]+]]:ppr = COPY $p1
+; CHECK: $p0 = COPY [[RES]]
+; CHECK: RET_ReallyLR implicit $p0
+define [1 x <vscale x 4 x i1>] @sve_signature_pred_1xv4i1([1 x <vscale x 4 x i1>] %arg1, [1 x <vscale x 4 x i1>] %arg2) nounwind {
+ ret [1 x <vscale x 4 x i1>] %arg2
+}
+
+; Test that up to two scalable predicate arguments of [2 x <vscale x 4 x i1>] type can be assigned to P registers.
+; CHECK-LABEL: name: sve_signature_pred_2xv4i1
+; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p3
+; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p2
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1
+define [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1([2 x <vscale x 4 x i1>] %arg1, [2 x <vscale x 4 x i1>] %arg2) nounwind {
+ ret [2 x <vscale x 4 x i1>] %arg2
+}
+
+; Test that a scalable predicate argument of [1 x <vscale x 32 x i1>] type is assigned to two P registers.
+; CHECK-LABEL: name: sve_signature_pred_1xv32i1
+; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p3
+; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p2
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1
+define [1 x <vscale x 32 x i1>] @sve_signature_pred_1xv32i1([1 x <vscale x 32 x i1>] %arg1, [1 x <vscale x 32 x i1>] %arg2) nounwind {
+ ret [1 x <vscale x 32 x i1>] %arg2
+}
+
+; Test that a scalable predicate argument of [2 x <vscale x 32 x i1>] type is assigned to four P registers.
+; CHECK-LABEL: name: sve_signature_pred_2xv32i1
+; CHECK: [[RES3:%[0-9]+]]:ppr = COPY $p3
+; CHECK: [[RES2:%[0-9]+]]:ppr = COPY $p2
+; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: $p2 = COPY [[RES2]]
+; CHECK: $p3 = COPY [[RES3]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
+define [2 x <vscale x 32 x i1>] @sve_signature_pred_2xv32i1([2 x <vscale x 32 x i1>] %arg1) nounwind {
+ ret [2 x <vscale x 32 x i1>] %arg1
+}
+
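+; Note: a <vscale x 32 x i1> value is wider than one predicate register and is split into two
+; <vscale x 16 x i1> halves, so [1 x <vscale x 32 x i1>] maps to p0-p1 and
+; [2 x <vscale x 32 x i1>] maps to p0-p3. A second [2 x <vscale x 32 x i1>] argument would not
+; fit in p0-p3 and, as shown in sve-calling-convention-byref.ll, would presumably be passed
+; indirectly through a pointer instead.
+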
; CHECK-LABEL: name: sve_signature_vec_caller
; CHECK-DAG: [[ARG2:%[0-9]+]]:zpr = COPY $z1
; CHECK-DAG: [[ARG1:%[0-9]+]]:zpr = COPY $z0
@@ -156,6 +202,84 @@ define <vscale x 4 x i1> @sve_signature_pred_caller(<vscale x 4 x i1> %arg1, <vs
ret <vscale x 4 x i1> %res
}
+; CHECK-LABEL: name: sve_signature_pred_1xv4i1_caller
+; CHECK-DAG: [[ARG2:%[0-9]+]]:ppr = COPY $p1
+; CHECK-DAG: [[ARG1:%[0-9]+]]:ppr = COPY $p0
+; CHECK-DAG: $p0 = COPY [[ARG2]]
+; CHECK-DAG: $p1 = COPY [[ARG1]]
+; CHECK-NEXT: BL @sve_signature_pred_1xv4i1, csr_aarch64_sve_aapcs
+; CHECK: [[RES:%[0-9]+]]:ppr = COPY $p0
+; CHECK: $p0 = COPY [[RES]]
+; CHECK: RET_ReallyLR implicit $p0
+define [1 x <vscale x 4 x i1>] @sve_signature_pred_1xv4i1_caller([1 x <vscale x 4 x i1>] %arg1, [1 x <vscale x 4 x i1>] %arg2) nounwind {
+ %res = call [1 x <vscale x 4 x i1>] @sve_signature_pred_1xv4i1([1 x <vscale x 4 x i1>] %arg2, [1 x <vscale x 4 x i1>] %arg1)
+ ret [1 x <vscale x 4 x i1>] %res
+}
+
+; CHECK-LABEL: name: sve_signature_pred_2xv4i1_caller
+; CHECK-DAG: [[ARG2_2:%[0-9]+]]:ppr = COPY $p3
+; CHECK-DAG: [[ARG2_1:%[0-9]+]]:ppr = COPY $p2
+; CHECK-DAG: [[ARG1_2:%[0-9]+]]:ppr = COPY $p1
+; CHECK-DAG: [[ARG1_1:%[0-9]+]]:ppr = COPY $p0
+; CHECK-DAG: $p0 = COPY [[ARG2_1]]
+; CHECK-DAG: $p1 = COPY [[ARG2_2]]
+; CHECK-DAG: $p2 = COPY [[ARG1_1]]
+; CHECK-DAG: $p3 = COPY [[ARG1_2]]
+; CHECK-NEXT: BL @sve_signature_pred_2xv4i1, csr_aarch64_sve_aapcs
+; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1
+define [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1_caller([2 x <vscale x 4 x i1>] %arg1, [2 x <vscale x 4 x i1>] %arg2) nounwind {
+ %res = call [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1([2 x <vscale x 4 x i1>] %arg2, [2 x <vscale x 4 x i1>] %arg1)
+ ret [2 x <vscale x 4 x i1>] %res
+}
+
+; CHECK-LABEL: name: sve_signature_pred_1xv32i1_caller
+; CHECK-DAG: [[ARG2_2:%[0-9]+]]:ppr = COPY $p3
+; CHECK-DAG: [[ARG2_1:%[0-9]+]]:ppr = COPY $p2
+; CHECK-DAG: [[ARG1_2:%[0-9]+]]:ppr = COPY $p1
+; CHECK-DAG: [[ARG1_1:%[0-9]+]]:ppr = COPY $p0
+; CHECK-DAG: $p0 = COPY [[ARG2_1]]
+; CHECK-DAG: $p1 = COPY [[ARG2_2]]
+; CHECK-DAG: $p2 = COPY [[ARG1_1]]
+; CHECK-DAG: $p3 = COPY [[ARG1_2]]
+; CHECK-NEXT: BL @sve_signature_pred_1xv32i1, csr_aarch64_sve_aapcs
+; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1
+define [1 x <vscale x 32 x i1>] @sve_signature_pred_1xv32i1_caller([1 x <vscale x 32 x i1>] %arg1, [1 x <vscale x 32 x i1>] %arg2) nounwind {
+ %res = call [1 x <vscale x 32 x i1>] @sve_signature_pred_1xv32i1([1 x <vscale x 32 x i1>] %arg2, [1 x <vscale x 32 x i1>] %arg1)
+ ret [1 x <vscale x 32 x i1>] %res
+}
+
+; CHECK-LABEL: name: sve_signature_pred_2xv32i1_caller
+; CHECK-DAG: [[ARG3:%[0-9]+]]:ppr = COPY $p3
+; CHECK-DAG: [[ARG2:%[0-9]+]]:ppr = COPY $p2
+; CHECK-DAG: [[ARG1:%[0-9]+]]:ppr = COPY $p1
+; CHECK-DAG: [[ARG0:%[0-9]+]]:ppr = COPY $p0
+; CHECK-DAG: $p0 = COPY [[ARG0]]
+; CHECK-DAG: $p1 = COPY [[ARG1]]
+; CHECK-DAG: $p2 = COPY [[ARG2]]
+; CHECK-DAG: $p3 = COPY [[ARG3]]
+; CHECK-NEXT: BL @sve_signature_pred_2xv32i1, csr_aarch64_sve_aapcs
+; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: [[RES2:%[0-9]+]]:ppr = COPY $p2
+; CHECK: [[RES3:%[0-9]+]]:ppr = COPY $p3
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: $p2 = COPY [[RES2]]
+; CHECK: $p3 = COPY [[RES3]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1
+define [2 x <vscale x 32 x i1>] @sve_signature_pred_2xv32i1_caller([2 x <vscale x 32 x i1>] %arg1) {
+ %res = call [2 x <vscale x 32 x i1>] @sve_signature_pred_2xv32i1([2 x <vscale x 32 x i1>] %arg1)
+ ret [2 x <vscale x 32 x i1>] %res
+}
+
; Test that functions returning or taking SVE arguments use the correct
; callee-saved set when using the default C calling convention (as opposed
; to aarch64_sve_vector_pcs)
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
index 28094c7..276f237 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve -aarch64-sve-vector-bits-min=128 -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s -check-prefixes=CHECK,SVE2_128
-; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefixes=CHECK,SVE2_128_NOMAX
-; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve < %s | FileCheck %s -check-prefixes=CHECK,SVE2_NOMIN_NOMAX
-; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,SVE2_MIN_256_NOMAX
+; RUN: llc -mattr=+sve2 -force-streaming-compatible -aarch64-sve-vector-bits-min=128 -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s -check-prefixes=CHECK,SVE2_128
+; RUN: llc -mattr=+sve2 -force-streaming-compatible -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefixes=CHECK,SVE2_128_NOMAX
+; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,SVE2_NOMIN_NOMAX
+; RUN: llc -mattr=+sve2 -force-streaming-compatible -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,SVE2_MIN_256_NOMAX
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll b/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll
index 1a2ab8d..b0b6a6a 100644
--- a/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll
@@ -2,7 +2,7 @@
; RUN: llc -mattr=+sve < %s | FileCheck %s
; Streaming-compatible SVE doesn't include FADDA, so this shouldn't compile!
-; RUN: not --crash llc -mattr=+sve -force-streaming-compatible-sve < %s
+; RUN: not --crash llc -mattr=+sve -force-streaming-compatible < %s
target triple = "aarch64-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-pr92779.ll b/llvm/test/CodeGen/AArch64/sve-pr92779.ll
new file mode 100644
index 0000000..e257948
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-pr92779.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64 -mattr=+sve2 < %s | FileCheck %s
+
+define void @main(ptr %0) {
+; CHECK-LABEL: main:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.d, #0 // =0x0
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: mov z1.d, z0.d
+; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8
+; CHECK-NEXT: uzp1 v1.2s, v0.2s, v1.2s
+; CHECK-NEXT: neg v1.2s, v1.2s
+; CHECK-NEXT: smov x8, v1.s[0]
+; CHECK-NEXT: smov x9, v1.s[1]
+; CHECK-NEXT: mov z0.d, p0/m, x8
+; CHECK-NEXT: mov z0.d, p0/m, x9
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+"entry":
+ %1 = bitcast <vscale x 2 x i64> zeroinitializer to <vscale x 4 x i32>
+ %a = extractelement <vscale x 4 x i32> %1, i64 0
+ %b = insertelement <2 x i32> zeroinitializer, i32 %a, i64 0
+ %2 = bitcast <vscale x 2 x i64> zeroinitializer to <vscale x 4 x i32>
+ %c = extractelement <vscale x 4 x i32> %2, i64 2
+ %d = insertelement <2 x i32> %b, i32 %c, i64 1
+ %e = sub <2 x i32> zeroinitializer, %d
+ %f = extractelement <2 x i32> %e, i64 0
+ %g = sext i32 %f to i64
+ %h = insertelement <vscale x 2 x i64> zeroinitializer, i64 %g, i64 0
+ %i = extractelement <2 x i32> %e, i64 1
+ %j = sext i32 %i to i64
+ %k = insertelement <vscale x 2 x i64> %h, i64 %j, i64 0
+ store <vscale x 2 x i64> %k, ptr %0, align 16
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
index d81f725..e843537 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
index d547f99..aa42d5c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
index e3cc74f..260ad16 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll
index 74a4aab..9a07bd8 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
index 0c490a6..aec434b 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
index 86494c4..82e75d6 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
index 0aefba2..040e586 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
index 25ecd7a..45a804b 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll
index a752e11..9c3b5e1 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
index f017eea..21ce689 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s --check-prefixes=CHECK,SVE
-; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve < %s | FileCheck %s --check-prefixes=CHECK,SVE2
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE
+; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
index c2d6ed4..b0a82e6 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll
index 465cc17..cbd0ad6 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
index 9bdde14..57d072a 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
index 244a405..6a2dc3c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
index cbe71d71..153a04f 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
index 94a7476..6945a61 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll
index b56e67d..e239ff5 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sme-fa64 -force-streaming-compatible-sve < %s | FileCheck %s -check-prefix=FA64
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s -check-prefix=NO-FA64
+; RUN: llc -mattr=+sme-fa64 -force-streaming-compatible < %s | FileCheck %s -check-prefix=FA64
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s -check-prefix=NO-FA64
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll
index df9613a..78ae7bb 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll
index 7ddc641..412c27c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll
index 7d36925..89697cd 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
index bf8a335..5840ffb 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
index 30a4f04..c1c7b5c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
index 4aa9657..ff38db8 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll
index 8baa87c..ee1706b 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s --check-prefixes=CHECK,SVE
-; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve < %s | FileCheck %s --check-prefixes=CHECK,SVE2
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE
+; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll
index 73c1eac..c2f3bbf 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
index 5158dda..e6fd775 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s --check-prefixes=CHECK,SVE
-; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve < %s | FileCheck %s --check-prefixes=CHECK,SVE2
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE
+; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll
index c7a8961..e40668a 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s --check-prefixes=CHECK,SVE
-; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve < %s | FileCheck %s --check-prefixes=CHECK,SVE2
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE
+; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll
index f028b3e..54276bb 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll
index 4d70c1dd..40824ba 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll
index 50cf9b7..74ee548 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll
index 149ad6d..3ff6983 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sme-fa64 -force-streaming-compatible-sve < %s | FileCheck %s -check-prefix=FA64
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s -check-prefix=NO-FA64
+; RUN: llc -mattr=+sme-fa64 -force-streaming-compatible < %s | FileCheck %s -check-prefix=FA64
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s -check-prefix=NO-FA64
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
index cb7fa53..8917f43 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s --check-prefixes=CHECK,SVE
-; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve < %s | FileCheck %s --check-prefixes=CHECK,SVE2
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE
+; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2
; This test only tests the legal types for a given vector width, as mulh nodes
; do not get generated for non-legal types.
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
index 751f437..1123907 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
index d373a90..4ae7586 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll
index 906112f..bfffe4b 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll
index 9ed52e3..9319bd6 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
index a9b52c9..27dbfc9 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll
index 81bbaa9..3775a64 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
index 318285d..0b61523 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll
index 8850308..918f0cc 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll
index 8ca8e69..8c69d5b 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll
index c4aeb44..ef52ead 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
index ca58099..4f8f8c2 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
index f2b3f9b..bd6b968 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
index b5adea5..aef446a 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
index 0041330..6d91253 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
index cb73030..8808ad9 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll
index ab7c42b..8039bd0 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll
index 3626125..9741147 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
index bfa9310..726fd28 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll
index 9dd42e7..c022bf8 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll
index 6f82c97..38aaf86 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll
index 323d527..649b13f 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll
index 06709ca..c7435bd 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll
index 838db0c..9e04fc2 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
; Test we can code generate patterns of the form:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll
index 7e3a175..b34fe43 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll
index 70219dd..9e56462 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
index 1757314..304823c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll
index 337a213..6c9c055 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
index 18cd4cc..c4a58ba 100644
--- a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
@@ -571,29 +571,27 @@ define void @trunc_v8i19_to_v8i8_in_loop(ptr %A, ptr %dst) {
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: LBB5_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldp x10, x9, [x0]
-; CHECK-NEXT: ldrb w13, [x0, #18]
-; CHECK-NEXT: ldrh w14, [x0, #16]
+; CHECK-NEXT: ldp x9, x10, [x0]
+; CHECK-NEXT: ldrb w14, [x0, #18]
+; CHECK-NEXT: ldrh w15, [x0, #16]
; CHECK-NEXT: add x0, x0, #32
-; CHECK-NEXT: ubfx x12, x9, #12, #20
-; CHECK-NEXT: fmov s0, w10
-; CHECK-NEXT: lsr x11, x10, #19
-; CHECK-NEXT: lsr x15, x9, #31
-; CHECK-NEXT: fmov s1, w12
-; CHECK-NEXT: lsr x12, x9, #50
-; CHECK-NEXT: mov.s v0[1], w11
-; CHECK-NEXT: orr w11, w14, w13, lsl #16
-; CHECK-NEXT: lsr x13, x10, #38
-; CHECK-NEXT: lsr x10, x10, #57
-; CHECK-NEXT: mov.s v1[1], w15
-; CHECK-NEXT: orr w12, w12, w11, lsl #14
-; CHECK-NEXT: orr w9, w10, w9, lsl #7
-; CHECK-NEXT: lsr w10, w11, #5
-; CHECK-NEXT: mov.s v0[2], w13
+; CHECK-NEXT: ubfx x12, x10, #12, #20
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: lsr x11, x9, #19
+; CHECK-NEXT: lsr x13, x10, #31
+; CHECK-NEXT: fmov s0, w12
+; CHECK-NEXT: lsr x12, x9, #38
+; CHECK-NEXT: extr x9, x10, x9, #57
+; CHECK-NEXT: mov.s v1[1], w11
+; CHECK-NEXT: orr x11, x15, x14, lsl #16
+; CHECK-NEXT: mov.s v0[1], w13
+; CHECK-NEXT: extr x13, x11, x10, #50
+; CHECK-NEXT: ubfx x10, x11, #5, #27
; CHECK-NEXT: mov.s v1[2], w12
-; CHECK-NEXT: mov.s v0[3], w9
-; CHECK-NEXT: mov.s v1[3], w10
-; CHECK-NEXT: uzp1.8h v0, v0, v1
+; CHECK-NEXT: mov.s v0[2], w13
+; CHECK-NEXT: mov.s v1[3], w9
+; CHECK-NEXT: mov.s v0[3], w10
+; CHECK-NEXT: uzp1.8h v0, v1, v0
; CHECK-NEXT: xtn.8b v0, v0
; CHECK-NEXT: str d0, [x1, x8, lsl #3]
; CHECK-NEXT: add x8, x8, #1
@@ -608,35 +606,34 @@ define void @trunc_v8i19_to_v8i8_in_loop(ptr %A, ptr %dst) {
; CHECK-BE-NEXT: .LBB5_1: // %loop
; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT: ldp x10, x9, [x0]
-; CHECK-BE-NEXT: ldrb w16, [x0, #18]
-; CHECK-BE-NEXT: lsr x11, x9, #40
-; CHECK-BE-NEXT: ubfx x12, x9, #33, #7
-; CHECK-BE-NEXT: lsr x15, x10, #45
-; CHECK-BE-NEXT: lsr x13, x10, #40
-; CHECK-BE-NEXT: ubfx x14, x10, #26, #14
-; CHECK-BE-NEXT: orr w11, w12, w11, lsl #7
-; CHECK-BE-NEXT: ldrh w12, [x0, #16]
-; CHECK-BE-NEXT: fmov s0, w15
-; CHECK-BE-NEXT: orr w13, w14, w13, lsl #14
-; CHECK-BE-NEXT: ubfx x14, x9, #14, #18
+; CHECK-BE-NEXT: ldrh w16, [x0, #16]
+; CHECK-BE-NEXT: ldrb w17, [x0, #18]
; CHECK-BE-NEXT: add x0, x0, #32
-; CHECK-BE-NEXT: fmov s1, w11
-; CHECK-BE-NEXT: orr w11, w16, w12, lsl #8
-; CHECK-BE-NEXT: lsl x12, x9, #24
-; CHECK-BE-NEXT: mov v0.s[1], w13
+; CHECK-BE-NEXT: lsl x11, x9, #24
+; CHECK-BE-NEXT: lsr x12, x9, #40
+; CHECK-BE-NEXT: lsr x13, x10, #45
+; CHECK-BE-NEXT: lsl x14, x10, #24
+; CHECK-BE-NEXT: lsr x15, x10, #40
+; CHECK-BE-NEXT: extr x12, x12, x11, #57
+; CHECK-BE-NEXT: fmov s0, w13
; CHECK-BE-NEXT: ubfx x13, x10, #7, #25
+; CHECK-BE-NEXT: extr x14, x15, x14, #50
+; CHECK-BE-NEXT: ubfx x15, x9, #14, #18
; CHECK-BE-NEXT: extr x9, x10, x9, #40
-; CHECK-BE-NEXT: orr w12, w11, w12
-; CHECK-BE-NEXT: mov v1.s[1], w14
-; CHECK-BE-NEXT: lsr w12, w12, #19
+; CHECK-BE-NEXT: fmov s1, w12
+; CHECK-BE-NEXT: orr w12, w17, w16, lsl #8
+; CHECK-BE-NEXT: mov v0.s[1], w14
; CHECK-BE-NEXT: ubfx x9, x9, #12, #20
+; CHECK-BE-NEXT: orr w11, w12, w11
+; CHECK-BE-NEXT: mov v1.s[1], w15
+; CHECK-BE-NEXT: lsr w11, w11, #19
; CHECK-BE-NEXT: mov v0.s[2], w13
-; CHECK-BE-NEXT: mov v1.s[2], w12
+; CHECK-BE-NEXT: mov v1.s[2], w11
; CHECK-BE-NEXT: mov v0.s[3], w9
; CHECK-BE-NEXT: add x9, x1, x8, lsl #3
; CHECK-BE-NEXT: add x8, x8, #1
; CHECK-BE-NEXT: cmp x8, #1000
-; CHECK-BE-NEXT: mov v1.s[3], w11
+; CHECK-BE-NEXT: mov v1.s[3], w12
; CHECK-BE-NEXT: uzp1 v0.8h, v0.8h, v1.8h
; CHECK-BE-NEXT: xtn v0.8b, v0.8h
; CHECK-BE-NEXT: st1 { v0.8b }, [x9]
@@ -650,35 +647,34 @@ define void @trunc_v8i19_to_v8i8_in_loop(ptr %A, ptr %dst) {
; CHECK-DISABLE-NEXT: .LBB5_1: // %loop
; CHECK-DISABLE-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-DISABLE-NEXT: ldp x10, x9, [x0]
-; CHECK-DISABLE-NEXT: ldrb w16, [x0, #18]
-; CHECK-DISABLE-NEXT: lsr x11, x9, #40
-; CHECK-DISABLE-NEXT: ubfx x12, x9, #33, #7
-; CHECK-DISABLE-NEXT: lsr x15, x10, #45
-; CHECK-DISABLE-NEXT: lsr x13, x10, #40
-; CHECK-DISABLE-NEXT: ubfx x14, x10, #26, #14
-; CHECK-DISABLE-NEXT: orr w11, w12, w11, lsl #7
-; CHECK-DISABLE-NEXT: ldrh w12, [x0, #16]
-; CHECK-DISABLE-NEXT: fmov s0, w15
-; CHECK-DISABLE-NEXT: orr w13, w14, w13, lsl #14
-; CHECK-DISABLE-NEXT: ubfx x14, x9, #14, #18
+; CHECK-DISABLE-NEXT: ldrh w16, [x0, #16]
+; CHECK-DISABLE-NEXT: ldrb w17, [x0, #18]
; CHECK-DISABLE-NEXT: add x0, x0, #32
-; CHECK-DISABLE-NEXT: fmov s1, w11
-; CHECK-DISABLE-NEXT: orr w11, w16, w12, lsl #8
-; CHECK-DISABLE-NEXT: lsl x12, x9, #24
-; CHECK-DISABLE-NEXT: mov v0.s[1], w13
+; CHECK-DISABLE-NEXT: lsl x11, x9, #24
+; CHECK-DISABLE-NEXT: lsr x12, x9, #40
+; CHECK-DISABLE-NEXT: lsr x13, x10, #45
+; CHECK-DISABLE-NEXT: lsl x14, x10, #24
+; CHECK-DISABLE-NEXT: lsr x15, x10, #40
+; CHECK-DISABLE-NEXT: extr x12, x12, x11, #57
+; CHECK-DISABLE-NEXT: fmov s0, w13
; CHECK-DISABLE-NEXT: ubfx x13, x10, #7, #25
+; CHECK-DISABLE-NEXT: extr x14, x15, x14, #50
+; CHECK-DISABLE-NEXT: ubfx x15, x9, #14, #18
; CHECK-DISABLE-NEXT: extr x9, x10, x9, #40
-; CHECK-DISABLE-NEXT: orr w12, w11, w12
-; CHECK-DISABLE-NEXT: mov v1.s[1], w14
-; CHECK-DISABLE-NEXT: lsr w12, w12, #19
+; CHECK-DISABLE-NEXT: fmov s1, w12
+; CHECK-DISABLE-NEXT: orr w12, w17, w16, lsl #8
+; CHECK-DISABLE-NEXT: mov v0.s[1], w14
; CHECK-DISABLE-NEXT: ubfx x9, x9, #12, #20
+; CHECK-DISABLE-NEXT: orr w11, w12, w11
+; CHECK-DISABLE-NEXT: mov v1.s[1], w15
+; CHECK-DISABLE-NEXT: lsr w11, w11, #19
; CHECK-DISABLE-NEXT: mov v0.s[2], w13
-; CHECK-DISABLE-NEXT: mov v1.s[2], w12
+; CHECK-DISABLE-NEXT: mov v1.s[2], w11
; CHECK-DISABLE-NEXT: mov v0.s[3], w9
; CHECK-DISABLE-NEXT: add x9, x1, x8, lsl #3
; CHECK-DISABLE-NEXT: add x8, x8, #1
; CHECK-DISABLE-NEXT: cmp x8, #1000
-; CHECK-DISABLE-NEXT: mov v1.s[3], w11
+; CHECK-DISABLE-NEXT: mov v1.s[3], w12
; CHECK-DISABLE-NEXT: uzp1 v0.8h, v0.8h, v1.8h
; CHECK-DISABLE-NEXT: xtn v0.8b, v0.8h
; CHECK-DISABLE-NEXT: st1 { v0.8b }, [x9]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir
index ac98dca..e3d31c7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir
@@ -1,18 +1,28 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GFX1100 %s
-# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx11-generic --amdhsa-code-object-version=6 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GFX1100 %s
-# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1150 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GFX1150 %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -o - -run-pass=legalizer %s -verify-machineinstrs | FileCheck -check-prefix=GFX1100 %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx11-generic --amdhsa-code-object-version=6 -o - -run-pass=legalizer %s -verify-machineinstrs | FileCheck -check-prefix=GFX1100 %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1150 -o - -run-pass=legalizer %s -verify-machineinstrs | FileCheck -check-prefix=GFX1150 %s
---
name: test_trap
body: |
bb.0:
; GFX1100-LABEL: name: test_trap
- ; GFX1100: successors: %bb.2(0x80000000)
+ ; GFX1100: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GFX1100-NEXT: {{ $}}
; GFX1100-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX1100-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
+ ; GFX1100-NEXT: S_CBRANCH_EXECNZ %bb.2, implicit $exec
+ ; GFX1100-NEXT: {{ $}}
+ ; GFX1100-NEXT: .1:
+ ; GFX1100-NEXT: successors:
+ ; GFX1100-NEXT: {{ $}}
+ ; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
+ ; GFX1100-NEXT: {{ $}}
+ ; GFX1100-NEXT: .2:
+ ; GFX1100-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1100-NEXT: {{ $}}
; GFX1100-NEXT: S_TRAP 2
; GFX1100-NEXT: [[S_SENDMSG_RTN_B32_:%[0-9]+]]:sreg_32 = S_SENDMSG_RTN_B32 128
; GFX1100-NEXT: $ttmp2 = S_MOV_B32 $m0
@@ -21,18 +31,13 @@ body: |
; GFX1100-NEXT: $m0 = S_MOV_B32 [[S_OR_B32_]]
; GFX1100-NEXT: S_SENDMSG 1, implicit $exec, implicit $m0
; GFX1100-NEXT: $m0 = S_MOV_B32 $ttmp2
- ; GFX1100-NEXT: S_BRANCH %bb.2
- ; GFX1100-NEXT: {{ $}}
- ; GFX1100-NEXT: .1:
- ; GFX1100-NEXT: successors:
+ ; GFX1100-NEXT: S_BRANCH %bb.3
; GFX1100-NEXT: {{ $}}
- ; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
- ; GFX1100-NEXT: {{ $}}
- ; GFX1100-NEXT: .2:
- ; GFX1100-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1100-NEXT: .3:
+ ; GFX1100-NEXT: successors: %bb.3(0x80000000)
; GFX1100-NEXT: {{ $}}
; GFX1100-NEXT: S_SETHALT 5
- ; GFX1100-NEXT: S_BRANCH %bb.2
+ ; GFX1100-NEXT: S_BRANCH %bb.3
;
; GFX1150-LABEL: name: test_trap
; GFX1150: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
@@ -45,5 +50,63 @@ body: |
G_STORE %0, %1 :: (store 1, addrspace 1)
G_TRAP
G_STORE %0, %1 :: (store 1, addrspace 1)
+...
+
+---
+name: test_fallthrough_trap
+body: |
+ ; GFX1100-LABEL: name: test_fallthrough_trap
+ ; GFX1100: bb.0:
+ ; GFX1100-NEXT: successors: %bb.1(0x80000000), %bb.2(0x00000000)
+ ; GFX1100-NEXT: {{ $}}
+ ; GFX1100-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX1100-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
+ ; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
+ ; GFX1100-NEXT: S_CBRANCH_EXECNZ %bb.2, implicit $exec
+ ; GFX1100-NEXT: {{ $}}
+ ; GFX1100-NEXT: bb.1:
+ ; GFX1100-NEXT: successors:
+ ; GFX1100-NEXT: {{ $}}
+ ; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
+ ; GFX1100-NEXT: {{ $}}
+ ; GFX1100-NEXT: bb.2:
+ ; GFX1100-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1100-NEXT: {{ $}}
+ ; GFX1100-NEXT: S_TRAP 2
+ ; GFX1100-NEXT: [[S_SENDMSG_RTN_B32_:%[0-9]+]]:sreg_32 = S_SENDMSG_RTN_B32 128
+ ; GFX1100-NEXT: $ttmp2 = S_MOV_B32 $m0
+ ; GFX1100-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_SENDMSG_RTN_B32_]], 1023, implicit-def $scc
+ ; GFX1100-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], 1024, implicit-def $scc
+ ; GFX1100-NEXT: $m0 = S_MOV_B32 [[S_OR_B32_]]
+ ; GFX1100-NEXT: S_SENDMSG 1, implicit $exec, implicit $m0
+ ; GFX1100-NEXT: $m0 = S_MOV_B32 $ttmp2
+ ; GFX1100-NEXT: S_BRANCH %bb.3
+ ; GFX1100-NEXT: {{ $}}
+ ; GFX1100-NEXT: bb.3:
+ ; GFX1100-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1100-NEXT: {{ $}}
+ ; GFX1100-NEXT: S_SETHALT 5
+ ; GFX1100-NEXT: S_BRANCH %bb.3
+ ;
+ ; GFX1150-LABEL: name: test_fallthrough_trap
+ ; GFX1150: bb.0:
+ ; GFX1150-NEXT: successors: %bb.1(0x80000000)
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX1150-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
+ ; GFX1150-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
+ ; GFX1150-NEXT: S_TRAP 2
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: bb.1:
+ ; GFX1150-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
+ bb.0:
+ successors: %bb.1
+
+ %0:_(s8) = G_CONSTANT i8 0
+ %1:_(p1) = G_CONSTANT i64 0
+ G_STORE %0, %1 :: (store 1, addrspace 1)
+ G_TRAP
+ bb.1:
+ G_STORE %0, %1 :: (store 1, addrspace 1)
...
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll
index c105ad7..7932f8d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll
@@ -302,7 +302,8 @@ define half @test_rootn_f16_neg1(half %x) {
define half @test_rootn_f16_neg2(half %x) {
; CHECK-LABEL: define half @test_rootn_f16_neg2(
; CHECK-SAME: half [[X:%.*]]) {
-; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = call half @_Z5rsqrtDh(half [[X]])
+; CHECK-NEXT: [[TMP1:%.*]] = call contract half @llvm.sqrt.f16(half [[X]])
+; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = fdiv contract half 0xH3C00, [[TMP1]], !fpmath [[META0]]
; CHECK-NEXT: ret half [[__ROOTN2RSQRT]]
;
%call = tail call half @_Z5rootnDhi(half %x, i32 -2)
@@ -371,7 +372,8 @@ define <2 x half> @test_rootn_v2f16_neg1(<2 x half> %x) {
define <2 x half> @test_rootn_v2f16_neg2(<2 x half> %x) {
; CHECK-LABEL: define <2 x half> @test_rootn_v2f16_neg2(
; CHECK-SAME: <2 x half> [[X:%.*]]) {
-; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = call <2 x half> @_Z5rsqrtDv2_Dh(<2 x half> [[X]])
+; CHECK-NEXT: [[TMP1:%.*]] = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> [[X]])
+; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = fdiv contract <2 x half> <half 0xH3C00, half 0xH3C00>, [[TMP1]], !fpmath [[META0]]
; CHECK-NEXT: ret <2 x half> [[__ROOTN2RSQRT]]
;
%call = tail call <2 x half> @_Z5rootnDv2_DhDv2_i(<2 x half> %x, <2 x i32> <i32 -2, i32 -2>)
@@ -865,7 +867,8 @@ define float @test_rootn_f32__y_neg2(float %x) {
; CHECK-LABEL: define float @test_rootn_f32__y_neg2(
; CHECK-SAME: float [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = call float @_Z5rsqrtf(float [[X]])
+; CHECK-NEXT: [[TMP0:%.*]] = call contract float @llvm.sqrt.f32(float [[X]])
+; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = fdiv contract float 1.000000e+00, [[TMP0]], !fpmath [[META0]]
; CHECK-NEXT: ret float [[__ROOTN2RSQRT]]
;
entry:
@@ -877,7 +880,8 @@ define float @test_rootn_f32__y_neg2__flags(float %x) {
; CHECK-LABEL: define float @test_rootn_f32__y_neg2__flags(
; CHECK-SAME: float [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = call nnan nsz float @_Z5rsqrtf(float [[X]])
+; CHECK-NEXT: [[TMP0:%.*]] = call nnan nsz contract float @llvm.sqrt.f32(float [[X]])
+; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = fdiv nnan nsz contract float 1.000000e+00, [[TMP0]], !fpmath [[META0]]
; CHECK-NEXT: ret float [[__ROOTN2RSQRT]]
;
entry:
@@ -889,7 +893,7 @@ define float @test_rootn_f32__y_neg2__strictfp(float %x) #1 {
; CHECK-LABEL: define float @test_rootn_f32__y_neg2__strictfp(
; CHECK-SAME: float [[X:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = call float @_Z5rsqrtf(float [[X]]) #[[ATTR0]]
+; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 -2) #[[ATTR0]]
; CHECK-NEXT: ret float [[__ROOTN2RSQRT]]
;
entry:
@@ -901,7 +905,7 @@ define float @test_rootn_f32__y_neg2__noinline(float %x) {
; CHECK-LABEL: define float @test_rootn_f32__y_neg2__noinline(
; CHECK-SAME: float [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = call float @_Z5rsqrtf(float [[X]])
+; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 -2) #[[ATTR3:[0-9]+]]
; CHECK-NEXT: ret float [[__ROOTN2RSQRT]]
;
entry:
@@ -913,7 +917,7 @@ define float @test_rootn_f32__y_neg2__nobuiltin(float %x) {
; CHECK-LABEL: define float @test_rootn_f32__y_neg2__nobuiltin(
; CHECK-SAME: float [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 -2) #[[ATTR3:[0-9]+]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 -2) #[[ATTR4:[0-9]+]]
; CHECK-NEXT: ret float [[CALL]]
;
entry:
@@ -925,7 +929,8 @@ define <2 x float> @test_rootn_v2f32__y_neg2(<2 x float> %x) {
; CHECK-LABEL: define <2 x float> @test_rootn_v2f32__y_neg2(
; CHECK-SAME: <2 x float> [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = call <2 x float> @_Z5rsqrtDv2_f(<2 x float> [[X]])
+; CHECK-NEXT: [[TMP0:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]])
+; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = fdiv contract <2 x float> <float 1.000000e+00, float 1.000000e+00>, [[TMP0]], !fpmath [[META0]]
; CHECK-NEXT: ret <2 x float> [[__ROOTN2RSQRT]]
;
entry:
@@ -937,7 +942,8 @@ define <2 x float> @test_rootn_v2f32__y_neg2__flags(<2 x float> %x) {
; CHECK-LABEL: define <2 x float> @test_rootn_v2f32__y_neg2__flags(
; CHECK-SAME: <2 x float> [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = call nnan nsz <2 x float> @_Z5rsqrtDv2_f(<2 x float> [[X]])
+; CHECK-NEXT: [[TMP0:%.*]] = call nnan nsz contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]])
+; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = fdiv nnan nsz contract <2 x float> <float 1.000000e+00, float 1.000000e+00>, [[TMP0]], !fpmath [[META0]]
; CHECK-NEXT: ret <2 x float> [[__ROOTN2RSQRT]]
;
entry:
@@ -949,7 +955,7 @@ define <2 x float> @test_rootn_v2f32__y_neg2__strictfp(<2 x float> %x) #1 {
; CHECK-LABEL: define <2 x float> @test_rootn_v2f32__y_neg2__strictfp(
; CHECK-SAME: <2 x float> [[X:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = call <2 x float> @_Z5rsqrtDv2_f(<2 x float> [[X]]) #[[ATTR0]]
+; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = tail call <2 x float> @_Z5rootnDv2_fDv2_i(<2 x float> [[X]], <2 x i32> <i32 -2, i32 -2>) #[[ATTR0]]
; CHECK-NEXT: ret <2 x float> [[__ROOTN2RSQRT]]
;
entry:
@@ -1125,7 +1131,7 @@ define float @test_rootn_fast_f32_nobuiltin(float %x, i32 %y) {
; CHECK-LABEL: define float @test_rootn_fast_f32_nobuiltin(
; CHECK-SAME: float [[X:%.*]], i32 [[Y:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call fast float @_Z5rootnfi(float [[X]], i32 [[Y]]) #[[ATTR3]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call fast float @_Z5rootnfi(float [[X]], i32 [[Y]]) #[[ATTR4]]
; CHECK-NEXT: ret float [[CALL]]
;
entry:
@@ -1420,7 +1426,7 @@ entry:
define float @test_rootn_f32__y_0_nobuiltin(float %x) {
; CHECK-LABEL: define float @test_rootn_f32__y_0_nobuiltin(
; CHECK-SAME: float [[X:%.*]]) {
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 0) #[[ATTR3]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 0) #[[ATTR4]]
; CHECK-NEXT: ret float [[CALL]]
;
%call = tail call float @_Z5rootnfi(float %x, i32 0) #0
@@ -1430,7 +1436,7 @@ define float @test_rootn_f32__y_0_nobuiltin(float %x) {
define float @test_rootn_f32__y_1_nobuiltin(float %x) {
; CHECK-LABEL: define float @test_rootn_f32__y_1_nobuiltin(
; CHECK-SAME: float [[X:%.*]]) {
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 1) #[[ATTR3]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 1) #[[ATTR4]]
; CHECK-NEXT: ret float [[CALL]]
;
%call = tail call float @_Z5rootnfi(float %x, i32 1) #0
@@ -1440,7 +1446,7 @@ define float @test_rootn_f32__y_1_nobuiltin(float %x) {
define float @test_rootn_f32__y_2_nobuiltin(float %x) {
; CHECK-LABEL: define float @test_rootn_f32__y_2_nobuiltin(
; CHECK-SAME: float [[X:%.*]]) {
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 2) #[[ATTR3]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 2) #[[ATTR4]]
; CHECK-NEXT: ret float [[CALL]]
;
%call = tail call float @_Z5rootnfi(float %x, i32 2) #0
@@ -1450,7 +1456,7 @@ define float @test_rootn_f32__y_2_nobuiltin(float %x) {
define float @test_rootn_f32__y_3_nobuiltin(float %x) {
; CHECK-LABEL: define float @test_rootn_f32__y_3_nobuiltin(
; CHECK-SAME: float [[X:%.*]]) {
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 3) #[[ATTR3]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 3) #[[ATTR4]]
; CHECK-NEXT: ret float [[CALL]]
;
%call = tail call float @_Z5rootnfi(float %x, i32 3) #0
@@ -1460,7 +1466,7 @@ define float @test_rootn_f32__y_3_nobuiltin(float %x) {
define float @test_rootn_f32__y_neg1_nobuiltin(float %x) {
; CHECK-LABEL: define float @test_rootn_f32__y_neg1_nobuiltin(
; CHECK-SAME: float [[X:%.*]]) {
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 -1) #[[ATTR3]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 -1) #[[ATTR4]]
; CHECK-NEXT: ret float [[CALL]]
;
%call = tail call float @_Z5rootnfi(float %x, i32 -1) #0
@@ -1470,7 +1476,7 @@ define float @test_rootn_f32__y_neg1_nobuiltin(float %x) {
define float @test_rootn_f32__y_neg2_nobuiltin(float %x) {
; CHECK-LABEL: define float @test_rootn_f32__y_neg2_nobuiltin(
; CHECK-SAME: float [[X:%.*]]) {
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 -2) #[[ATTR3]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 -2) #[[ATTR4]]
; CHECK-NEXT: ret float [[CALL]]
;
%call = tail call float @_Z5rootnfi(float %x, i32 -2) #0
@@ -1487,7 +1493,8 @@ attributes #2 = { noinline }
; CHECK: attributes #[[ATTR0]] = { strictfp }
; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind memory(read) }
-; CHECK: attributes #[[ATTR3]] = { nobuiltin }
+; CHECK: attributes #[[ATTR3]] = { noinline }
+; CHECK: attributes #[[ATTR4]] = { nobuiltin }
;.
; CHECK: [[META0]] = !{float 2.000000e+00}
; CHECK: [[META1]] = !{float 3.000000e+00}
diff --git a/llvm/test/CodeGen/AMDGPU/call-defs-mode-register.ll b/llvm/test/CodeGen/AMDGPU/call-defs-mode-register.ll
new file mode 100644
index 0000000..0c4974f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/call-defs-mode-register.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -simplify-mir -stop-after=finalize-isel < %s | FileCheck %s
+
+; Check that call / asm get an implicit-def $mode added to them in
+; strictfp functions.
+
+declare protected void @maybe_defs_mode() #0
+
+define float @call_changes_mode(float %x, float %y) #0 {
+ ; CHECK-LABEL: name: call_changes_mode
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @maybe_defs_mode, target-flags(amdgpu-rel32-hi) @maybe_defs_mode, implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]]
+ ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[SI_PC_ADD_REL_OFFSET]], @maybe_defs_mode, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $mode
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; CHECK-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_F32_e64_]]
+ ; CHECK-NEXT: SI_RETURN implicit $vgpr0
+ call void @maybe_defs_mode()
+ %val = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret float %val
+}
+
+define void @tail_call_changes_mode() #0 {
+ ; CHECK-LABEL: name: tail_call_changes_mode
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:ccr_sgpr_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @maybe_defs_mode, target-flags(amdgpu-rel32-hi) @maybe_defs_mode, implicit-def dead $scc
+ ; CHECK-NEXT: SI_TCRETURN killed [[SI_PC_ADD_REL_OFFSET]], @maybe_defs_mode, 0, csr_amdgpu, implicit-def $mode
+ tail call void @maybe_defs_mode()
+ ret void
+}
+
+define float @asm_changes_mode(float %x, float %y) #0 {
+ ; CHECK-LABEL: name: asm_changes_mode
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: INLINEASM &"; maybe defs mode", 1 /* sideeffect attdialect */, implicit-def $mode
+ ; CHECK-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_F32_e64_]]
+ ; CHECK-NEXT: SI_RETURN implicit $vgpr0
+ call void asm sideeffect "; maybe defs mode", ""()
+ %val = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret float %val
+}
+
+declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata)
+
+attributes #0 = { strictfp "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine.ll b/llvm/test/CodeGen/AMDGPU/dpp_combine.ll
index cfc166e..5162092f 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp_combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/dpp_combine.ll
@@ -47,6 +47,21 @@ define amdgpu_kernel void @dpp_fadd(ptr addrspace(1) %arg) {
ret void
}
+; Fails to combine because v_mul_lo_u32 has no e32 or dpp form.
+; GCN-LABEL: {{^}}dpp_mul:
+; GCN: global_load_{{dword|b32}} [[V:v[0-9]+]],
+; GCN: v_mov_b32_e32 [[V2:v[0-9]+]], [[V]]
+; GCN: v_mov_b32_dpp [[V2]], [[V2]] quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
+; GCN: v_mul_lo_u32 [[V]], [[V2]], [[V]]{{$}}
+define amdgpu_kernel void @dpp_mul(ptr addrspace(1) %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %id
+ %load = load i32, ptr addrspace(1) %gep
+ %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %load, i32 %load, i32 1, i32 15, i32 15, i1 1)
+ %mul = mul i32 %tmp0, %load
+ store i32 %mul, ptr addrspace(1) %gep
+ ret void
+}
declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
index 6e45084..9690e12 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
@@ -1,98 +1,3251 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
-
-; GCN-LABEL: {{^}}test_fmaximum3_olt_0_f32:
-; GCN: buffer_load_b32 [[REGC:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGB:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGA:v[0-9]+]]
-; GCN: v_maximum3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; GCN: buffer_store_b32 [[RESULT]],
-define amdgpu_kernel void @test_fmaximum3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
- %a = load volatile float, ptr addrspace(1) %aptr, align 4
- %b = load volatile float, ptr addrspace(1) %bptr, align 4
- %c = load volatile float, ptr addrspace(1) %cptr, align 4
- %f0 = call float @llvm.maximum.f32(float %a, float %b)
- %f1 = call float @llvm.maximum.f32(float %f0, float %c)
- store float %f1, ptr addrspace(1) %out, align 4
- ret void
-}
-
-; Commute operand of second fmaximum
-; GCN-LABEL: {{^}}test_fmaximum3_olt_1_f32:
-; GCN: buffer_load_b32 [[REGB:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGA:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGC:v[0-9]+]]
-; GCN: v_maximum3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; GCN: buffer_store_b32 [[RESULT]],
-define amdgpu_kernel void @test_fmaximum3_olt_1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
- %a = load volatile float, ptr addrspace(1) %aptr, align 4
- %b = load volatile float, ptr addrspace(1) %bptr, align 4
- %c = load volatile float, ptr addrspace(1) %cptr, align 4
- %f0 = call float @llvm.maximum.f32(float %a, float %b)
- %f1 = call float @llvm.maximum.f32(float %c, float %f0)
- store float %f1, ptr addrspace(1) %out, align 4
- ret void
-}
-
-; GCN-LABEL: {{^}}test_fmaximum3_olt_0_f16:
-; GCN: buffer_load_u16 [[REGC:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGB:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGA:v[0-9]+]]
-; GCN: v_maximum3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; GCN: buffer_store_b16 [[RESULT]],
-define amdgpu_kernel void @test_fmaximum3_olt_0_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
- %a = load volatile half, ptr addrspace(1) %aptr, align 2
- %b = load volatile half, ptr addrspace(1) %bptr, align 2
- %c = load volatile half, ptr addrspace(1) %cptr, align 2
- %f0 = call half @llvm.maximum.f16(half %a, half %b)
- %f1 = call half @llvm.maximum.f16(half %f0, half %c)
- store half %f1, ptr addrspace(1) %out, align 2
- ret void
-}
-
-; GCN-LABEL: {{^}}test_fmaximum3_olt_1_f16:
-; GCN: buffer_load_u16 [[REGA:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGB:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGC:v[0-9]+]]
-; GCN: v_maximum3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGA]], [[REGB]]
-; GCN: buffer_store_b16 [[RESULT]],
-define amdgpu_kernel void @test_fmaximum3_olt_1_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
- %a = load volatile half, ptr addrspace(1) %aptr, align 2
- %b = load volatile half, ptr addrspace(1) %bptr, align 2
- %c = load volatile half, ptr addrspace(1) %cptr, align 2
- %f0 = call half @llvm.maximum.f16(half %a, half %b)
- %f1 = call half @llvm.maximum.f16(half %c, half %f0)
- store half %f1, ptr addrspace(1) %out, align 2
- ret void
-}
-
-; Checks whether the test passes; performMinMaxCombine() should not optimize vector patterns of maximum3
-; since there are no pack instructions for fmaximum3.
-; GCN-LABEL: {{^}}no_fmaximum3_v2f16:
-; GCN: v_pk_maximum_f16 v0, v0, v1
-; GCN: v_pk_maximum_f16 v0, v2, v0
-; GCN: v_pk_maximum_f16 v0, v0, v3
-; GCN-NEXT: s_setpc_b64
-define <2 x half> @no_fmaximum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) {
-entry:
- %max = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
- %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %c, <2 x half> %max)
- %res = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max1, <2 x half> %d)
- ret <2 x half> %res
-}
-
-; GCN-LABEL: {{^}}no_fmaximum3_olt_0_f64:
-; GCN-COUNT-2: v_maximum_f64
-define amdgpu_kernel void @no_fmaximum3_olt_0_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
- %a = load volatile double, ptr addrspace(1) %aptr, align 4
- %b = load volatile double, ptr addrspace(1) %bptr, align 4
- %c = load volatile double, ptr addrspace(1) %cptr, align 4
- %f0 = call double @llvm.maximum.f64(double %a, double %b)
- %f1 = call double @llvm.maximum.f64(double %f0, double %c)
- store double %f1, ptr addrspace(1) %out, align 4
- ret void
-}
-
-declare double @llvm.maximum.f64(double, double)
-declare float @llvm.maximum.f32(float, float)
-declare half @llvm.maximum.f16(half, half)
-declare <2 x half> @llvm.maximum.v2f16(<2 x half>, <2 x half>)
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+
+define float @v_fmaximum3_f32(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f32_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.maximum.f32(float %a, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_commute(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v2, v0, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f32_e32 v1, v2, v0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.maximum.f32(float %a, float %b)
+ %max1 = call float @llvm.maximum.f32(float %c, float %max0)
+ ret float %max1
+}
+
+define amdgpu_ps i32 @s_fmaximum3_f32(float inreg %a, float inreg %b, float inreg %c) {
+; GFX12-LABEL: s_fmaximum3_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_maximum3_f32 v0, s0, s1, v0
+; GFX12-NEXT: v_readfirstlane_b32 s0, v0
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_fmaximum3_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_max_f32_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_max_f32_e32 v1, s2, v0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s2, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+ %max0 = call float @llvm.maximum.f32(float %a, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+ %cast = bitcast float %max1 to i32
+ %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
+ ret i32 %readfirstlane
+}
+
+define float @v_fmaximum3_f32_fabs0(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fabs0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, |v0|, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fabs0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e64 v3, |v0|, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v0|, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f32_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call float @llvm.fabs.f32(float %a)
+ %max0 = call float @llvm.maximum.f32(float %a.fabs, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_fabs1(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fabs1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, |v1|, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fabs1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e64 v3, v0, |v1|
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v1|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f32_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %b.fabs = call float @llvm.fabs.f32(float %b)
+ %max0 = call float @llvm.maximum.f32(float %a, float %b.fabs)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_fabs2(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fabs2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, |v2|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fabs2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f32_e64 v1, v0, |v2|
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %c.fabs = call float @llvm.fabs.f32(float %c)
+ %max0 = call float @llvm.maximum.f32(float %a, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c.fabs)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_fabs_all(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, |v0|, |v1|, |v2|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e64 v3, |v0|, |v1|
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v1|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f32_e64 v1, v0, |v2|
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call float @llvm.fabs.f32(float %a)
+ %b.fabs = call float @llvm.fabs.f32(float %b)
+ %c.fabs = call float @llvm.fabs.f32(float %c)
+ %max0 = call float @llvm.maximum.f32(float %a.fabs, float %b.fabs)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c.fabs)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_fneg_all(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, -v0, -v1, -v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fneg_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e64 v3, -v0, -v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v0, -v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f32_e64 v1, v0, -v2
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg float %a
+ %b.fneg = fneg float %b
+ %c.fneg = fneg float %c
+ %max0 = call float @llvm.maximum.f32(float %a.fneg, float %b.fneg)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c.fneg)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_fneg_fabs_all(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fneg_fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, -|v0|, -|v1|, -|v2|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fneg_fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e64 v3, -|v0|, -|v1|
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -|v0|, -|v1|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f32_e64 v1, v0, -|v2|
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -|v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call float @llvm.fabs.f32(float %a)
+ %b.fabs = call float @llvm.fabs.f32(float %b)
+ %c.fabs = call float @llvm.fabs.f32(float %c)
+ %a.fneg.fabs = fneg float %a.fabs
+ %b.fneg.fabs = fneg float %b.fabs
+ %c.fneg.fabs = fneg float %c.fabs
+ %max0 = call float @llvm.maximum.f32(float %a.fneg.fabs, float %b.fneg.fabs)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c.fneg.fabs)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_fneg0(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fneg0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, -v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fneg0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e64 v3, -v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f32_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg float %a
+ %max0 = call float @llvm.maximum.f32(float %a.fneg, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_fneg1(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fneg1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, -v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fneg1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e64 v3, v0, -v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f32_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %b.fneg = fneg float %b
+ %max0 = call float @llvm.maximum.f32(float %a, float %b.fneg)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_fneg2(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fneg2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, -v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fneg2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f32_e64 v1, v0, -v2
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %c.fneg = fneg float %c
+ %max0 = call float @llvm.maximum.f32(float %a, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c.fneg)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_const0(float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_const0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, 0x41000000, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_const0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v2, 0x41000000, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.maximum.f32(float 8.0, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32__const2(float %a, float %b) {
+; GFX12-LABEL: v_fmaximum3_f32__const2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, 0x41000000
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32__const2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_max_f32_e32 v1, 0x41000000, v0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.maximum.f32(float %a, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float 8.0)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_inlineimm0(float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_inlineimm0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, 4.0, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_inlineimm0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v2, 4.0, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.maximum.f32(float 4.0, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32__inlineimm(float %a, float %b) {
+; GFX12-LABEL: v_fmaximum3_f32__inlineimm:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, 4.0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32__inlineimm:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_max_f32_e32 v1, 4.0, v0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.maximum.f32(float %a, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float 4.0)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_const1_const2(float %a) {
+; GFX12-LABEL: v_fmaximum3_f32_const1_const2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s0, 0x41000000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_maximum3_f32 v0, v0, s0, 0x41800000
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_const1_const2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v1, 0x41000000, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_max_f32_e32 v1, 0x41800000, v0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.maximum.f32(float %a, float 8.0)
+ %max1 = call float @llvm.maximum.f32(float %max0, float 16.0)
+ ret float %max1
+}
+
+define <2 x float> @v_fmaximum3_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v4, v0, v2
+; GFX12-NEXT: v_maximum3_f32 v1, v5, v1, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v4, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v5, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v5, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v5, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v5, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
+ %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %c, <2 x float> %max0)
+ ret <2 x float> %max1
+}
+
+define <2 x float> @v_fmaximum3_v2f32_commute(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f32_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, v2, v4
+; GFX12-NEXT: v_maximum3_f32 v1, v1, v3, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f32_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
+ %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> %c)
+ ret <2 x float> %max1
+}
+
+define <2 x float> @v_fmaximum3_v2f32__fabs_all(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f32__fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, |v0|, |v2|, |v4|
+; GFX12-NEXT: v_maximum3_f32 v1, |v1|, |v3|, |v5|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f32__fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, |v3|
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], |v1|, |v3|
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v7, |v6|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v1|, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v6, |v1|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v3|, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, |v3|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, |v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], |v0|, |v2|
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, |v3|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v0|, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, |v0|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v2|, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, |v2|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e64 s[4:5], v0, |v4|
+; GFX9-NEXT: v_cndmask_b32_e64 v2, |v4|, v0, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v4|
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v4|, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, |v4|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cmp_gt_f32_e64 s[4:5], v1, |v5|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, |v5|, v1, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, |v5|
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v5|, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, |v5|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
+ %b.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %b)
+ %c.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %c)
+ %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a.fabs, <2 x float> %b.fabs)
+ %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> %c.fabs)
+ ret <2 x float> %max1
+}
+
+define <2 x float> @v_fmaximum3_v2f32__fneg_all(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f32__fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, -v0, -v2, -v4
+; GFX12-NEXT: v_maximum3_f32 v1, -v1, -v3, -v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f32__fneg_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_gt_f32_e64 vcc, -v1, -v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], -v1, -v3
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v7, -v6, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v1, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v6, -v1, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v3, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -v3, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e64 vcc, -v0, -v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], -v0, -v2
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, -v3, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v0, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, -v0, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v2, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -v2, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e64 s[4:5], v0, -v4
+; GFX9-NEXT: v_cndmask_b32_e64 v2, -v4, v0, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v4
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v4, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -v4, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cmp_gt_f32_e64 s[4:5], v1, -v5
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, -v5, v1, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, -v5
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v5, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -v5, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg <2 x float> %a
+ %b.fneg = fneg <2 x float> %b
+ %c.fneg = fneg <2 x float> %c
+ %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a.fneg, <2 x float> %b.fneg)
+ %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> %c.fneg)
+ ret <2 x float> %max1
+}
+
+define <2 x float> @v_fmaximum3_v2f32__inlineimm1(<2 x float> %a, <2 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f32__inlineimm1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, 2.0, v2
+; GFX12-NEXT: v_maximum3_f32 v1, v1, 2.0, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f32__inlineimm1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, 2.0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v4, 2.0, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, 2.0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v4, 2.0, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> <float 2.0, float 2.0>)
+ %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> %c)
+ ret <2 x float> %max1
+}
+
+define <2 x float> @v_fmaximum3_v2f32__inlineimm2(<2 x float> %a, <2 x float> %b) {
+; GFX12-LABEL: v_fmaximum3_v2f32__inlineimm2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, v2, 4.0
+; GFX12-NEXT: v_maximum3_f32 v1, v1, v3, 4.0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f32__inlineimm2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, 4.0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v2, 4.0, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, 4.0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v2, 4.0, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
+ %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> <float 4.0, float 4.0>)
+ ret <2 x float> %max1
+}
+
+define <3 x float> @v_fmaximum3_v3f32(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v3f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v6, v0, v3
+; GFX12-NEXT: v_maximum3_f32 v1, v7, v1, v4
+; GFX12-NEXT: v_maximum3_f32 v2, v8, v2, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v5, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v6, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v6, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v6, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v6, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v7, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v7, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v7, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v7, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v7, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v8, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v8, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v8, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v8, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v8, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b)
+ %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %c, <3 x float> %max0)
+ ret <3 x float> %max1
+}
+
+define <3 x float> @v_fmaximum3_v3f32_commute(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v3f32_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, v3, v6
+; GFX12-NEXT: v_maximum3_f32 v1, v1, v4, v7
+; GFX12-NEXT: v_maximum3_f32 v2, v2, v5, v8
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f32_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v5, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v6, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v7, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v2, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v2, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v8, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b)
+ %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %max0, <3 x float> %c)
+ ret <3 x float> %max1
+}
+
+define <3 x float> @v_fmaximum3_v3f32__fabs_all(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v3f32__fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, |v0|, |v3|, |v6|
+; GFX12-NEXT: v_maximum3_f32 v1, |v1|, |v4|, |v7|
+; GFX12-NEXT: v_maximum3_f32 v2, |v2|, |v5|, |v8|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f32__fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_gt_f32_e64 vcc, |v2|, |v5|
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v5, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], |v2|, |v5|
+; GFX9-NEXT: v_cndmask_b32_e64 v9, v10, |v9|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v2|, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v9, |v2|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v5|, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, |v5|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, |v4|
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], |v1|, |v4|
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v10, |v5|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v1|, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, |v1|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v4|, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, |v4|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, |v3|
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], |v0|, |v3|
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, |v4|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v0|, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, |v0|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v3|, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, |v3|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e64 s[4:5], v0, |v6|
+; GFX9-NEXT: v_cndmask_b32_e64 v3, |v6|, v0, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v6|
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v6|, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, |v6|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cmp_gt_f32_e64 s[4:5], v1, |v7|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, |v7|, v1, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, |v7|
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v7|, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, |v7|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cmp_gt_f32_e64 s[4:5], v2, |v8|
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, |v8|, v2, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v2, |v8|
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v8|, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, |v8|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %a)
+ %b.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %b)
+ %c.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %c)
+ %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a.fabs, <3 x float> %b.fabs)
+ %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %max0, <3 x float> %c.fabs)
+ ret <3 x float> %max1
+}
+
+define <3 x float> @v_fmaximum3_v3f32__fneg_all(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v3f32__fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, -v0, -v3, -v6
+; GFX12-NEXT: v_maximum3_f32 v1, -v1, -v4, -v7
+; GFX12-NEXT: v_maximum3_f32 v2, -v2, -v5, -v8
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f32__fneg_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_gt_f32_e64 vcc, -v2, -v5
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v5, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], -v2, -v5
+; GFX9-NEXT: v_cndmask_b32_e64 v9, v10, -v9, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v2, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v9, -v2, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v5, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -v5, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e64 vcc, -v1, -v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], -v1, -v4
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v10, -v5, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v1, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, -v1, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v4, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -v4, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e64 vcc, -v0, -v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], -v0, -v3
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, -v4, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v0, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, -v0, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v3, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -v3, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e64 s[4:5], v0, -v6
+; GFX9-NEXT: v_cndmask_b32_e64 v3, -v6, v0, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v6
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v6, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -v6, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cmp_gt_f32_e64 s[4:5], v1, -v7
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, -v7, v1, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, -v7
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v7, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -v7, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cmp_gt_f32_e64 s[4:5], v2, -v8
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, -v8, v2, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v2, -v8
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v8, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -v8, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg <3 x float> %a
+ %b.fneg = fneg <3 x float> %b
+ %c.fneg = fneg <3 x float> %c
+ %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a.fneg, <3 x float> %b.fneg)
+ %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %max0, <3 x float> %c.fneg)
+ ret <3 x float> %max1
+}
+
+define <3 x float> @v_fmaximum3_v3f32__inlineimm1(<3 x float> %a, <3 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v3f32__inlineimm1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, 2.0, v3
+; GFX12-NEXT: v_maximum3_f32 v1, v1, 2.0, v4
+; GFX12-NEXT: v_maximum3_f32 v2, v2, 2.0, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f32__inlineimm1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, 2.0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v6, 2.0, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v6, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, 2.0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v6, 2.0, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, 2.0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v6, 2.0, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> <float 2.0, float 2.0, float 2.0>)
+ %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %max0, <3 x float> %c)
+ ret <3 x float> %max1
+}
+
+define <3 x float> @v_fmaximum3_v3f32__inlineimm2(<3 x float> %a, <3 x float> %b) {
+; GFX12-LABEL: v_fmaximum3_v3f32__inlineimm2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, v3, 4.0
+; GFX12-NEXT: v_maximum3_f32 v1, v1, v4, 4.0
+; GFX12-NEXT: v_maximum3_f32 v2, v2, v5, 4.0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f32__inlineimm2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, 4.0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 4.0, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, 4.0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 4.0, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, 4.0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 4.0, v2, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b)
+ %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %max0, <3 x float> <float 4.0, float 4.0, float 4.0>)
+ ret <3 x float> %max1
+}
+
+
+define half @v_fmaximum3_f16(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f16_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call half @llvm.maximum.f16(half %a, half %b)
+ %max1 = call half @llvm.maximum.f16(half %max0, half %c)
+ ret half %max1
+}
+
+define half @v_fmaximum3_f16_commute(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f16 v0, v2, v0, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f16_e32 v1, v2, v0
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call half @llvm.maximum.f16(half %a, half %b)
+ %max1 = call half @llvm.maximum.f16(half %c, half %max0)
+ ret half %max1
+}
+
+define amdgpu_ps i32 @s_fmaximum3_f16(half inreg %a, half inreg %b, half inreg %c) {
+; GFX12-LABEL: s_fmaximum3_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_maximum3_f16 v0, s0, s1, v0
+; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_readfirstlane_b32 s0, v0
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_fmaximum3_f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_max_f16_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_max_f16_e32 v1, s2, v0
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s2, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+ %max0 = call half @llvm.maximum.f16(half %a, half %b)
+ %max1 = call half @llvm.maximum.f16(half %max0, half %c)
+ %cast = bitcast half %max1 to i16
+ %zext = zext i16 %cast to i32
+ %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+ ret i32 %readfirstlane
+}
+
+define half @v_fmaximum3_f16_fabs0(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_fabs0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f16 v0, |v0|, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_fabs0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e64 v3, |v0|, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f16_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call half @llvm.fabs.f16(half %a)
+ %max0 = call half @llvm.maximum.f16(half %a.fabs, half %b)
+ %max1 = call half @llvm.maximum.f16(half %max0, half %c)
+ ret half %max1
+}
+
+define half @v_fmaximum3_f16_fabs1(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_fabs1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f16 v0, v0, |v1|, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_fabs1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e64 v3, v0, |v1|
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v1|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f16_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %b.fabs = call half @llvm.fabs.f16(half %b)
+ %max0 = call half @llvm.maximum.f16(half %a, half %b.fabs)
+ %max1 = call half @llvm.maximum.f16(half %max0, half %c)
+ ret half %max1
+}
+
+define half @v_fmaximum3_f16_fabs2(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_fabs2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, |v2|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_fabs2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f16_e64 v1, v0, |v2|
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %c.fabs = call half @llvm.fabs.f16(half %c)
+ %max0 = call half @llvm.maximum.f16(half %a, half %b)
+ %max1 = call half @llvm.maximum.f16(half %max0, half %c.fabs)
+ ret half %max1
+}
+
+define half @v_fmaximum3_f16_fabs_all(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f16 v0, |v0|, |v1|, |v2|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e64 v3, |v0|, |v1|
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f16_e64 v1, v0, |v2|
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call half @llvm.fabs.f16(half %a)
+ %b.fabs = call half @llvm.fabs.f16(half %b)
+ %c.fabs = call half @llvm.fabs.f16(half %c)
+ %max0 = call half @llvm.maximum.f16(half %a.fabs, half %b.fabs)
+ %max1 = call half @llvm.maximum.f16(half %max0, half %c.fabs)
+ ret half %max1
+}
+
+define half @v_fmaximum3_f16_fneg_all(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f16 v0, -v0, -v1, -v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_fneg_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e64 v3, -v0, -v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f16_e64 v1, v0, -v2
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg half %a
+ %b.fneg = fneg half %b
+ %c.fneg = fneg half %c
+ %max0 = call half @llvm.maximum.f16(half %a.fneg, half %b.fneg)
+ %max1 = call half @llvm.maximum.f16(half %max0, half %c.fneg)
+ ret half %max1
+}
+
+define half @v_fmaximum3_f16_fneg_fabs_all(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_fneg_fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f16 v0, -|v0|, -|v1|, -|v2|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_fneg_fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e64 v3, -|v0|, -|v1|
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -|v0|, -|v1|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f16_e64 v1, v0, -|v2|
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -|v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call half @llvm.fabs.f16(half %a)
+ %b.fabs = call half @llvm.fabs.f16(half %b)
+ %c.fabs = call half @llvm.fabs.f16(half %c)
+ %a.fneg.fabs = fneg half %a.fabs
+ %b.fneg.fabs = fneg half %b.fabs
+ %c.fneg.fabs = fneg half %c.fabs
+ %max0 = call half @llvm.maximum.f16(half %a.fneg.fabs, half %b.fneg.fabs)
+ %max1 = call half @llvm.maximum.f16(half %max0, half %c.fneg.fabs)
+ ret half %max1
+}
+
+define half @v_fmaximum3_f16_fneg0(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_fneg0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f16 v0, -v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_fneg0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e64 v3, -v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f16_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg half %a
+ %max0 = call half @llvm.maximum.f16(half %a.fneg, half %b)
+ %max1 = call half @llvm.maximum.f16(half %max0, half %c)
+ ret half %max1
+}
+
+define half @v_fmaximum3_f16_fneg1(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_fneg1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f16 v0, v0, -v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_fneg1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e64 v3, v0, -v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f16_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %b.fneg = fneg half %b
+ %max0 = call half @llvm.maximum.f16(half %a, half %b.fneg)
+ %max1 = call half @llvm.maximum.f16(half %max0, half %c)
+ ret half %max1
+}
+
+define half @v_fmaximum3_f16_fneg2(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_fneg2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, -v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_fneg2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f16_e64 v1, v0, -v2
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %c.fneg = fneg half %c
+ %max0 = call half @llvm.maximum.f16(half %a, half %b)
+ %max1 = call half @llvm.maximum.f16(half %max0, half %c.fneg)
+ ret half %max1
+}
+
+define half @v_fmaximum3_f16_const0(half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_const0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f16 v0, v0, 0x4800, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_const0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v2, 0x4800, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_max_f16_e32 v2, v0, v1
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call half @llvm.maximum.f16(half 8.0, half %b)
+ %max1 = call half @llvm.maximum.f16(half %max0, half %c)
+ ret half %max1
+}
+
+define half @v_fmaximum3_f16__const2(half %a, half %b) {
+; GFX12-LABEL: v_fmaximum3_f16__const2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, 0x4800
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16__const2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_max_f16_e32 v1, 0x4800, v0
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call half @llvm.maximum.f16(half %a, half %b)
+ %max1 = call half @llvm.maximum.f16(half %max0, half 8.0)
+ ret half %max1
+}
+
+define half @v_fmaximum3_f16_inlineimm0(half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_inlineimm0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f16 v0, v0, 4.0, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_inlineimm0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v2, 4.0, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_max_f16_e32 v2, v0, v1
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call half @llvm.maximum.f16(half 4.0, half %b)
+ %max1 = call half @llvm.maximum.f16(half %max0, half %c)
+ ret half %max1
+}
+
+define half @v_fmaximum3_f16__inlineimm(half %a, half %b) {
+; GFX12-LABEL: v_fmaximum3_f16__inlineimm:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, 4.0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16__inlineimm:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_max_f16_e32 v1, 4.0, v0
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call half @llvm.maximum.f16(half %a, half %b)
+ %max1 = call half @llvm.maximum.f16(half %max0, half 4.0)
+ ret half %max1
+}
+
+define half @v_fmaximum3_f16_const1_const2(half %a) {
+; GFX12-LABEL: v_fmaximum3_f16_const1_const2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_movk_i32 s0, 0x4800
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_maximum3_f16 v0, v0, s0, 0x4c00
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_const1_const2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v1, 0x4800, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_max_f16_e32 v1, 0x4c00, v0
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call half @llvm.maximum.f16(half %a, half 8.0)
+ %max1 = call half @llvm.maximum.f16(half %max0, half 16.0)
+ ret half %max1
+}
+
+define <2 x half> @v_fmaximum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v2, v0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v1, v0, v5, s4
+; GFX9-NEXT: v_pk_max_f16 v1, v2, v1
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v2, v0 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
+ %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %c, <2 x half> %max0)
+ ret <2 x half> %max1
+}
+
+define <2 x half> @v_fmaximum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f16_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f16_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v1, v0, v5, s4
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
+ %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c)
+ ret <2 x half> %max1
+}
+
+define <2 x half> @v_fmaximum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f16__fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
+; GFX12-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f16__fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0
+; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v1
+; GFX9-NEXT: v_pk_max_f16 v3, v3, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2
+; GFX9-NEXT: v_perm_b32 v1, v4, v0, s4
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, |v2| src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc
+; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a)
+ %b.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %b)
+ %c.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %c)
+ %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a.fabs, <2 x half> %b.fabs)
+ %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c.fabs)
+ ret <2 x half> %max1
+}
+
+define <2 x half> @v_fmaximum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f16__fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f16__fneg_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v3, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v1, v0, v5, s4
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v2 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v5, -v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg <2 x half> %a
+ %b.fneg = fneg <2 x half> %b
+ %c.fneg = fneg <2 x half> %c
+ %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a.fneg, <2 x half> %b.fneg)
+ %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c.fneg)
+ ret <2 x half> %max1
+}
+
+define <2 x half> @v_fmaximum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f16__inlineimm1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, 2.0 op_sel_hi:[1,0]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f16__inlineimm1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v2, v0, 2.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v2, v3, v0, s4
+; GFX9-NEXT: v_pk_max_f16 v2, v2, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v3, v1 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> <half 2.0, half 2.0>)
+ %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c)
+ ret <2 x half> %max1
+}
+
+define <2 x half> @v_fmaximum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b) {
+; GFX12-LABEL: v_fmaximum3_v2f16__inlineimm2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, 4.0 op_sel_hi:[1,0]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f16__inlineimm2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v1, v0, v4, s4
+; GFX9-NEXT: v_pk_max_f16 v1, v1, 4.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
+ %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> <half 4.0, half 4.0>)
+ ret <2 x half> %max1
+}
+
+define <3 x half> @v_fmaximum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v3f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v4, v0
+; GFX12-NEXT: v_pk_maximum_f16 v1, v5, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v6, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_pk_max_f16 v2, v1, v3
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
+; GFX9-NEXT: v_pk_max_f16 v1, v5, v1
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v6
+; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX9-NEXT: v_pk_max_f16 v2, v4, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b)
+ %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %c, <3 x half> %max0)
+ ret <3 x half> %max1
+}
+
+define <3 x half> @v_fmaximum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v3f16_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v4
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f16_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v6, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_pk_max_f16 v2, v1, v3
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v5
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v5
+; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX9-NEXT: v_pk_max_f16 v2, v2, v4
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v8, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b)
+ %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> %c)
+ ret <3 x half> %max1
+}
+
+define <3 x half> @v_fmaximum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v3f16__fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
+; GFX12-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
+; GFX12-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3
+; GFX12-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v5
+; GFX12-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v4
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f16__fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v1
+; GFX9-NEXT: v_and_b32_e32 v9, 0x7fff7fff, v3
+; GFX9-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v0
+; GFX9-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v2
+; GFX9-NEXT: v_pk_max_f16 v7, v7, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX9-NEXT: v_mov_b32_e32 v12, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_pk_max_f16 v6, v6, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3|
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v7, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v4
+; GFX9-NEXT: v_perm_b32 v2, v8, v0, s4
+; GFX9-NEXT: v_pk_max_f16 v2, v2, v11
+; GFX9-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v4| src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_perm_b32 v6, v9, v1, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
+; GFX9-NEXT: v_pk_max_f16 v6, v6, v10
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5|
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %a)
+ %b.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %b)
+ %c.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %c)
+ %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a.fabs, <3 x half> %b.fabs)
+ %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> %c.fabs)
+ ret <3 x half> %max1
+}
+
+define <3 x half> @v_fmaximum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v3f16__fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2 neg_lo:[1,1] neg_hi:[1,1]
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f16__fneg_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_pk_max_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5
+; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX9-NEXT: v_pk_max_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg <3 x half> %a
+ %b.fneg = fneg <3 x half> %b
+ %c.fneg = fneg <3 x half> %c
+ %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a.fneg, <3 x half> %b.fneg)
+ %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> %c.fneg)
+ ret <3 x half> %max1
+}
+
+define <3 x half> @v_fmaximum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v3f16__inlineimm1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, 2.0 op_sel_hi:[1,0]
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, 2.0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f16__inlineimm1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX9-NEXT: v_pk_max_f16 v7, v1, 2.0
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc
+; GFX9-NEXT: s_mov_b32 s5, 0x5040100
+; GFX9-NEXT: v_perm_b32 v4, v5, v0, s5
+; GFX9-NEXT: v_pk_max_f16 v4, v4, v2
+; GFX9-NEXT: s_movk_i32 s4, 0x7e00
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
+; GFX9-NEXT: v_pack_b32_f16 v7, v1, s4
+; GFX9-NEXT: v_pk_max_f16 v7, v7, v3
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc
+; GFX9-NEXT: v_perm_b32 v0, v5, v0, s5
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> <half 2.0, half 2.0, half 2.0>)
+ %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> %c)
+ ret <3 x half> %max1
+}
+
+define <3 x half> @v_fmaximum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) {
+; GFX12-LABEL: v_fmaximum3_v3f16__inlineimm2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, 4.0 op_sel_hi:[1,0]
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, 4.0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f16__inlineimm2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v4, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-NEXT: v_pk_max_f16 v2, v1, v3
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4
+; GFX9-NEXT: v_pk_max_f16 v1, v1, 4.0
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v4
+; GFX9-NEXT: v_perm_b32 v2, v0, v6, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b)
+ %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> <half 4.0, half 4.0, half 4.0>)
+ ret <3 x half> %max1
+}
+
+define <4 x half> @v_fmaximum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v4f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v4, v0
+; GFX12-NEXT: v_pk_maximum_f16 v1, v5, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v4f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v6, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_pk_max_f16 v2, v1, v3
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v2, v1, v6, s4
+; GFX9-NEXT: v_pk_max_f16 v2, v5, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT: v_pk_max_f16 v2, v4, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
+; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b)
+ %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %c, <4 x half> %max0)
+ ret <4 x half> %max1
+}
+
+define <4 x half> @v_fmaximum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v4f16_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v4
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v4f16_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v6, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_pk_max_f16 v2, v1, v3
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v2, v1, v6, s4
+; GFX9-NEXT: v_pk_max_f16 v2, v2, v5
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT: v_pk_max_f16 v2, v2, v4
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v8, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
+; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b)
+ %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %max0, <4 x half> %c)
+ ret <4 x half> %max1
+}
+
+define <4 x half> @v_fmaximum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v4f16__fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
+; GFX12-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
+; GFX12-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3
+; GFX12-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v5
+; GFX12-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v4
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v4f16__fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v0
+; GFX9-NEXT: v_and_b32_e32 v9, 0x7fff7fff, v2
+; GFX9-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v1
+; GFX9-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v3
+; GFX9-NEXT: v_pk_max_f16 v7, v7, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX9-NEXT: v_mov_b32_e32 v12, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_pk_max_f16 v6, v6, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v7, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3|
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v5
+; GFX9-NEXT: v_perm_b32 v2, v8, v1, s4
+; GFX9-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v4
+; GFX9-NEXT: v_pk_max_f16 v2, v2, v11
+; GFX9-NEXT: v_perm_b32 v6, v9, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_pk_max_f16 v6, v6, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5|
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc
+; GFX9-NEXT: v_perm_b32 v0, v7, v0, s4
+; GFX9-NEXT: v_perm_b32 v1, v3, v1, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %a)
+ %b.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %b)
+ %c.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %c)
+ %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a.fabs, <4 x half> %b.fabs)
+ %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %max0, <4 x half> %c.fabs)
+ ret <4 x half> %max1
+}
+
+define <4 x half> @v_fmaximum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v4f16__fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2 neg_lo:[1,1] neg_hi:[1,1]
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v4f16__fneg_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_pk_max_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v2, v1, v6, s4
+; GFX9-NEXT: v_pk_max_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, -v5 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT: v_pk_max_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
+; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg <4 x half> %a
+ %b.fneg = fneg <4 x half> %b
+ %c.fneg = fneg <4 x half> %c
+ %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a.fneg, <4 x half> %b.fneg)
+ %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %max0, <4 x half> %c.fneg)
+ ret <4 x half> %max1
+}
+
+define <4 x half> @v_fmaximum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v4f16__inlineimm1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, 2.0 op_sel_hi:[1,0]
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, 2.0 op_sel_hi:[1,0]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v4f16__inlineimm1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_pk_max_f16 v7, v1, 2.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v7
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v6, v8, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v4, v8, v1, s4
+; GFX9-NEXT: v_pk_max_f16 v4, v4, v3
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v8, v3 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_perm_b32 v8, v5, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4
+; GFX9-NEXT: v_pk_max_f16 v8, v8, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v8
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v9, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc
+; GFX9-NEXT: v_perm_b32 v0, v5, v0, s4
+; GFX9-NEXT: v_perm_b32 v1, v7, v1, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> <half 2.0, half 2.0, half 2.0, half 2.0>)
+ %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %max0, <4 x half> %c)
+ ret <4 x half> %max1
+}
+
+define <4 x half> @v_fmaximum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) {
+; GFX12-LABEL: v_fmaximum3_v4f16__inlineimm2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, 4.0 op_sel_hi:[1,0]
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, 4.0 op_sel_hi:[1,0]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v4f16__inlineimm2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v4, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-NEXT: v_pk_max_f16 v2, v1, v3
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v2, v1, v4, s4
+; GFX9-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX9-NEXT: v_perm_b32 v2, v0, v6, s4
+; GFX9-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4
+; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b)
+ %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %max0, <4 x half> <half 4.0, half 4.0, half 4.0, half 4.0>)
+ ret <4 x half> %max1
+}
+
+define double @v_fmaximum3_f64(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call double @llvm.maximum.f64(double %a, double %b)
+ %max1 = call double @llvm.maximum.f64(double %max0, double %c)
+ ret double %max1
+}
+
+define double @v_fmaximum3_f64_commute(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[4:5], v[0:1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[0:1]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call double @llvm.maximum.f64(double %a, double %b)
+ %max1 = call double @llvm.maximum.f64(double %c, double %max0)
+ ret double %max1
+}
+
+define amdgpu_ps <2 x i32> @s_fmaximum3_f64(double inreg %a, double inreg %b, double inreg %c) {
+; GFX12-LABEL: s_fmaximum3_f64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f64 v[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], s[4:5]
+; GFX12-NEXT: v_readfirstlane_b32 s0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_readfirstlane_b32 s1, v1
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_fmaximum3_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], s[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[4:5], v[0:1]
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
+; GFX9-NEXT: v_readfirstlane_b32 s1, v0
+; GFX9-NEXT: ; return to shader part epilog
+ %max0 = call double @llvm.maximum.f64(double %a, double %b)
+ %max1 = call double @llvm.maximum.f64(double %max0, double %c)
+ %cast = bitcast double %max1 to <2 x i32>
+ %elt0 = extractelement <2 x i32> %cast, i32 0
+ %elt1 = extractelement <2 x i32> %cast, i32 1
+ %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0)
+ %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1)
+ %insert.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
+ %insert.1 = insertelement <2 x i32> %insert.0, i32 %readlane1, i32 1
+ ret <2 x i32> %insert.1
+}
+
+define double @v_fmaximum3_f64_fabs0(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_fabs0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum_f64 v[0:1], |v[0:1]|, v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_fabs0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f64 v[6:7], |v[0:1]|, v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call double @llvm.fabs.f64(double %a)
+ %max0 = call double @llvm.maximum.f64(double %a.fabs, double %b)
+ %max1 = call double @llvm.maximum.f64(double %max0, double %c)
+ ret double %max1
+}
+
+define double @v_fmaximum3_f64_fabs1(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_fabs1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], |v[2:3]|
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_fabs1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], |v[2:3]|
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[2:3]|
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %b.fabs = call double @llvm.fabs.f64(double %b)
+ %max0 = call double @llvm.maximum.f64(double %a, double %b.fabs)
+ %max1 = call double @llvm.maximum.f64(double %max0, double %c)
+ ret double %max1
+}
+
+define double @v_fmaximum3_f64_fabs2(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_fabs2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], |v[4:5]|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_fabs2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], |v[4:5]|
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]|
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %c.fabs = call double @llvm.fabs.f64(double %c)
+ %max0 = call double @llvm.maximum.f64(double %a, double %b)
+ %max1 = call double @llvm.maximum.f64(double %max0, double %c.fabs)
+ ret double %max1
+}
+
+define double @v_fmaximum3_f64_fabs_all(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum_f64 v[0:1], |v[0:1]|, |v[2:3]|
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], |v[4:5]|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f64 v[6:7], |v[0:1]|, |v[2:3]|
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[2:3]|
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], |v[4:5]|
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]|
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call double @llvm.fabs.f64(double %a)
+ %b.fabs = call double @llvm.fabs.f64(double %b)
+ %c.fabs = call double @llvm.fabs.f64(double %c)
+ %max0 = call double @llvm.maximum.f64(double %a.fabs, double %b.fabs)
+ %max1 = call double @llvm.maximum.f64(double %max0, double %c.fabs)
+ ret double %max1
+}
+
+define double @v_fmaximum3_f64_fneg_all(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum_f64 v[0:1], -v[0:1], -v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], -v[4:5]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_fneg_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f64 v[6:7], -v[0:1], -v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], -v[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg double %a
+ %b.fneg = fneg double %b
+ %c.fneg = fneg double %c
+ %max0 = call double @llvm.maximum.f64(double %a.fneg, double %b.fneg)
+ %max1 = call double @llvm.maximum.f64(double %max0, double %c.fneg)
+ ret double %max1
+}
+
+define double @v_fmaximum3_f64_fneg_fabs_all(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_fneg_fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum_f64 v[0:1], -|v[0:1]|, -|v[2:3]|
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], -|v[4:5]|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_fneg_fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f64 v[6:7], -|v[0:1]|, -|v[2:3]|
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -|v[0:1]|, -|v[2:3]|
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], -|v[4:5]|
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -|v[4:5]|
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call double @llvm.fabs.f64(double %a)
+ %b.fabs = call double @llvm.fabs.f64(double %b)
+ %c.fabs = call double @llvm.fabs.f64(double %c)
+ %a.fneg.fabs = fneg double %a.fabs
+ %b.fneg.fabs = fneg double %b.fabs
+ %c.fneg.fabs = fneg double %c.fabs
+ %max0 = call double @llvm.maximum.f64(double %a.fneg.fabs, double %b.fneg.fabs)
+ %max1 = call double @llvm.maximum.f64(double %max0, double %c.fneg.fabs)
+ ret double %max1
+}
+
+define double @v_fmaximum3_f64_fneg0(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_fneg0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum_f64 v[0:1], -v[0:1], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_fneg0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f64 v[6:7], -v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg double %a
+ %max0 = call double @llvm.maximum.f64(double %a.fneg, double %b)
+ %max1 = call double @llvm.maximum.f64(double %max0, double %c)
+ ret double %max1
+}
+
+define double @v_fmaximum3_f64_fneg1(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_fneg1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], -v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_fneg1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], -v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %b.fneg = fneg double %b
+ %max0 = call double @llvm.maximum.f64(double %a, double %b.fneg)
+ %max1 = call double @llvm.maximum.f64(double %max0, double %c)
+ ret double %max1
+}
+
+define double @v_fmaximum3_f64_fneg2(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_fneg2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], -v[4:5]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_fneg2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], -v[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %c.fneg = fneg double %c
+ %max0 = call double @llvm.maximum.f64(double %a, double %b)
+ %max1 = call double @llvm.maximum.f64(double %max0, double %c.fneg)
+ ret double %max1
+}
+
+define double @v_fmaximum3_f64_const0(double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_const0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum_f64 v[0:1], 0x40200000, v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_const0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0
+; GFX9-NEXT: s_mov_b32 s5, 0x40200000
+; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], s[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call double @llvm.maximum.f64(double 8.0, double %b)
+ %max1 = call double @llvm.maximum.f64(double %max0, double %c)
+ ret double %max1
+}
+
+define double @v_fmaximum3_f64__const2(double %a, double %b) {
+; GFX12-LABEL: v_fmaximum3_f64__const2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_maximum_f64 v[0:1], 0x40200000, v[0:1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64__const2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-NEXT: s_mov_b32 s4, 0
+; GFX9-NEXT: s_mov_b32 s5, 0x40200000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], s[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call double @llvm.maximum.f64(double %a, double %b)
+ %max1 = call double @llvm.maximum.f64(double %max0, double 8.0)
+ ret double %max1
+}
+
+define double @v_fmaximum3_f64_inlineimm0(double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_inlineimm0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], 4.0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_inlineimm0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], 4.0
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call double @llvm.maximum.f64(double 4.0, double %b)
+ %max1 = call double @llvm.maximum.f64(double %max0, double %c)
+ ret double %max1
+}
+
+define double @v_fmaximum3_f64__inlineimm(double %a, double %b) {
+; GFX12-LABEL: v_fmaximum3_f64__inlineimm:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], 4.0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64__inlineimm:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], 4.0
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call double @llvm.maximum.f64(double %a, double %b)
+ %max1 = call double @llvm.maximum.f64(double %max0, double 4.0)
+ ret double %max1
+}
+
+define double @v_fmaximum3_f64_const1_const2(double %a) {
+; GFX12-LABEL: v_fmaximum3_f64_const1_const2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum_f64 v[0:1], 0x40200000, v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_maximum_f64 v[0:1], 0x40300000, v[0:1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_const1_const2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0
+; GFX9-NEXT: s_mov_b32 s5, 0x40200000
+; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], s[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7ff80000
+; GFX9-NEXT: s_mov_b32 s4, 0
+; GFX9-NEXT: s_mov_b32 s5, 0x40300000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], s[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call double @llvm.maximum.f64(double %a, double 8.0)
+ %max1 = call double @llvm.maximum.f64(double %max0, double 16.0)
+ ret double %max1
+}
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
index eef271e..7481fff 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
@@ -1,98 +1,3251 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
-
-; GCN-LABEL: {{^}}test_fminimum3_olt_0_f32:
-; GCN: buffer_load_b32 [[REGC:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGB:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGA:v[0-9]+]]
-; GCN: v_minimum3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; GCN: buffer_store_b32 [[RESULT]],
-define amdgpu_kernel void @test_fminimum3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
- %a = load volatile float, ptr addrspace(1) %aptr, align 4
- %b = load volatile float, ptr addrspace(1) %bptr, align 4
- %c = load volatile float, ptr addrspace(1) %cptr, align 4
- %f0 = call float @llvm.minimum.f32(float %a, float %b)
- %f1 = call float @llvm.minimum.f32(float %f0, float %c)
- store float %f1, ptr addrspace(1) %out, align 4
- ret void
-}
-
-; Commute operand of second fminimum
-; GCN-LABEL: {{^}}test_fminimum3_olt_1_f32:
-; GCN: buffer_load_b32 [[REGB:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGA:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGC:v[0-9]+]]
-; GCN: v_minimum3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; GCN: buffer_store_b32 [[RESULT]],
-define amdgpu_kernel void @test_fminimum3_olt_1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
- %a = load volatile float, ptr addrspace(1) %aptr, align 4
- %b = load volatile float, ptr addrspace(1) %bptr, align 4
- %c = load volatile float, ptr addrspace(1) %cptr, align 4
- %f0 = call float @llvm.minimum.f32(float %a, float %b)
- %f1 = call float @llvm.minimum.f32(float %c, float %f0)
- store float %f1, ptr addrspace(1) %out, align 4
- ret void
-}
-
-; GCN-LABEL: {{^}}test_fminimum3_olt_0_f16:
-; GCN: buffer_load_u16 [[REGC:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGB:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGA:v[0-9]+]]
-; GCN: v_minimum3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; GCN: buffer_store_b16 [[RESULT]],
-define amdgpu_kernel void @test_fminimum3_olt_0_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
- %a = load volatile half, ptr addrspace(1) %aptr, align 2
- %b = load volatile half, ptr addrspace(1) %bptr, align 2
- %c = load volatile half, ptr addrspace(1) %cptr, align 2
- %f0 = call half @llvm.minimum.f16(half %a, half %b)
- %f1 = call half @llvm.minimum.f16(half %f0, half %c)
- store half %f1, ptr addrspace(1) %out, align 2
- ret void
-}
-
-; GCN-LABEL: {{^}}test_fminimum3_olt_1_f16:
-; GCN: buffer_load_u16 [[REGA:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGB:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGC:v[0-9]+]]
-; GCN: v_minimum3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGA]], [[REGB]]
-; GCN: buffer_store_b16 [[RESULT]],
-define amdgpu_kernel void @test_fminimum3_olt_1_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
- %a = load volatile half, ptr addrspace(1) %aptr, align 2
- %b = load volatile half, ptr addrspace(1) %bptr, align 2
- %c = load volatile half, ptr addrspace(1) %cptr, align 2
- %f0 = call half @llvm.minimum.f16(half %a, half %b)
- %f1 = call half @llvm.minimum.f16(half %c, half %f0)
- store half %f1, ptr addrspace(1) %out, align 2
- ret void
-}
-
-; Checks whether the test passes; performMinMaxCombine() should not optimize vector patterns of minimum3
-; since there are no pack instructions for fminimum3.
-; GCN-LABEL: {{^}}no_fminimum3_v2f16:
-; GCN: v_pk_minimum_f16 v0, v0, v1
-; GCN: v_pk_minimum_f16 v0, v2, v0
-; GCN: v_pk_minimum_f16 v0, v0, v3
-; GCN-NEXT: s_setpc_b64
-define <2 x half> @no_fminimum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) {
-entry:
- %min = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
- %min1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %c, <2 x half> %min)
- %res = call <2 x half> @llvm.minimum.v2f16(<2 x half> %min1, <2 x half> %d)
- ret <2 x half> %res
-}
-
-; GCN-LABEL: {{^}}no_fminimum3_olt_0_f64:
-; GCN-COUNT-2: v_minimum_f64
-define amdgpu_kernel void @no_fminimum3_olt_0_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
- %a = load volatile double, ptr addrspace(1) %aptr, align 4
- %b = load volatile double, ptr addrspace(1) %bptr, align 4
- %c = load volatile double, ptr addrspace(1) %cptr, align 4
- %f0 = call double @llvm.minimum.f64(double %a, double %b)
- %f1 = call double @llvm.minimum.f64(double %f0, double %c)
- store double %f1, ptr addrspace(1) %out, align 4
- ret void
-}
-
-declare double @llvm.minimum.f64(double, double)
-declare float @llvm.minimum.f32(float, float)
-declare half @llvm.minimum.f16(half, half)
-declare <2 x half> @llvm.minimum.v2f16(<2 x half>, <2 x half>)
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+
+define float @v_fminimum3_f32(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f32_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f32_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.minimum.f32(float %a, float %b)
+ %max1 = call float @llvm.minimum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fminimum3_f32_commute(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v2, v0, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f32_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f32_e32 v1, v2, v0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.minimum.f32(float %a, float %b)
+ %max1 = call float @llvm.minimum.f32(float %c, float %max0)
+ ret float %max1
+}
+
+define amdgpu_ps i32 @s_fminimum3_f32(float inreg %a, float inreg %b, float inreg %c) {
+; GFX12-LABEL: s_fminimum3_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_minimum3_f32 v0, s0, s1, v0
+; GFX12-NEXT: v_readfirstlane_b32 s0, v0
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_fminimum3_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_min_f32_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_min_f32_e32 v1, s2, v0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s2, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+ %max0 = call float @llvm.minimum.f32(float %a, float %b)
+ %max1 = call float @llvm.minimum.f32(float %max0, float %c)
+ %cast = bitcast float %max1 to i32
+ %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
+ ret i32 %readfirstlane
+}
+
+define float @v_fminimum3_f32_fabs0(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_fabs0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, |v0|, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_fabs0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f32_e64 v3, |v0|, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v0|, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f32_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call float @llvm.fabs.f32(float %a)
+ %max0 = call float @llvm.minimum.f32(float %a.fabs, float %b)
+ %max1 = call float @llvm.minimum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fminimum3_f32_fabs1(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_fabs1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v0, |v1|, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_fabs1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f32_e64 v3, v0, |v1|
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v1|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f32_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %b.fabs = call float @llvm.fabs.f32(float %b)
+ %max0 = call float @llvm.minimum.f32(float %a, float %b.fabs)
+ %max1 = call float @llvm.minimum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fminimum3_f32_fabs2(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_fabs2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v0, v1, |v2|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_fabs2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f32_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f32_e64 v1, v0, |v2|
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %c.fabs = call float @llvm.fabs.f32(float %c)
+ %max0 = call float @llvm.minimum.f32(float %a, float %b)
+ %max1 = call float @llvm.minimum.f32(float %max0, float %c.fabs)
+ ret float %max1
+}
+
+define float @v_fminimum3_f32_fabs_all(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, |v0|, |v1|, |v2|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f32_e64 v3, |v0|, |v1|
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v1|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f32_e64 v1, v0, |v2|
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call float @llvm.fabs.f32(float %a)
+ %b.fabs = call float @llvm.fabs.f32(float %b)
+ %c.fabs = call float @llvm.fabs.f32(float %c)
+ %max0 = call float @llvm.minimum.f32(float %a.fabs, float %b.fabs)
+ %max1 = call float @llvm.minimum.f32(float %max0, float %c.fabs)
+ ret float %max1
+}
+
+define float @v_fminimum3_f32_fneg_all(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, -v0, -v1, -v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_fneg_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f32_e64 v3, -v0, -v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v0, -v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f32_e64 v1, v0, -v2
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg float %a
+ %b.fneg = fneg float %b
+ %c.fneg = fneg float %c
+ %max0 = call float @llvm.minimum.f32(float %a.fneg, float %b.fneg)
+ %max1 = call float @llvm.minimum.f32(float %max0, float %c.fneg)
+ ret float %max1
+}
+
+define float @v_fminimum3_f32_fneg_fabs_all(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_fneg_fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, -|v0|, -|v1|, -|v2|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_fneg_fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f32_e64 v3, -|v0|, -|v1|
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -|v0|, -|v1|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f32_e64 v1, v0, -|v2|
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -|v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call float @llvm.fabs.f32(float %a)
+ %b.fabs = call float @llvm.fabs.f32(float %b)
+ %c.fabs = call float @llvm.fabs.f32(float %c)
+ %a.fneg.fabs = fneg float %a.fabs
+ %b.fneg.fabs = fneg float %b.fabs
+ %c.fneg.fabs = fneg float %c.fabs
+ %max0 = call float @llvm.minimum.f32(float %a.fneg.fabs, float %b.fneg.fabs)
+ %max1 = call float @llvm.minimum.f32(float %max0, float %c.fneg.fabs)
+ ret float %max1
+}
+
+define float @v_fminimum3_f32_fneg0(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_fneg0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, -v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_fneg0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f32_e64 v3, -v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f32_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg float %a
+ %max0 = call float @llvm.minimum.f32(float %a.fneg, float %b)
+ %max1 = call float @llvm.minimum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fminimum3_f32_fneg1(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_fneg1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v0, -v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_fneg1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f32_e64 v3, v0, -v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f32_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %b.fneg = fneg float %b
+ %max0 = call float @llvm.minimum.f32(float %a, float %b.fneg)
+ %max1 = call float @llvm.minimum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fminimum3_f32_fneg2(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_fneg2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v0, v1, -v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_fneg2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f32_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f32_e64 v1, v0, -v2
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %c.fneg = fneg float %c
+ %max0 = call float @llvm.minimum.f32(float %a, float %b)
+ %max1 = call float @llvm.minimum.f32(float %max0, float %c.fneg)
+ ret float %max1
+}
+
+define float @v_fminimum3_f32_const0(float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_const0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v0, 0x41000000, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_const0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f32_e32 v2, 0x41000000, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_min_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.minimum.f32(float 8.0, float %b)
+ %max1 = call float @llvm.minimum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fminimum3_f32__const2(float %a, float %b) {
+; GFX12-LABEL: v_fminimum3_f32__const2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v0, v1, 0x41000000
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32__const2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_min_f32_e32 v1, 0x41000000, v0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.minimum.f32(float %a, float %b)
+ %max1 = call float @llvm.minimum.f32(float %max0, float 8.0)
+ ret float %max1
+}
+
+define float @v_fminimum3_f32_inlineimm0(float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_inlineimm0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v0, 4.0, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_inlineimm0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f32_e32 v2, 4.0, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_min_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.minimum.f32(float 4.0, float %b)
+ %max1 = call float @llvm.minimum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fminimum3_f32__inlineimm(float %a, float %b) {
+; GFX12-LABEL: v_fminimum3_f32__inlineimm:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v0, v1, 4.0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32__inlineimm:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_min_f32_e32 v1, 4.0, v0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.minimum.f32(float %a, float %b)
+ %max1 = call float @llvm.minimum.f32(float %max0, float 4.0)
+ ret float %max1
+}
+
+define float @v_fminimum3_f32_const1_const2(float %a) {
+; GFX12-LABEL: v_fminimum3_f32_const1_const2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s0, 0x41000000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_minimum3_f32 v0, v0, s0, 0x41800000
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_const1_const2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f32_e32 v1, 0x41000000, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_min_f32_e32 v1, 0x41800000, v0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.minimum.f32(float %a, float 8.0)
+ %max1 = call float @llvm.minimum.f32(float %max0, float 16.0)
+ ret float %max1
+}
+
+define <2 x float> @v_fminimum3_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fminimum3_v2f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v4, v0, v2
+; GFX12-NEXT: v_minimum3_f32 v1, v5, v1, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v4, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v5, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v5, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v5, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v5, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b)
+ %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %c, <2 x float> %max0)
+ ret <2 x float> %max1
+}
+
+define <2 x float> @v_fminimum3_v2f32_commute(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fminimum3_v2f32_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v0, v2, v4
+; GFX12-NEXT: v_minimum3_f32 v1, v1, v3, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f32_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b)
+ %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %max0, <2 x float> %c)
+ ret <2 x float> %max1
+}
+
+define <2 x float> @v_fminimum3_v2f32__fabs_all(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fminimum3_v2f32__fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, |v0|, |v2|, |v4|
+; GFX12-NEXT: v_minimum3_f32 v1, |v1|, |v3|, |v5|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f32__fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, |v3|
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], |v1|, |v3|
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v7, |v6|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v1|, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v6, |v1|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v3|, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, |v3|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, |v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], |v0|, |v2|
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, |v3|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v0|, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, |v0|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v2|, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, |v2|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v0, |v4|
+; GFX9-NEXT: v_cndmask_b32_e64 v2, |v4|, v0, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v4|
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v4|, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, |v4|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, |v5|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, |v5|, v1, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, |v5|
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v5|, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, |v5|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
+ %b.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %b)
+ %c.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %c)
+ %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a.fabs, <2 x float> %b.fabs)
+ %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %max0, <2 x float> %c.fabs)
+ ret <2 x float> %max1
+}
+
+define <2 x float> @v_fminimum3_v2f32__fneg_all(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fminimum3_v2f32__fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, -v0, -v2, -v4
+; GFX12-NEXT: v_minimum3_f32 v1, -v1, -v3, -v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f32__fneg_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, -v1, -v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], -v1, -v3
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v7, -v6, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v1, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v6, -v1, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v3, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -v3, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, -v0, -v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], -v0, -v2
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, -v3, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v0, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, -v0, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v2, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -v2, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v0, -v4
+; GFX9-NEXT: v_cndmask_b32_e64 v2, -v4, v0, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v4
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v4, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -v4, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, -v5
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, -v5, v1, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, -v5
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v5, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -v5, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg <2 x float> %a
+ %b.fneg = fneg <2 x float> %b
+ %c.fneg = fneg <2 x float> %c
+ %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a.fneg, <2 x float> %b.fneg)
+ %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %max0, <2 x float> %c.fneg)
+ ret <2 x float> %max1
+}
+
+define <2 x float> @v_fminimum3_v2f32__inlineimm1(<2 x float> %a, <2 x float> %c) {
+; GFX12-LABEL: v_fminimum3_v2f32__inlineimm1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v0, 2.0, v2
+; GFX12-NEXT: v_minimum3_f32 v1, v1, 2.0, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f32__inlineimm1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 2.0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v4, 2.0, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 2.0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v4, 2.0, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> <float 2.0, float 2.0>)
+ %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %max0, <2 x float> %c)
+ ret <2 x float> %max1
+}
+
+define <2 x float> @v_fminimum3_v2f32__inlineimm2(<2 x float> %a, <2 x float> %b) {
+; GFX12-LABEL: v_fminimum3_v2f32__inlineimm2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v0, v2, 4.0
+; GFX12-NEXT: v_minimum3_f32 v1, v1, v3, 4.0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f32__inlineimm2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 4.0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v2, 4.0, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 4.0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v2, 4.0, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b)
+ %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %max0, <2 x float> <float 4.0, float 4.0>)
+ ret <2 x float> %max1
+}
+
+define <3 x float> @v_fminimum3_v3f32(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
+; GFX12-LABEL: v_fminimum3_v3f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v6, v0, v3
+; GFX12-NEXT: v_minimum3_f32 v1, v7, v1, v4
+; GFX12-NEXT: v_minimum3_f32 v2, v8, v2, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v5, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v6, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v6, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v6, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v6, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v7, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v7, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v7, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v7, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v7, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v8, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v8, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v8, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v8, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v8, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b)
+ %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %c, <3 x float> %max0)
+ ret <3 x float> %max1
+}
+
+define <3 x float> @v_fminimum3_v3f32_commute(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
+; GFX12-LABEL: v_fminimum3_v3f32_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v0, v3, v6
+; GFX12-NEXT: v_minimum3_f32 v1, v1, v4, v7
+; GFX12-NEXT: v_minimum3_f32 v2, v2, v5, v8
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f32_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v5, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v6, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v7, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v2, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v2, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v8, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b)
+ %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %max0, <3 x float> %c)
+ ret <3 x float> %max1
+}
+
+define <3 x float> @v_fminimum3_v3f32__fabs_all(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
+; GFX12-LABEL: v_fminimum3_v3f32__fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, |v0|, |v3|, |v6|
+; GFX12-NEXT: v_minimum3_f32 v1, |v1|, |v4|, |v7|
+; GFX12-NEXT: v_minimum3_f32 v2, |v2|, |v5|, |v8|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f32__fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, |v2|, |v5|
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v5, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], |v2|, |v5|
+; GFX9-NEXT: v_cndmask_b32_e64 v9, v10, |v9|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v2|, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v9, |v2|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v5|, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, |v5|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, |v4|
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], |v1|, |v4|
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v10, |v5|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v1|, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, |v1|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v4|, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, |v4|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, |v3|
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], |v0|, |v3|
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, |v4|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v0|, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, |v0|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v3|, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, |v3|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v0, |v6|
+; GFX9-NEXT: v_cndmask_b32_e64 v3, |v6|, v0, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v6|
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v6|, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, |v6|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, |v7|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, |v7|, v1, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, |v7|
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v7|, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, |v7|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v2, |v8|
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, |v8|, v2, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v2, |v8|
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v8|, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, |v8|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %a)
+ %b.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %b)
+ %c.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %c)
+ %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a.fabs, <3 x float> %b.fabs)
+ %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %max0, <3 x float> %c.fabs)
+ ret <3 x float> %max1
+}
+
+define <3 x float> @v_fminimum3_v3f32__fneg_all(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
+; GFX12-LABEL: v_fminimum3_v3f32__fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, -v0, -v3, -v6
+; GFX12-NEXT: v_minimum3_f32 v1, -v1, -v4, -v7
+; GFX12-NEXT: v_minimum3_f32 v2, -v2, -v5, -v8
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f32__fneg_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, -v2, -v5
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v5, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], -v2, -v5
+; GFX9-NEXT: v_cndmask_b32_e64 v9, v10, -v9, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v2, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v9, -v2, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v5, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -v5, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, -v1, -v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], -v1, -v4
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v10, -v5, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v1, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, -v1, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v4, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -v4, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, -v0, -v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], -v0, -v3
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, -v4, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v0, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, -v0, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v3, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -v3, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v0, -v6
+; GFX9-NEXT: v_cndmask_b32_e64 v3, -v6, v0, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v6
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v6, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -v6, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, -v7
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, -v7, v1, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, -v7
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v7, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -v7, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v2, -v8
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, -v8, v2, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v2, -v8
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v8, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -v8, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg <3 x float> %a
+ %b.fneg = fneg <3 x float> %b
+ %c.fneg = fneg <3 x float> %c
+ %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a.fneg, <3 x float> %b.fneg)
+ %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %max0, <3 x float> %c.fneg)
+ ret <3 x float> %max1
+}
+
+define <3 x float> @v_fminimum3_v3f32__inlineimm1(<3 x float> %a, <3 x float> %c) {
+; GFX12-LABEL: v_fminimum3_v3f32__inlineimm1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v0, 2.0, v3
+; GFX12-NEXT: v_minimum3_f32 v1, v1, 2.0, v4
+; GFX12-NEXT: v_minimum3_f32 v2, v2, 2.0, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f32__inlineimm1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 2.0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v6, 2.0, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v6, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 2.0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v6, 2.0, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 2.0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v6, 2.0, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> <float 2.0, float 2.0, float 2.0>)
+ %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %max0, <3 x float> %c)
+ ret <3 x float> %max1
+}
+
+define <3 x float> @v_fminimum3_v3f32__inlineimm2(<3 x float> %a, <3 x float> %b) {
+; GFX12-LABEL: v_fminimum3_v3f32__inlineimm2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v0, v3, 4.0
+; GFX12-NEXT: v_minimum3_f32 v1, v1, v4, 4.0
+; GFX12-NEXT: v_minimum3_f32 v2, v2, v5, 4.0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f32__inlineimm2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 4.0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 4.0, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 4.0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 4.0, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 4.0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 4.0, v2, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b)
+ %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %max0, <3 x float> <float 4.0, float 4.0, float 4.0>)
+ ret <3 x float> %max1
+}
+
+define half @v_fminimum3_f16(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f16_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f16_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call half @llvm.minimum.f16(half %a, half %b)
+ %max1 = call half @llvm.minimum.f16(half %max0, half %c)
+ ret half %max1
+}
+
+define half @v_fminimum3_f16_commute(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f16 v0, v2, v0, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f16_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f16_e32 v1, v2, v0
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call half @llvm.minimum.f16(half %a, half %b)
+ %max1 = call half @llvm.minimum.f16(half %c, half %max0)
+ ret half %max1
+}
+
+define amdgpu_ps i32 @s_fminimum3_f16(half inreg %a, half inreg %b, half inreg %c) {
+; GFX12-LABEL: s_fminimum3_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_minimum3_f16 v0, s0, s1, v0
+; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_readfirstlane_b32 s0, v0
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_fminimum3_f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_min_f16_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_min_f16_e32 v1, s2, v0
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s2, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+ %max0 = call half @llvm.minimum.f16(half %a, half %b)
+ %max1 = call half @llvm.minimum.f16(half %max0, half %c)
+ %cast = bitcast half %max1 to i16
+ %zext = zext i16 %cast to i32
+ %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+ ret i32 %readfirstlane
+}
+
+define half @v_fminimum3_f16_fabs0(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_fabs0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f16 v0, |v0|, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_fabs0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f16_e64 v3, |v0|, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f16_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call half @llvm.fabs.f16(half %a)
+ %max0 = call half @llvm.minimum.f16(half %a.fabs, half %b)
+ %max1 = call half @llvm.minimum.f16(half %max0, half %c)
+ ret half %max1
+}
+
+define half @v_fminimum3_f16_fabs1(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_fabs1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f16 v0, v0, |v1|, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_fabs1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f16_e64 v3, v0, |v1|
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v1|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f16_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %b.fabs = call half @llvm.fabs.f16(half %b)
+ %max0 = call half @llvm.minimum.f16(half %a, half %b.fabs)
+ %max1 = call half @llvm.minimum.f16(half %max0, half %c)
+ ret half %max1
+}
+
+define half @v_fminimum3_f16_fabs2(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_fabs2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, |v2|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_fabs2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f16_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f16_e64 v1, v0, |v2|
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %c.fabs = call half @llvm.fabs.f16(half %c)
+ %max0 = call half @llvm.minimum.f16(half %a, half %b)
+ %max1 = call half @llvm.minimum.f16(half %max0, half %c.fabs)
+ ret half %max1
+}
+
+define half @v_fminimum3_f16_fabs_all(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f16 v0, |v0|, |v1|, |v2|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f16_e64 v3, |v0|, |v1|
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f16_e64 v1, v0, |v2|
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call half @llvm.fabs.f16(half %a)
+ %b.fabs = call half @llvm.fabs.f16(half %b)
+ %c.fabs = call half @llvm.fabs.f16(half %c)
+ %max0 = call half @llvm.minimum.f16(half %a.fabs, half %b.fabs)
+ %max1 = call half @llvm.minimum.f16(half %max0, half %c.fabs)
+ ret half %max1
+}
+
+define half @v_fminimum3_f16_fneg_all(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f16 v0, -v0, -v1, -v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_fneg_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f16_e64 v3, -v0, -v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f16_e64 v1, v0, -v2
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg half %a
+ %b.fneg = fneg half %b
+ %c.fneg = fneg half %c
+ %max0 = call half @llvm.minimum.f16(half %a.fneg, half %b.fneg)
+ %max1 = call half @llvm.minimum.f16(half %max0, half %c.fneg)
+ ret half %max1
+}
+
+define half @v_fminimum3_f16_fneg_fabs_all(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_fneg_fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f16 v0, -|v0|, -|v1|, -|v2|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_fneg_fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f16_e64 v3, -|v0|, -|v1|
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -|v0|, -|v1|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f16_e64 v1, v0, -|v2|
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -|v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call half @llvm.fabs.f16(half %a)
+ %b.fabs = call half @llvm.fabs.f16(half %b)
+ %c.fabs = call half @llvm.fabs.f16(half %c)
+ %a.fneg.fabs = fneg half %a.fabs
+ %b.fneg.fabs = fneg half %b.fabs
+ %c.fneg.fabs = fneg half %c.fabs
+ %max0 = call half @llvm.minimum.f16(half %a.fneg.fabs, half %b.fneg.fabs)
+ %max1 = call half @llvm.minimum.f16(half %max0, half %c.fneg.fabs)
+ ret half %max1
+}
+
+define half @v_fminimum3_f16_fneg0(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_fneg0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f16 v0, -v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_fneg0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f16_e64 v3, -v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f16_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg half %a
+ %max0 = call half @llvm.minimum.f16(half %a.fneg, half %b)
+ %max1 = call half @llvm.minimum.f16(half %max0, half %c)
+ ret half %max1
+}
+
+define half @v_fminimum3_f16_fneg1(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_fneg1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f16 v0, v0, -v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_fneg1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f16_e64 v3, v0, -v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f16_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %b.fneg = fneg half %b
+ %max0 = call half @llvm.minimum.f16(half %a, half %b.fneg)
+ %max1 = call half @llvm.minimum.f16(half %max0, half %c)
+ ret half %max1
+}
+
+define half @v_fminimum3_f16_fneg2(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_fneg2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, -v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_fneg2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f16_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f16_e64 v1, v0, -v2
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %c.fneg = fneg half %c
+ %max0 = call half @llvm.minimum.f16(half %a, half %b)
+ %max1 = call half @llvm.minimum.f16(half %max0, half %c.fneg)
+ ret half %max1
+}
+
+define half @v_fminimum3_f16_const0(half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_const0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f16 v0, v0, 0x4800, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_const0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f16_e32 v2, 0x4800, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_min_f16_e32 v2, v0, v1
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call half @llvm.minimum.f16(half 8.0, half %b)
+ %max1 = call half @llvm.minimum.f16(half %max0, half %c)
+ ret half %max1
+}
+
+define half @v_fminimum3_f16__const2(half %a, half %b) {
+; GFX12-LABEL: v_fminimum3_f16__const2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, 0x4800
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16__const2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f16_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_min_f16_e32 v1, 0x4800, v0
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call half @llvm.minimum.f16(half %a, half %b)
+ %max1 = call half @llvm.minimum.f16(half %max0, half 8.0)
+ ret half %max1
+}
+
+define half @v_fminimum3_f16_inlineimm0(half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_inlineimm0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f16 v0, v0, 4.0, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_inlineimm0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f16_e32 v2, 4.0, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_min_f16_e32 v2, v0, v1
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call half @llvm.minimum.f16(half 4.0, half %b)
+ %max1 = call half @llvm.minimum.f16(half %max0, half %c)
+ ret half %max1
+}
+
+define half @v_fminimum3_f16__inlineimm(half %a, half %b) {
+; GFX12-LABEL: v_fminimum3_f16__inlineimm:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, 4.0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16__inlineimm:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f16_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_min_f16_e32 v1, 4.0, v0
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call half @llvm.minimum.f16(half %a, half %b)
+ %max1 = call half @llvm.minimum.f16(half %max0, half 4.0)
+ ret half %max1
+}
+
+define half @v_fminimum3_f16_const1_const2(half %a) {
+; GFX12-LABEL: v_fminimum3_f16_const1_const2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_movk_i32 s0, 0x4800
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_minimum3_f16 v0, v0, s0, 0x4c00
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_const1_const2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f16_e32 v1, 0x4800, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_min_f16_e32 v1, 0x4c00, v0
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call half @llvm.minimum.f16(half %a, half 8.0)
+ %max1 = call half @llvm.minimum.f16(half %max0, half 16.0)
+ ret half %max1
+}
+
+define <2 x half> @v_fminimum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v2f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v2, v0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_min_f16 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v1, v0, v5, s4
+; GFX9-NEXT: v_pk_min_f16 v1, v2, v1
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v2, v0 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
+ %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %c, <2 x half> %max0)
+ ret <2 x half> %max1
+}
+
+define <2 x half> @v_fminimum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v2f16_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f16_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_min_f16 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v1, v0, v5, s4
+; GFX9-NEXT: v_pk_min_f16 v1, v1, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
+ %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> %c)
+ ret <2 x half> %max1
+}
+
+define <2 x half> @v_fminimum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v2f16__fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
+; GFX12-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f16__fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0
+; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v1
+; GFX9-NEXT: v_pk_min_f16 v3, v3, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2
+; GFX9-NEXT: v_perm_b32 v1, v4, v0, s4
+; GFX9-NEXT: v_pk_min_f16 v1, v1, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, |v2| src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc
+; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a)
+ %b.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %b)
+ %c.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %c)
+ %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a.fabs, <2 x half> %b.fabs)
+ %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> %c.fabs)
+ ret <2 x half> %max1
+}
+
+define <2 x half> @v_fminimum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v2f16__fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f16__fneg_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_min_f16 v3, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v1, v0, v5, s4
+; GFX9-NEXT: v_pk_min_f16 v1, v1, v2 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v5, -v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg <2 x half> %a
+ %b.fneg = fneg <2 x half> %b
+ %c.fneg = fneg <2 x half> %c
+ %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a.fneg, <2 x half> %b.fneg)
+ %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> %c.fneg)
+ ret <2 x half> %max1
+}
+
+define <2 x half> @v_fminimum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v2f16__inlineimm1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, 2.0 op_sel_hi:[1,0]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f16__inlineimm1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_min_f16 v2, v0, 2.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v2, v3, v0, s4
+; GFX9-NEXT: v_pk_min_f16 v2, v2, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v3, v1 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> <half 2.0, half 2.0>)
+ %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> %c)
+ ret <2 x half> %max1
+}
+
+define <2 x half> @v_fminimum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b) {
+; GFX12-LABEL: v_fminimum3_v2f16__inlineimm2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, 4.0 op_sel_hi:[1,0]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f16__inlineimm2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_min_f16 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v1, v0, v4, s4
+; GFX9-NEXT: v_pk_min_f16 v1, v1, 4.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
+ %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> <half 4.0, half 4.0>)
+ ret <2 x half> %max1
+}
+
+define <3 x half> @v_fminimum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v3f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v4, v0
+; GFX12-NEXT: v_pk_minimum_f16 v1, v5, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_min_f16 v6, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_pk_min_f16 v2, v1, v3
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
+; GFX9-NEXT: v_pk_min_f16 v1, v5, v1
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v6
+; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX9-NEXT: v_pk_min_f16 v2, v4, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b)
+ %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %c, <3 x half> %max0)
+ ret <3 x half> %max1
+}
+
+define <3 x half> @v_fminimum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v3f16_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v4
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f16_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_min_f16 v6, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_pk_min_f16 v2, v1, v3
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
+; GFX9-NEXT: v_pk_min_f16 v1, v1, v5
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v5
+; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX9-NEXT: v_pk_min_f16 v2, v2, v4
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v8, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b)
+ %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> %c)
+ ret <3 x half> %max1
+}
+
+define <3 x half> @v_fminimum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v3f16__fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
+; GFX12-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
+; GFX12-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3
+; GFX12-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v5
+; GFX12-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v4
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f16__fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v1
+; GFX9-NEXT: v_and_b32_e32 v9, 0x7fff7fff, v3
+; GFX9-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v0
+; GFX9-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v2
+; GFX9-NEXT: v_pk_min_f16 v7, v7, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX9-NEXT: v_mov_b32_e32 v12, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_pk_min_f16 v6, v6, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3|
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v7, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v4
+; GFX9-NEXT: v_perm_b32 v2, v8, v0, s4
+; GFX9-NEXT: v_pk_min_f16 v2, v2, v11
+; GFX9-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v4| src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_perm_b32 v6, v9, v1, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
+; GFX9-NEXT: v_pk_min_f16 v6, v6, v10
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5|
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %a)
+ %b.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %b)
+ %c.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %c)
+ %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a.fabs, <3 x half> %b.fabs)
+ %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> %c.fabs)
+ ret <3 x half> %max1
+}
+
+define <3 x half> @v_fminimum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v3f16__fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2 neg_lo:[1,1] neg_hi:[1,1]
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f16__fneg_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_min_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_pk_min_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
+; GFX9-NEXT: v_pk_min_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5
+; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX9-NEXT: v_pk_min_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg <3 x half> %a
+ %b.fneg = fneg <3 x half> %b
+ %c.fneg = fneg <3 x half> %c
+ %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a.fneg, <3 x half> %b.fneg)
+ %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> %c.fneg)
+ ret <3 x half> %max1
+}
+
+define <3 x half> @v_fminimum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v3f16__inlineimm1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, 2.0 op_sel_hi:[1,0]
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, 2.0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f16__inlineimm1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX9-NEXT: v_pk_min_f16 v7, v1, 2.0
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc
+; GFX9-NEXT: s_mov_b32 s5, 0x5040100
+; GFX9-NEXT: v_perm_b32 v4, v5, v0, s5
+; GFX9-NEXT: v_pk_min_f16 v4, v4, v2
+; GFX9-NEXT: s_movk_i32 s4, 0x7e00
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
+; GFX9-NEXT: v_pack_b32_f16 v7, v1, s4
+; GFX9-NEXT: v_pk_min_f16 v7, v7, v3
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc
+; GFX9-NEXT: v_perm_b32 v0, v5, v0, s5
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> <half 2.0, half 2.0, half 2.0>)
+ %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> %c)
+ ret <3 x half> %max1
+}
+
+define <3 x half> @v_fminimum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) {
+; GFX12-LABEL: v_fminimum3_v3f16__inlineimm2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, 4.0 op_sel_hi:[1,0]
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, 4.0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f16__inlineimm2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_min_f16 v4, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-NEXT: v_pk_min_f16 v2, v1, v3
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4
+; GFX9-NEXT: v_pk_min_f16 v1, v1, 4.0
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v4
+; GFX9-NEXT: v_perm_b32 v2, v0, v6, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b)
+ %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> <half 4.0, half 4.0, half 4.0>)
+ ret <3 x half> %max1
+}
+
+define <4 x half> @v_fminimum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v4f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v4, v0
+; GFX12-NEXT: v_pk_minimum_f16 v1, v5, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v4f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_min_f16 v6, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_pk_min_f16 v2, v1, v3
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v2, v1, v6, s4
+; GFX9-NEXT: v_pk_min_f16 v2, v5, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT: v_pk_min_f16 v2, v4, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
+; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b)
+ %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %c, <4 x half> %max0)
+ ret <4 x half> %max1
+}
+
+define <4 x half> @v_fminimum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v4f16_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v4
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v4f16_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_min_f16 v6, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_pk_min_f16 v2, v1, v3
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v2, v1, v6, s4
+; GFX9-NEXT: v_pk_min_f16 v2, v2, v5
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT: v_pk_min_f16 v2, v2, v4
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v8, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
+; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b)
+ %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %max0, <4 x half> %c)
+ ret <4 x half> %max1
+}
+
+define <4 x half> @v_fminimum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v4f16__fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
+; GFX12-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
+; GFX12-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3
+; GFX12-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v5
+; GFX12-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v4
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v4f16__fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v0
+; GFX9-NEXT: v_and_b32_e32 v9, 0x7fff7fff, v2
+; GFX9-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v1
+; GFX9-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v3
+; GFX9-NEXT: v_pk_min_f16 v7, v7, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX9-NEXT: v_mov_b32_e32 v12, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_pk_min_f16 v6, v6, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v7, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3|
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v5
+; GFX9-NEXT: v_perm_b32 v2, v8, v1, s4
+; GFX9-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v4
+; GFX9-NEXT: v_pk_min_f16 v2, v2, v11
+; GFX9-NEXT: v_perm_b32 v6, v9, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_pk_min_f16 v6, v6, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5|
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc
+; GFX9-NEXT: v_perm_b32 v0, v7, v0, s4
+; GFX9-NEXT: v_perm_b32 v1, v3, v1, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %a)
+ %b.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %b)
+ %c.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %c)
+ %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a.fabs, <4 x half> %b.fabs)
+ %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %max0, <4 x half> %c.fabs)
+ ret <4 x half> %max1
+}
+
+define <4 x half> @v_fminimum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v4f16__fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2 neg_lo:[1,1] neg_hi:[1,1]
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v4f16__fneg_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_min_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_pk_min_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v2, v1, v6, s4
+; GFX9-NEXT: v_pk_min_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, -v5 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT: v_pk_min_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
+; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg <4 x half> %a
+ %b.fneg = fneg <4 x half> %b
+ %c.fneg = fneg <4 x half> %c
+ %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a.fneg, <4 x half> %b.fneg)
+ %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %max0, <4 x half> %c.fneg)
+ ret <4 x half> %max1
+}
+
+define <4 x half> @v_fminimum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v4f16__inlineimm1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, 2.0 op_sel_hi:[1,0]
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, 2.0 op_sel_hi:[1,0]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v4f16__inlineimm1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_pk_min_f16 v7, v1, 2.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v7
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v6, v8, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v4, v8, v1, s4
+; GFX9-NEXT: v_pk_min_f16 v4, v4, v3
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v8, v3 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_perm_b32 v8, v5, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4
+; GFX9-NEXT: v_pk_min_f16 v8, v8, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v8
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v9, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc
+; GFX9-NEXT: v_perm_b32 v0, v5, v0, s4
+; GFX9-NEXT: v_perm_b32 v1, v7, v1, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> <half 2.0, half 2.0, half 2.0, half 2.0>)
+ %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %max0, <4 x half> %c)
+ ret <4 x half> %max1
+}
+
+define <4 x half> @v_fminimum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) {
+; GFX12-LABEL: v_fminimum3_v4f16__inlineimm2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, 4.0 op_sel_hi:[1,0]
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, 4.0 op_sel_hi:[1,0]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v4f16__inlineimm2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_min_f16 v4, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-NEXT: v_pk_min_f16 v2, v1, v3
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v2, v1, v4, s4
+; GFX9-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX9-NEXT: v_perm_b32 v2, v0, v6, s4
+; GFX9-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4
+; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b)
+ %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %max0, <4 x half> <half 4.0, half 4.0, half 4.0, half 4.0>)
+ ret <4 x half> %max1
+}
+
+define double @v_fminimum3_f64(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call double @llvm.minimum.f64(double %a, double %b)
+ %max1 = call double @llvm.minimum.f64(double %max0, double %c)
+ ret double %max1
+}
+
+define double @v_fminimum3_f64_commute(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[4:5], v[0:1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_min_f64 v[2:3], v[4:5], v[0:1]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call double @llvm.minimum.f64(double %a, double %b)
+ %max1 = call double @llvm.minimum.f64(double %c, double %max0)
+ ret double %max1
+}
+
+define amdgpu_ps <2 x i32> @s_fminimum3_f64(double inreg %a, double inreg %b, double inreg %c) {
+; GFX12-LABEL: s_fminimum3_f64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f64 v[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], s[4:5]
+; GFX12-NEXT: v_readfirstlane_b32 s0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_readfirstlane_b32 s1, v1
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_fminimum3_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], s[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[4:5], v[0:1]
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
+; GFX9-NEXT: v_readfirstlane_b32 s1, v0
+; GFX9-NEXT: ; return to shader part epilog
+ %max0 = call double @llvm.minimum.f64(double %a, double %b)
+ %max1 = call double @llvm.minimum.f64(double %max0, double %c)
+ %cast = bitcast double %max1 to <2 x i32>
+ %elt0 = extractelement <2 x i32> %cast, i32 0
+ %elt1 = extractelement <2 x i32> %cast, i32 1
+ %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0)
+ %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1)
+ %insert.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
+ %insert.1 = insertelement <2 x i32> %insert.0, i32 %readlane1, i32 1
+ ret <2 x i32> %insert.1
+}
+
+define double @v_fminimum3_f64_fabs0(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_fabs0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum_f64 v[0:1], |v[0:1]|, v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_fabs0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f64 v[6:7], |v[0:1]|, v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call double @llvm.fabs.f64(double %a)
+ %max0 = call double @llvm.minimum.f64(double %a.fabs, double %b)
+ %max1 = call double @llvm.minimum.f64(double %max0, double %c)
+ ret double %max1
+}
+
+define double @v_fminimum3_f64_fabs1(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_fabs1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], |v[2:3]|
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_fabs1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], |v[2:3]|
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[2:3]|
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %b.fabs = call double @llvm.fabs.f64(double %b)
+ %max0 = call double @llvm.minimum.f64(double %a, double %b.fabs)
+ %max1 = call double @llvm.minimum.f64(double %max0, double %c)
+ ret double %max1
+}
+
+define double @v_fminimum3_f64_fabs2(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_fabs2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], |v[4:5]|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_fabs2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], |v[4:5]|
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]|
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %c.fabs = call double @llvm.fabs.f64(double %c)
+ %max0 = call double @llvm.minimum.f64(double %a, double %b)
+ %max1 = call double @llvm.minimum.f64(double %max0, double %c.fabs)
+ ret double %max1
+}
+
+define double @v_fminimum3_f64_fabs_all(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum_f64 v[0:1], |v[0:1]|, |v[2:3]|
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], |v[4:5]|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f64 v[6:7], |v[0:1]|, |v[2:3]|
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[2:3]|
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], |v[4:5]|
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]|
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call double @llvm.fabs.f64(double %a)
+ %b.fabs = call double @llvm.fabs.f64(double %b)
+ %c.fabs = call double @llvm.fabs.f64(double %c)
+ %max0 = call double @llvm.minimum.f64(double %a.fabs, double %b.fabs)
+ %max1 = call double @llvm.minimum.f64(double %max0, double %c.fabs)
+ ret double %max1
+}
+
+define double @v_fminimum3_f64_fneg_all(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum_f64 v[0:1], -v[0:1], -v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], -v[4:5]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_fneg_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f64 v[6:7], -v[0:1], -v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], -v[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg double %a
+ %b.fneg = fneg double %b
+ %c.fneg = fneg double %c
+ %max0 = call double @llvm.minimum.f64(double %a.fneg, double %b.fneg)
+ %max1 = call double @llvm.minimum.f64(double %max0, double %c.fneg)
+ ret double %max1
+}
+
+define double @v_fminimum3_f64_fneg_fabs_all(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_fneg_fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum_f64 v[0:1], -|v[0:1]|, -|v[2:3]|
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], -|v[4:5]|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_fneg_fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f64 v[6:7], -|v[0:1]|, -|v[2:3]|
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -|v[0:1]|, -|v[2:3]|
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], -|v[4:5]|
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -|v[4:5]|
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call double @llvm.fabs.f64(double %a)
+ %b.fabs = call double @llvm.fabs.f64(double %b)
+ %c.fabs = call double @llvm.fabs.f64(double %c)
+ %a.fneg.fabs = fneg double %a.fabs
+ %b.fneg.fabs = fneg double %b.fabs
+ %c.fneg.fabs = fneg double %c.fabs
+ %max0 = call double @llvm.minimum.f64(double %a.fneg.fabs, double %b.fneg.fabs)
+ %max1 = call double @llvm.minimum.f64(double %max0, double %c.fneg.fabs)
+ ret double %max1
+}
+
+define double @v_fminimum3_f64_fneg0(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_fneg0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum_f64 v[0:1], -v[0:1], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_fneg0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f64 v[6:7], -v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg double %a
+ %max0 = call double @llvm.minimum.f64(double %a.fneg, double %b)
+ %max1 = call double @llvm.minimum.f64(double %max0, double %c)
+ ret double %max1
+}
+
+define double @v_fminimum3_f64_fneg1(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_fneg1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], -v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_fneg1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], -v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %b.fneg = fneg double %b
+ %max0 = call double @llvm.minimum.f64(double %a, double %b.fneg)
+ %max1 = call double @llvm.minimum.f64(double %max0, double %c)
+ ret double %max1
+}
+
+define double @v_fminimum3_f64_fneg2(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_fneg2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], -v[4:5]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_fneg2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], -v[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %c.fneg = fneg double %c
+ %max0 = call double @llvm.minimum.f64(double %a, double %b)
+ %max1 = call double @llvm.minimum.f64(double %max0, double %c.fneg)
+ ret double %max1
+}
+
+define double @v_fminimum3_f64_const0(double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_const0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum_f64 v[0:1], 0x40200000, v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_const0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0
+; GFX9-NEXT: s_mov_b32 s5, 0x40200000
+; GFX9-NEXT: v_min_f64 v[4:5], v[0:1], s[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call double @llvm.minimum.f64(double 8.0, double %b)
+ %max1 = call double @llvm.minimum.f64(double %max0, double %c)
+ ret double %max1
+}
+
+define double @v_fminimum3_f64__const2(double %a, double %b) {
+; GFX12-LABEL: v_fminimum3_f64__const2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_minimum_f64 v[0:1], 0x40200000, v[0:1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64__const2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-NEXT: s_mov_b32 s4, 0
+; GFX9-NEXT: s_mov_b32 s5, 0x40200000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], s[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call double @llvm.minimum.f64(double %a, double %b)
+ %max1 = call double @llvm.minimum.f64(double %max0, double 8.0)
+ ret double %max1
+}
+
+define double @v_fminimum3_f64_inlineimm0(double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_inlineimm0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], 4.0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_inlineimm0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f64 v[4:5], v[0:1], 4.0
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call double @llvm.minimum.f64(double 4.0, double %b)
+ %max1 = call double @llvm.minimum.f64(double %max0, double %c)
+ ret double %max1
+}
+
+define double @v_fminimum3_f64__inlineimm(double %a, double %b) {
+; GFX12-LABEL: v_fminimum3_f64__inlineimm:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], 4.0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64__inlineimm:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], 4.0
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call double @llvm.minimum.f64(double %a, double %b)
+ %max1 = call double @llvm.minimum.f64(double %max0, double 4.0)
+ ret double %max1
+}
+
+define double @v_fminimum3_f64_const1_const2(double %a) {
+; GFX12-LABEL: v_fminimum3_f64_const1_const2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum_f64 v[0:1], 0x40200000, v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_minimum_f64 v[0:1], 0x40300000, v[0:1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_const1_const2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0
+; GFX9-NEXT: s_mov_b32 s5, 0x40200000
+; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], s[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7ff80000
+; GFX9-NEXT: s_mov_b32 s4, 0
+; GFX9-NEXT: s_mov_b32 s5, 0x40300000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], s[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call double @llvm.minimum.f64(double %a, double 8.0)
+ %max1 = call double @llvm.minimum.f64(double %max0, double 16.0)
+ ret double %max1
+}
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
index 64063f6..04ef30b 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
@@ -253,25 +253,25 @@ define amdgpu_kernel void @fp_to_sint_i64 (ptr addrspace(1) %out, float %in) {
; EG-NEXT: ADD_INT * T2.W, PV.W, literal.y,
; EG-NEXT: 8388608(1.175494e-38), -150(nan)
; EG-NEXT: ADD_INT T0.X, T0.W, literal.x,
-; EG-NEXT: SUB_INT T0.Y, literal.y, T0.W,
-; EG-NEXT: AND_INT T0.Z, PS, literal.z,
+; EG-NEXT: AND_INT T0.Y, PS, literal.y,
+; EG-NEXT: SUB_INT T0.Z, literal.z, T0.W,
; EG-NEXT: NOT_INT T0.W, PS,
; EG-NEXT: LSHR * T3.W, PV.W, 1,
-; EG-NEXT: -127(nan), 150(2.101948e-43)
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: -127(nan), 31(4.344025e-44)
+; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00)
; EG-NEXT: BIT_ALIGN_INT T1.X, 0.0, PS, PV.W,
-; EG-NEXT: LSHL T1.Y, T1.W, PV.Z,
-; EG-NEXT: AND_INT T0.Z, T2.W, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, T1.W, PV.Y, BS:VEC_021/SCL_122
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT T1.Y, PV.Z, literal.x,
+; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, T1.W, PV.Z,
+; EG-NEXT: LSHL T0.W, T1.W, PV.Y,
+; EG-NEXT: AND_INT * T1.W, T2.W, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.Y, PS, PV.W, 0.0,
-; EG-NEXT: CNDE_INT T1.Z, PV.Z, PV.Y, 0.0,
-; EG-NEXT: CNDE_INT T0.W, PV.Z, PV.X, PV.Y,
+; EG-NEXT: CNDE_INT T0.Z, PV.Y, PV.Z, 0.0,
+; EG-NEXT: CNDE_INT T0.W, PS, PV.X, PV.W,
; EG-NEXT: SETGT_INT * T1.W, T0.X, literal.x,
; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT T0.Z, PS, 0.0, PV.W,
-; EG-NEXT: CNDE_INT T0.W, PS, PV.Y, PV.Z,
+; EG-NEXT: CNDE_INT T1.Z, PS, 0.0, PV.W,
+; EG-NEXT: CNDE_INT T0.W, PS, PV.Z, PV.Y,
; EG-NEXT: ASHR * T1.W, KC0[2].Z, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: XOR_INT T0.W, PV.W, PS,
@@ -364,79 +364,78 @@ define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> %
;
; EG-LABEL: fp_to_sint_v2i64:
; EG: ; %bb.0:
-; EG-NEXT: ALU 75, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 74, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT * T1.W, KC0[2].W, literal.x, PV.W,
-; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T0.Z, KC0[2].W, literal.x,
-; EG-NEXT: BFE_UINT T0.W, KC0[3].X, literal.y, T0.W,
-; EG-NEXT: ADD_INT * T2.W, PV.W, literal.z,
-; EG-NEXT: 8388607(1.175494e-38), 23(3.222986e-44)
+; EG-NEXT: BFE_UINT T0.Z, KC0[3].X, literal.x, PV.W,
+; EG-NEXT: BFE_UINT T0.W, KC0[2].W, literal.x, PV.W,
+; EG-NEXT: AND_INT * T1.Z, KC0[2].W, literal.y,
+; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38)
+; EG-NEXT: ADD_INT T1.W, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T2.W, PV.Z, literal.x,
; EG-NEXT: -150(nan), 0(0.000000e+00)
-; EG-NEXT: SUB_INT T0.X, literal.x, PV.W,
-; EG-NEXT: SUB_INT T0.Y, literal.x, T1.W,
-; EG-NEXT: AND_INT T1.Z, PS, literal.y,
-; EG-NEXT: OR_INT T3.W, PV.Z, literal.z,
+; EG-NEXT: AND_INT T0.X, PS, literal.x,
+; EG-NEXT: AND_INT T0.Y, PV.W, literal.x,
+; EG-NEXT: OR_INT T1.Z, T1.Z, literal.y,
+; EG-NEXT: SUB_INT T3.W, literal.z, T0.W,
; EG-NEXT: AND_INT * T4.W, KC0[3].X, literal.w,
-; EG-NEXT: 150(2.101948e-43), 31(4.344025e-44)
-; EG-NEXT: 8388608(1.175494e-38), 8388607(1.175494e-38)
+; EG-NEXT: 31(4.344025e-44), 8388608(1.175494e-38)
+; EG-NEXT: 150(2.101948e-43), 8388607(1.175494e-38)
; EG-NEXT: OR_INT T1.X, PS, literal.x,
-; EG-NEXT: LSHL T1.Y, PV.W, PV.Z,
-; EG-NEXT: AND_INT T0.Z, T2.W, literal.y,
-; EG-NEXT: BIT_ALIGN_INT T4.W, 0.0, PV.W, PV.Y,
-; EG-NEXT: AND_INT * T5.W, PV.Y, literal.y,
+; EG-NEXT: AND_INT T1.Y, PV.W, literal.y,
+; EG-NEXT: BIT_ALIGN_INT T2.Z, 0.0, PV.Z, PV.W,
+; EG-NEXT: LSHL T3.W, PV.Z, PV.Y,
+; EG-NEXT: AND_INT * T4.W, T1.W, literal.y,
; EG-NEXT: 8388608(1.175494e-38), 32(4.484155e-44)
-; EG-NEXT: CNDE_INT T2.X, PS, PV.W, 0.0,
-; EG-NEXT: CNDE_INT T0.Y, PV.Z, PV.Y, 0.0,
-; EG-NEXT: ADD_INT T1.Z, T0.W, literal.x,
-; EG-NEXT: BIT_ALIGN_INT T4.W, 0.0, PV.X, T0.X,
-; EG-NEXT: AND_INT * T5.W, T0.X, literal.y,
-; EG-NEXT: -150(nan), 32(4.484155e-44)
+; EG-NEXT: CNDE_INT T0.Y, PS, PV.W, 0.0,
+; EG-NEXT: CNDE_INT T2.Z, PV.Y, PV.Z, 0.0,
+; EG-NEXT: LSHL T5.W, PV.X, T0.X,
+; EG-NEXT: AND_INT * T6.W, T2.W, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, PS, PV.W, 0.0,
-; EG-NEXT: NOT_INT T2.Y, T2.W,
-; EG-NEXT: AND_INT T2.Z, PV.Z, literal.x,
-; EG-NEXT: NOT_INT T2.W, PV.Z,
-; EG-NEXT: LSHR * T4.W, T1.X, 1,
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T3.X, T3.W, 1,
-; EG-NEXT: ADD_INT T3.Y, T0.W, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: BIT_ALIGN_INT T3.Z, 0.0, PS, PV.W,
-; EG-NEXT: LSHL T0.W, T1.X, PV.Z,
-; EG-NEXT: AND_INT * T2.W, T1.Z, literal.y,
+; EG-NEXT: NOT_INT T1.Y, T1.W,
+; EG-NEXT: SUB_INT T3.Z, literal.x, T0.Z,
+; EG-NEXT: NOT_INT T1.W, T2.W, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR * T2.W, T1.X, 1,
+; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00)
+; EG-NEXT: LSHR T2.X, T1.Z, 1,
+; EG-NEXT: ADD_INT T2.Y, T0.Z, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, PS, PV.W,
+; EG-NEXT: BIT_ALIGN_INT T1.W, 0.0, T1.X, PV.Z,
+; EG-NEXT: AND_INT * T2.W, PV.Z, literal.y,
; EG-NEXT: -127(nan), 32(4.484155e-44)
; EG-NEXT: CNDE_INT T1.X, PS, PV.W, 0.0,
-; EG-NEXT: CNDE_INT T4.Y, PS, PV.Z, PV.W,
-; EG-NEXT: SETGT_INT T1.Z, PV.Y, literal.x,
-; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, PV.X, T2.Y,
-; EG-NEXT: ADD_INT * T1.W, T1.W, literal.y,
+; EG-NEXT: CNDE_INT T3.Y, T6.W, PV.Z, T5.W, BS:VEC_021/SCL_122
+; EG-NEXT: SETGT_INT T0.Z, PV.Y, literal.x,
+; EG-NEXT: BIT_ALIGN_INT T1.W, 0.0, PV.X, T1.Y,
+; EG-NEXT: ADD_INT * T0.W, T0.W, literal.y,
; EG-NEXT: 23(3.222986e-44), -127(nan)
-; EG-NEXT: CNDE_INT T3.X, T0.Z, PV.W, T1.Y,
+; EG-NEXT: CNDE_INT T2.X, T4.W, PV.W, T3.W,
; EG-NEXT: SETGT_INT T1.Y, PS, literal.x,
-; EG-NEXT: CNDE_INT T0.Z, PV.Z, 0.0, PV.Y,
-; EG-NEXT: CNDE_INT T0.W, PV.Z, T0.X, PV.X,
+; EG-NEXT: CNDE_INT T1.Z, PV.Z, 0.0, PV.Y,
+; EG-NEXT: CNDE_INT T1.W, PV.Z, PV.X, T0.X,
; EG-NEXT: ASHR * T2.W, KC0[3].X, literal.y,
; EG-NEXT: 23(3.222986e-44), 31(4.344025e-44)
; EG-NEXT: XOR_INT T0.X, PV.W, PS,
-; EG-NEXT: XOR_INT T2.Y, PV.Z, PS,
+; EG-NEXT: XOR_INT T3.Y, PV.Z, PS,
; EG-NEXT: CNDE_INT T0.Z, PV.Y, 0.0, PV.X,
-; EG-NEXT: CNDE_INT T0.W, PV.Y, T2.X, T0.Y,
+; EG-NEXT: CNDE_INT T1.W, PV.Y, T2.Z, T0.Y,
; EG-NEXT: ASHR * T3.W, KC0[2].W, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: XOR_INT T0.Y, PV.W, PS,
; EG-NEXT: XOR_INT T0.Z, PV.Z, PS,
-; EG-NEXT: SUB_INT T0.W, PV.Y, T2.W,
+; EG-NEXT: SUB_INT T1.W, PV.Y, T2.W,
; EG-NEXT: SUBB_UINT * T4.W, PV.X, T2.W,
; EG-NEXT: SUB_INT T1.Y, PV.W, PS,
-; EG-NEXT: SETGT_INT T1.Z, 0.0, T3.Y,
-; EG-NEXT: SUB_INT T0.W, PV.Z, T3.W,
+; EG-NEXT: SETGT_INT T1.Z, 0.0, T2.Y,
+; EG-NEXT: SUB_INT T1.W, PV.Z, T3.W,
; EG-NEXT: SUBB_UINT * T4.W, PV.Y, T3.W,
; EG-NEXT: SUB_INT T0.Z, PV.W, PS,
-; EG-NEXT: SETGT_INT T0.W, 0.0, T1.W,
+; EG-NEXT: SETGT_INT T0.W, 0.0, T0.W,
; EG-NEXT: CNDE_INT * T1.W, PV.Z, PV.Y, 0.0,
; EG-NEXT: CNDE_INT T1.Y, PV.W, PV.Z, 0.0,
; EG-NEXT: SUB_INT * T2.W, T0.X, T2.W,
@@ -567,170 +566,168 @@ define amdgpu_kernel void @fp_to_sint_v4i64(ptr addrspace(1) %out, <4 x float> %
;
; EG-LABEL: fp_to_sint_v4i64:
; EG: ; %bb.0:
-; EG-NEXT: ALU 101, @6, KC0[CB0:0-32], KC1[]
-; EG-NEXT: ALU 54, @108, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T2.X, 1
+; EG-NEXT: ALU 99, @6, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 54, @106, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 6:
; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T1.W, KC0[4].X, literal.x, PV.W,
-; EG-NEXT: AND_INT * T2.W, KC0[4].X, literal.y,
+; EG-NEXT: BFE_UINT T1.W, KC0[3].Z, literal.x, PV.W,
+; EG-NEXT: AND_INT * T2.W, KC0[3].Z, literal.y,
; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38)
-; EG-NEXT: OR_INT T0.Z, PS, literal.x,
-; EG-NEXT: BFE_UINT T2.W, KC0[3].Z, literal.y, T0.W,
-; EG-NEXT: ADD_INT * T3.W, PV.W, literal.z,
-; EG-NEXT: 8388608(1.175494e-38), 23(3.222986e-44)
-; EG-NEXT: -150(nan), 0(0.000000e+00)
-; EG-NEXT: ADD_INT T0.Y, PV.W, literal.x,
-; EG-NEXT: AND_INT T1.Z, PS, literal.y,
-; EG-NEXT: NOT_INT T4.W, PS,
-; EG-NEXT: LSHR * T5.W, PV.Z, 1,
-; EG-NEXT: -127(nan), 31(4.344025e-44)
+; EG-NEXT: OR_INT T2.W, PS, literal.x,
+; EG-NEXT: ADD_INT * T3.W, PV.W, literal.y,
+; EG-NEXT: 8388608(1.175494e-38), -150(nan)
; EG-NEXT: ADD_INT T0.X, T1.W, literal.x,
-; EG-NEXT: BIT_ALIGN_INT T1.Y, 0.0, PS, PV.W,
-; EG-NEXT: AND_INT T2.Z, T3.W, literal.y, BS:VEC_201
-; EG-NEXT: LSHL T3.W, T0.Z, PV.Z,
-; EG-NEXT: SUB_INT * T1.W, literal.z, T1.W,
-; EG-NEXT: -127(nan), 32(4.484155e-44)
-; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.X, PS, literal.x,
-; EG-NEXT: BIT_ALIGN_INT T2.Y, 0.0, T0.Z, PS,
-; EG-NEXT: AND_INT T0.Z, KC0[3].Z, literal.y,
-; EG-NEXT: CNDE_INT T1.W, PV.Z, PV.Y, PV.W,
-; EG-NEXT: SETGT_INT * T4.W, PV.X, literal.z,
+; EG-NEXT: BFE_UINT T0.Y, KC0[4].X, literal.y, T0.W,
+; EG-NEXT: AND_INT T0.Z, PS, literal.z,
+; EG-NEXT: NOT_INT T4.W, PS,
+; EG-NEXT: LSHR * T5.W, PV.W, 1,
+; EG-NEXT: -127(nan), 23(3.222986e-44)
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: BIT_ALIGN_INT T1.X, 0.0, PS, PV.W,
+; EG-NEXT: AND_INT T1.Y, T3.W, literal.x,
+; EG-NEXT: LSHL T0.Z, T2.W, PV.Z, BS:VEC_120/SCL_212
+; EG-NEXT: AND_INT T3.W, KC0[4].X, literal.y,
+; EG-NEXT: ADD_INT * T4.W, PV.Y, literal.z,
; EG-NEXT: 32(4.484155e-44), 8388607(1.175494e-38)
+; EG-NEXT: -150(nan), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.Y, PS, literal.x,
+; EG-NEXT: OR_INT T1.Z, PV.W, literal.y,
+; EG-NEXT: CNDE_INT T3.W, PV.Y, PV.X, PV.Z,
+; EG-NEXT: SETGT_INT * T5.W, T0.X, literal.z,
+; EG-NEXT: 31(4.344025e-44), 8388608(1.175494e-38)
; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT T2.X, PS, 0.0, PV.W,
-; EG-NEXT: OR_INT T1.Y, PV.Z, literal.x,
-; EG-NEXT: ADD_INT T0.Z, T2.W, literal.y,
-; EG-NEXT: CNDE_INT T1.W, PV.X, PV.Y, 0.0,
-; EG-NEXT: CNDE_INT * T3.W, T2.Z, T3.W, 0.0,
-; EG-NEXT: 8388608(1.175494e-38), -150(nan)
-; EG-NEXT: CNDE_INT T1.X, T4.W, PV.W, PS,
-; EG-NEXT: ASHR T2.Y, KC0[4].X, literal.x,
-; EG-NEXT: AND_INT T1.Z, PV.Z, literal.x,
-; EG-NEXT: NOT_INT T1.W, PV.Z,
-; EG-NEXT: LSHR * T3.W, PV.Y, 1,
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BIT_ALIGN_INT T3.X, 0.0, PS, PV.W,
-; EG-NEXT: LSHL T3.Y, T1.Y, PV.Z,
-; EG-NEXT: XOR_INT T1.Z, PV.X, PV.Y,
-; EG-NEXT: XOR_INT T1.W, T2.X, PV.Y,
-; EG-NEXT: SUB_INT * T2.W, literal.x, T2.W,
-; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.X, T0.Z, literal.x,
-; EG-NEXT: AND_INT T4.Y, PS, literal.x,
-; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, T1.Y, PS, BS:VEC_021/SCL_122
-; EG-NEXT: SUB_INT T1.W, PV.W, T2.Y,
-; EG-NEXT: SUBB_UINT * T2.W, PV.Z, T2.Y,
-; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: SUB_INT T2.X, PV.W, PS,
-; EG-NEXT: CNDE_INT T1.Y, PV.Y, PV.Z, 0.0,
-; EG-NEXT: CNDE_INT T0.Z, PV.X, T3.Y, 0.0,
-; EG-NEXT: CNDE_INT T1.W, PV.X, T3.X, T3.Y, BS:VEC_021/SCL_122
-; EG-NEXT: SETGT_INT * T2.W, T0.Y, literal.x,
+; EG-NEXT: CNDE_INT T3.Y, PS, 0.0, PV.W,
+; EG-NEXT: SUB_INT T2.Z, literal.x, T1.W,
+; EG-NEXT: LSHL T1.W, PV.Z, PV.Y,
+; EG-NEXT: AND_INT * T3.W, T4.W, literal.y,
+; EG-NEXT: 150(2.101948e-43), 32(4.484155e-44)
+; EG-NEXT: CNDE_INT T1.X, PS, PV.W, 0.0,
+; EG-NEXT: AND_INT T2.Y, PV.Z, literal.x,
+; EG-NEXT: SUB_INT T3.Z, literal.y, T0.Y,
+; EG-NEXT: NOT_INT T4.W, T4.W,
+; EG-NEXT: LSHR * T6.W, T1.Z, 1,
+; EG-NEXT: 32(4.484155e-44), 150(2.101948e-43)
+; EG-NEXT: BIT_ALIGN_INT T2.X, 0.0, T2.W, T2.Z,
+; EG-NEXT: ADD_INT T0.Y, T0.Y, literal.x,
+; EG-NEXT: BIT_ALIGN_INT T2.Z, 0.0, PS, PV.W,
+; EG-NEXT: BIT_ALIGN_INT T2.W, 0.0, T1.Z, PV.Z,
+; EG-NEXT: AND_INT * T4.W, PV.Z, literal.y,
+; EG-NEXT: -127(nan), 32(4.484155e-44)
+; EG-NEXT: CNDE_INT T3.X, PS, PV.W, 0.0,
+; EG-NEXT: CNDE_INT T4.Y, T3.W, PV.Z, T1.W,
+; EG-NEXT: SETGT_INT T1.Z, PV.Y, literal.x,
+; EG-NEXT: CNDE_INT T1.W, T1.Y, T0.Z, 0.0,
+; EG-NEXT: CNDE_INT * T2.W, T2.Y, PV.X, 0.0,
; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T1.X, KC0[3].W, literal.x, T0.W,
-; EG-NEXT: AND_INT T3.Y, KC0[3].W, literal.y,
-; EG-NEXT: CNDE_INT T2.Z, PS, 0.0, PV.W,
-; EG-NEXT: CNDE_INT T1.W, PS, PV.Y, PV.Z,
-; EG-NEXT: ASHR * T2.W, KC0[3].Z, literal.z,
-; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38)
+; EG-NEXT: CNDE_INT T2.X, T5.W, PS, PV.W,
+; EG-NEXT: ASHR T1.Y, KC0[3].Z, literal.x,
+; EG-NEXT: CNDE_INT T0.Z, PV.Z, 0.0, PV.Y,
+; EG-NEXT: CNDE_INT T1.W, PV.Z, PV.X, T1.X,
+; EG-NEXT: ASHR * T2.W, KC0[4].X, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T3.X, KC0[3].Y, literal.x, T0.W,
-; EG-NEXT: XOR_INT T1.Y, PV.W, PS,
+; EG-NEXT: XOR_INT T2.Y, PV.W, PS,
; EG-NEXT: XOR_INT T0.Z, PV.Z, PS,
-; EG-NEXT: OR_INT T0.W, PV.Y, literal.y,
-; EG-NEXT: SUB_INT * T1.W, literal.z, PV.X,
-; EG-NEXT: 23(3.222986e-44), 8388608(1.175494e-38)
+; EG-NEXT: XOR_INT T1.W, PV.X, PV.Y,
+; EG-NEXT: XOR_INT * T3.W, T3.Y, PV.Y,
+; EG-NEXT: SUB_INT T3.Y, PS, T1.Y,
+; EG-NEXT: SUBB_UINT T1.Z, PV.W, T1.Y,
+; EG-NEXT: SUB_INT T3.W, PV.Z, T2.W,
+; EG-NEXT: SUBB_UINT * T4.W, PV.Y, T2.W,
+; EG-NEXT: SUB_INT T4.Y, PV.W, PS,
+; EG-NEXT: SUB_INT T0.Z, PV.Y, PV.Z,
+; EG-NEXT: BFE_UINT T3.W, KC0[3].Y, literal.x, T0.W,
+; EG-NEXT: AND_INT * T4.W, KC0[3].Y, literal.y,
+; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38)
+; EG-NEXT: SETGT_INT T0.X, 0.0, T0.X,
+; EG-NEXT: ADD_INT T3.Y, PV.W, literal.x,
+; EG-NEXT: OR_INT T1.Z, PS, literal.y,
+; EG-NEXT: BFE_UINT T0.W, KC0[3].W, literal.z, T0.W,
+; EG-NEXT: ADD_INT * T4.W, PV.W, literal.w,
+; EG-NEXT: -127(nan), 8388608(1.175494e-38)
+; EG-NEXT: 23(3.222986e-44), -150(nan)
+; EG-NEXT: AND_INT T1.X, KC0[3].W, literal.x,
+; EG-NEXT: ADD_INT T5.Y, PV.W, literal.y,
+; EG-NEXT: SUB_INT T2.Z, literal.z, T3.W,
+; EG-NEXT: NOT_INT T3.W, PS,
+; EG-NEXT: LSHR * T5.W, PV.Z, 1,
+; EG-NEXT: 8388607(1.175494e-38), -150(nan)
; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00)
-; EG-NEXT: AND_INT T4.X, KC0[3].Y, literal.x,
-; EG-NEXT: AND_INT T3.Y, PS, literal.y,
-; EG-NEXT: BIT_ALIGN_INT T2.Z, 0.0, PV.W, PS,
-; EG-NEXT: SUB_INT T1.W, PV.Z, T2.W,
-; EG-NEXT: SUBB_UINT * T3.W, PV.Y, T2.W,
-; EG-NEXT: 8388607(1.175494e-38), 32(4.484155e-44)
-; EG-NEXT: SUB_INT T5.X, PV.W, PS,
-; EG-NEXT: SETGT_INT T0.Y, 0.0, T0.Y,
-; EG-NEXT: CNDE_INT T0.Z, PV.Y, PV.Z, 0.0,
-; EG-NEXT: OR_INT T1.W, PV.X, literal.x,
-; EG-NEXT: ADD_INT * T3.W, T3.X, literal.y,
-; EG-NEXT: 8388608(1.175494e-38), -150(nan)
-; EG-NEXT: ADD_INT T4.X, T3.X, literal.x,
-; EG-NEXT: SUB_INT T3.Y, literal.y, T3.X,
-; EG-NEXT: AND_INT T2.Z, PS, literal.z,
-; EG-NEXT: NOT_INT T4.W, PS,
-; EG-NEXT: LSHR * T5.W, PV.W, 1,
-; EG-NEXT: -127(nan), 150(2.101948e-43)
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BIT_ALIGN_INT T3.X, 0.0, PS, PV.W,
-; EG-NEXT: LSHL T4.Y, T1.W, PV.Z,
-; EG-NEXT: AND_INT T2.Z, T3.W, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: BIT_ALIGN_INT T1.W, 0.0, T1.W, PV.Y, BS:VEC_021/SCL_122
-; EG-NEXT: AND_INT * T3.W, PV.Y, literal.x,
+; EG-NEXT: BIT_ALIGN_INT T2.X, 0.0, PS, PV.W,
+; EG-NEXT: AND_INT T6.Y, PV.Z, literal.x,
+; EG-NEXT: AND_INT T3.Z, PV.Y, literal.y,
+; EG-NEXT: OR_INT T3.W, PV.X, literal.z,
+; EG-NEXT: AND_INT * T5.W, T4.W, literal.y,
+; EG-NEXT: 32(4.484155e-44), 31(4.344025e-44)
+; EG-NEXT: 8388608(1.175494e-38), 0(0.000000e+00)
+; EG-NEXT: BIT_ALIGN_INT T1.X, 0.0, T1.Z, T2.Z,
+; EG-NEXT: LSHL T7.Y, T1.Z, PS,
+; EG-NEXT: AND_INT T1.Z, T4.W, literal.x,
+; EG-NEXT: LSHL T4.W, PV.W, PV.Z,
+; EG-NEXT: AND_INT * T5.W, T5.Y, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: ADD_INT T6.X, T1.X, literal.x,
-; EG-NEXT: CNDE_INT T3.Y, PS, PV.W, 0.0,
-; EG-NEXT: CNDE_INT * T3.Z, PV.Z, PV.Y, 0.0,
-; EG-NEXT: -150(nan), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 108:
-; EG-NEXT: CNDE_INT T1.W, T2.Z, T3.X, T4.Y,
-; EG-NEXT: SETGT_INT * T3.W, T4.X, literal.x,
+; EG-NEXT: CNDE_INT T3.X, PS, PV.W, 0.0,
+; EG-NEXT: CNDE_INT T8.Y, PV.Z, PV.Y, 0.0,
+; EG-NEXT: CNDE_INT * T2.Z, T6.Y, PV.X, 0.0,
+; EG-NEXT: ALU clause starting at 106:
+; EG-NEXT: CNDE_INT T6.W, T1.Z, T2.X, T7.Y, BS:VEC_021/SCL_122
+; EG-NEXT: SETGT_INT * T7.W, T3.Y, literal.x,
; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT T3.X, PS, 0.0, PV.W,
-; EG-NEXT: CNDE_INT T3.Y, PS, T3.Y, T3.Z,
-; EG-NEXT: AND_INT T2.Z, T6.X, literal.x,
-; EG-NEXT: NOT_INT T1.W, T6.X,
-; EG-NEXT: LSHR * T3.W, T0.W, 1,
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: ASHR T7.X, KC0[3].Y, literal.x,
-; EG-NEXT: ADD_INT T4.Y, T1.X, literal.y,
-; EG-NEXT: BIT_ALIGN_INT T3.Z, 0.0, PS, PV.W,
-; EG-NEXT: LSHL T0.W, T0.W, PV.Z,
-; EG-NEXT: AND_INT * T1.W, T6.X, literal.z,
+; EG-NEXT: CNDE_INT T1.X, PS, 0.0, PV.W,
+; EG-NEXT: CNDE_INT T6.Y, PS, T2.Z, T8.Y,
+; EG-NEXT: SUB_INT T1.Z, literal.x, T0.W,
+; EG-NEXT: NOT_INT T6.W, T5.Y,
+; EG-NEXT: LSHR * T7.W, T3.W, 1,
+; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00)
+; EG-NEXT: ASHR T2.X, KC0[3].Y, literal.x,
+; EG-NEXT: ADD_INT T5.Y, T0.W, literal.y,
+; EG-NEXT: BIT_ALIGN_INT T2.Z, 0.0, PS, PV.W,
+; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, T3.W, PV.Z,
+; EG-NEXT: AND_INT * T3.W, PV.Z, literal.z,
; EG-NEXT: 31(4.344025e-44), -127(nan)
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT T1.X, PS, PV.W, 0.0,
-; EG-NEXT: CNDE_INT T5.Y, PS, PV.Z, PV.W,
-; EG-NEXT: SETGT_INT T2.Z, PV.Y, literal.x,
-; EG-NEXT: XOR_INT T0.W, T3.Y, PV.X,
-; EG-NEXT: XOR_INT * T1.W, T3.X, PV.X,
+; EG-NEXT: CNDE_INT T4.X, PS, PV.W, 0.0,
+; EG-NEXT: CNDE_INT T7.Y, T5.W, PV.Z, T4.W,
+; EG-NEXT: SETGT_INT T1.Z, PV.Y, literal.x,
+; EG-NEXT: XOR_INT T0.W, T6.Y, PV.X,
+; EG-NEXT: XOR_INT * T3.W, T1.X, PV.X,
; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: SUB_INT T3.X, PS, T7.X,
-; EG-NEXT: SUBB_UINT T3.Y, PV.W, T7.X,
-; EG-NEXT: CNDE_INT T3.Z, PV.Z, 0.0, PV.Y,
-; EG-NEXT: CNDE_INT T1.W, PV.Z, T0.Z, PV.X,
-; EG-NEXT: ASHR * T3.W, KC0[3].W, literal.x,
+; EG-NEXT: SUB_INT T1.X, PS, T2.X,
+; EG-NEXT: SUBB_UINT T6.Y, PV.W, T2.X,
+; EG-NEXT: CNDE_INT T2.Z, PV.Z, 0.0, PV.Y,
+; EG-NEXT: CNDE_INT T3.W, PV.Z, PV.X, T3.X,
+; EG-NEXT: ASHR * T4.W, KC0[3].W, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: XOR_INT T1.X, PV.W, PS,
-; EG-NEXT: XOR_INT T5.Y, PV.Z, PS,
-; EG-NEXT: SUB_INT T0.Z, PV.X, PV.Y,
-; EG-NEXT: SETGT_INT T1.W, 0.0, T4.X, BS:VEC_021/SCL_122
-; EG-NEXT: CNDE_INT * T6.W, T0.Y, T5.X, 0.0,
-; EG-NEXT: SETGT_INT T0.X, 0.0, T0.X,
+; EG-NEXT: XOR_INT T3.X, PV.W, PS,
+; EG-NEXT: XOR_INT T7.Y, PV.Z, PS,
+; EG-NEXT: SUB_INT T1.Z, PV.X, PV.Y,
+; EG-NEXT: SETGT_INT T3.W, 0.0, T3.Y,
+; EG-NEXT: CNDE_INT * T6.W, T0.X, T0.Z, 0.0,
+; EG-NEXT: SETGT_INT T1.X, 0.0, T0.Y,
; EG-NEXT: CNDE_INT T6.Y, PV.W, PV.Z, 0.0,
-; EG-NEXT: SUB_INT T0.Z, T1.Y, T2.W, BS:VEC_021/SCL_122
-; EG-NEXT: SUB_INT T2.W, PV.Y, T3.W,
-; EG-NEXT: SUBB_UINT * T4.W, PV.X, T3.W,
-; EG-NEXT: SUB_INT T3.X, PV.W, PS,
-; EG-NEXT: SETGT_INT T1.Y, 0.0, T4.Y,
-; EG-NEXT: CNDE_INT T6.Z, T0.Y, PV.Z, 0.0,
-; EG-NEXT: SUB_INT T0.W, T0.W, T7.X, BS:VEC_021/SCL_122
-; EG-NEXT: CNDE_INT * T4.W, PV.X, T2.X, 0.0,
-; EG-NEXT: CNDE_INT T6.X, T1.W, PV.W, 0.0,
-; EG-NEXT: CNDE_INT T4.Y, PV.Y, PV.X, 0.0,
-; EG-NEXT: SUB_INT T0.W, T1.Z, T2.Y,
-; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
+; EG-NEXT: SUB_INT T0.Z, T1.W, T1.Y, BS:VEC_021/SCL_122
+; EG-NEXT: SUB_INT T1.W, PV.Y, T4.W,
+; EG-NEXT: SUBB_UINT * T5.W, PV.X, T4.W,
+; EG-NEXT: SUB_INT T4.X, PV.W, PS,
+; EG-NEXT: SETGT_INT T0.Y, 0.0, T5.Y, BS:VEC_021/SCL_122
+; EG-NEXT: CNDE_INT T6.Z, T0.X, PV.Z, 0.0,
+; EG-NEXT: SUB_INT T0.W, T0.W, T2.X,
+; EG-NEXT: CNDE_INT * T1.W, PV.X, T4.Y, 0.0,
+; EG-NEXT: CNDE_INT T6.X, T3.W, PV.W, 0.0,
+; EG-NEXT: CNDE_INT T1.Y, PV.Y, PV.X, 0.0,
+; EG-NEXT: SUB_INT T0.W, T2.Y, T2.W,
+; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT T4.Z, T0.X, PV.W, 0.0,
-; EG-NEXT: SUB_INT * T0.W, T1.X, T3.W, BS:VEC_120/SCL_212
-; EG-NEXT: CNDE_INT T4.X, T1.Y, PV.W, 0.0,
+; EG-NEXT: CNDE_INT T1.Z, T1.X, PV.W, 0.0,
+; EG-NEXT: SUB_INT * T0.W, T3.X, T4.W, BS:VEC_120/SCL_212
+; EG-NEXT: CNDE_INT T1.X, T0.Y, PV.W, 0.0,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR * T0.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T2.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%conv = fptosi <4 x float> %x to <4 x i64>
store <4 x i64> %conv, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
index 5170f9c..5abf82a 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
@@ -200,25 +200,25 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i64(ptr addrspace(1) %out, float %x
; EG-NEXT: ADD_INT * T2.W, PV.W, literal.y,
; EG-NEXT: 8388608(1.175494e-38), -150(nan)
; EG-NEXT: ADD_INT T0.X, T0.W, literal.x,
-; EG-NEXT: SUB_INT T0.Y, literal.y, T0.W,
-; EG-NEXT: AND_INT T0.Z, PS, literal.z,
+; EG-NEXT: AND_INT T0.Y, PS, literal.y,
+; EG-NEXT: SUB_INT T0.Z, literal.z, T0.W,
; EG-NEXT: NOT_INT T0.W, PS,
; EG-NEXT: LSHR * T3.W, PV.W, 1,
-; EG-NEXT: -127(nan), 150(2.101948e-43)
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: -127(nan), 31(4.344025e-44)
+; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00)
; EG-NEXT: BIT_ALIGN_INT T1.X, 0.0, PS, PV.W,
-; EG-NEXT: LSHL T1.Y, T1.W, PV.Z,
-; EG-NEXT: AND_INT T0.Z, T2.W, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, T1.W, PV.Y, BS:VEC_021/SCL_122
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT T1.Y, PV.Z, literal.x,
+; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, T1.W, PV.Z,
+; EG-NEXT: LSHL T0.W, T1.W, PV.Y,
+; EG-NEXT: AND_INT * T1.W, T2.W, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.Y, PS, PV.W, 0.0,
-; EG-NEXT: CNDE_INT T1.Z, PV.Z, PV.Y, 0.0,
-; EG-NEXT: CNDE_INT T0.W, PV.Z, PV.X, PV.Y,
+; EG-NEXT: CNDE_INT T0.Z, PV.Y, PV.Z, 0.0,
+; EG-NEXT: CNDE_INT T0.W, PS, PV.X, PV.W,
; EG-NEXT: SETGT_INT * T1.W, T0.X, literal.x,
; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT T0.Z, PS, 0.0, PV.W,
-; EG-NEXT: CNDE_INT T0.W, PS, PV.Y, PV.Z,
+; EG-NEXT: CNDE_INT T1.Z, PS, 0.0, PV.W,
+; EG-NEXT: CNDE_INT T0.W, PS, PV.Z, PV.Y,
; EG-NEXT: ASHR * T1.W, KC0[2].Z, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: XOR_INT T0.W, PV.W, PS,
@@ -288,79 +288,78 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x
;
; EG-LABEL: fp_to_uint_v2f32_to_v2i64:
; EG: ; %bb.0:
-; EG-NEXT: ALU 75, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 74, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT * T1.W, KC0[2].W, literal.x, PV.W,
-; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T0.Z, KC0[2].W, literal.x,
-; EG-NEXT: BFE_UINT T0.W, KC0[3].X, literal.y, T0.W,
-; EG-NEXT: ADD_INT * T2.W, PV.W, literal.z,
-; EG-NEXT: 8388607(1.175494e-38), 23(3.222986e-44)
+; EG-NEXT: BFE_UINT T0.Z, KC0[3].X, literal.x, PV.W,
+; EG-NEXT: BFE_UINT T0.W, KC0[2].W, literal.x, PV.W,
+; EG-NEXT: AND_INT * T1.Z, KC0[2].W, literal.y,
+; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38)
+; EG-NEXT: ADD_INT T1.W, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T2.W, PV.Z, literal.x,
; EG-NEXT: -150(nan), 0(0.000000e+00)
-; EG-NEXT: SUB_INT T0.X, literal.x, PV.W,
-; EG-NEXT: SUB_INT T0.Y, literal.x, T1.W,
-; EG-NEXT: AND_INT T1.Z, PS, literal.y,
-; EG-NEXT: OR_INT T3.W, PV.Z, literal.z,
+; EG-NEXT: AND_INT T0.X, PS, literal.x,
+; EG-NEXT: AND_INT T0.Y, PV.W, literal.x,
+; EG-NEXT: OR_INT T1.Z, T1.Z, literal.y,
+; EG-NEXT: SUB_INT T3.W, literal.z, T0.W,
; EG-NEXT: AND_INT * T4.W, KC0[3].X, literal.w,
-; EG-NEXT: 150(2.101948e-43), 31(4.344025e-44)
-; EG-NEXT: 8388608(1.175494e-38), 8388607(1.175494e-38)
+; EG-NEXT: 31(4.344025e-44), 8388608(1.175494e-38)
+; EG-NEXT: 150(2.101948e-43), 8388607(1.175494e-38)
; EG-NEXT: OR_INT T1.X, PS, literal.x,
-; EG-NEXT: LSHL T1.Y, PV.W, PV.Z,
-; EG-NEXT: AND_INT T0.Z, T2.W, literal.y,
-; EG-NEXT: BIT_ALIGN_INT T4.W, 0.0, PV.W, PV.Y,
-; EG-NEXT: AND_INT * T5.W, PV.Y, literal.y,
+; EG-NEXT: AND_INT T1.Y, PV.W, literal.y,
+; EG-NEXT: BIT_ALIGN_INT T2.Z, 0.0, PV.Z, PV.W,
+; EG-NEXT: LSHL T3.W, PV.Z, PV.Y,
+; EG-NEXT: AND_INT * T4.W, T1.W, literal.y,
; EG-NEXT: 8388608(1.175494e-38), 32(4.484155e-44)
-; EG-NEXT: CNDE_INT T2.X, PS, PV.W, 0.0,
-; EG-NEXT: CNDE_INT T0.Y, PV.Z, PV.Y, 0.0,
-; EG-NEXT: ADD_INT T1.Z, T0.W, literal.x,
-; EG-NEXT: BIT_ALIGN_INT T4.W, 0.0, PV.X, T0.X,
-; EG-NEXT: AND_INT * T5.W, T0.X, literal.y,
-; EG-NEXT: -150(nan), 32(4.484155e-44)
+; EG-NEXT: CNDE_INT T0.Y, PS, PV.W, 0.0,
+; EG-NEXT: CNDE_INT T2.Z, PV.Y, PV.Z, 0.0,
+; EG-NEXT: LSHL T5.W, PV.X, T0.X,
+; EG-NEXT: AND_INT * T6.W, T2.W, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, PS, PV.W, 0.0,
-; EG-NEXT: NOT_INT T2.Y, T2.W,
-; EG-NEXT: AND_INT T2.Z, PV.Z, literal.x,
-; EG-NEXT: NOT_INT T2.W, PV.Z,
-; EG-NEXT: LSHR * T4.W, T1.X, 1,
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T3.X, T3.W, 1,
-; EG-NEXT: ADD_INT T3.Y, T0.W, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: BIT_ALIGN_INT T3.Z, 0.0, PS, PV.W,
-; EG-NEXT: LSHL T0.W, T1.X, PV.Z,
-; EG-NEXT: AND_INT * T2.W, T1.Z, literal.y,
+; EG-NEXT: NOT_INT T1.Y, T1.W,
+; EG-NEXT: SUB_INT T3.Z, literal.x, T0.Z,
+; EG-NEXT: NOT_INT T1.W, T2.W, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR * T2.W, T1.X, 1,
+; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00)
+; EG-NEXT: LSHR T2.X, T1.Z, 1,
+; EG-NEXT: ADD_INT T2.Y, T0.Z, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, PS, PV.W,
+; EG-NEXT: BIT_ALIGN_INT T1.W, 0.0, T1.X, PV.Z,
+; EG-NEXT: AND_INT * T2.W, PV.Z, literal.y,
; EG-NEXT: -127(nan), 32(4.484155e-44)
; EG-NEXT: CNDE_INT T1.X, PS, PV.W, 0.0,
-; EG-NEXT: CNDE_INT T4.Y, PS, PV.Z, PV.W,
-; EG-NEXT: SETGT_INT T1.Z, PV.Y, literal.x,
-; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, PV.X, T2.Y,
-; EG-NEXT: ADD_INT * T1.W, T1.W, literal.y,
+; EG-NEXT: CNDE_INT T3.Y, T6.W, PV.Z, T5.W, BS:VEC_021/SCL_122
+; EG-NEXT: SETGT_INT T0.Z, PV.Y, literal.x,
+; EG-NEXT: BIT_ALIGN_INT T1.W, 0.0, PV.X, T1.Y,
+; EG-NEXT: ADD_INT * T0.W, T0.W, literal.y,
; EG-NEXT: 23(3.222986e-44), -127(nan)
-; EG-NEXT: CNDE_INT T3.X, T0.Z, PV.W, T1.Y,
+; EG-NEXT: CNDE_INT T2.X, T4.W, PV.W, T3.W,
; EG-NEXT: SETGT_INT T1.Y, PS, literal.x,
-; EG-NEXT: CNDE_INT T0.Z, PV.Z, 0.0, PV.Y,
-; EG-NEXT: CNDE_INT T0.W, PV.Z, T0.X, PV.X,
+; EG-NEXT: CNDE_INT T1.Z, PV.Z, 0.0, PV.Y,
+; EG-NEXT: CNDE_INT T1.W, PV.Z, PV.X, T0.X,
; EG-NEXT: ASHR * T2.W, KC0[3].X, literal.y,
; EG-NEXT: 23(3.222986e-44), 31(4.344025e-44)
; EG-NEXT: XOR_INT T0.X, PV.W, PS,
-; EG-NEXT: XOR_INT T2.Y, PV.Z, PS,
+; EG-NEXT: XOR_INT T3.Y, PV.Z, PS,
; EG-NEXT: CNDE_INT T0.Z, PV.Y, 0.0, PV.X,
-; EG-NEXT: CNDE_INT T0.W, PV.Y, T2.X, T0.Y,
+; EG-NEXT: CNDE_INT T1.W, PV.Y, T2.Z, T0.Y,
; EG-NEXT: ASHR * T3.W, KC0[2].W, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: XOR_INT T0.Y, PV.W, PS,
; EG-NEXT: XOR_INT T0.Z, PV.Z, PS,
-; EG-NEXT: SUB_INT T0.W, PV.Y, T2.W,
+; EG-NEXT: SUB_INT T1.W, PV.Y, T2.W,
; EG-NEXT: SUBB_UINT * T4.W, PV.X, T2.W,
; EG-NEXT: SUB_INT T1.Y, PV.W, PS,
-; EG-NEXT: SETGT_INT T1.Z, 0.0, T3.Y,
-; EG-NEXT: SUB_INT T0.W, PV.Z, T3.W,
+; EG-NEXT: SETGT_INT T1.Z, 0.0, T2.Y,
+; EG-NEXT: SUB_INT T1.W, PV.Z, T3.W,
; EG-NEXT: SUBB_UINT * T4.W, PV.Y, T3.W,
; EG-NEXT: SUB_INT T0.Z, PV.W, PS,
-; EG-NEXT: SETGT_INT T0.W, 0.0, T1.W,
+; EG-NEXT: SETGT_INT T0.W, 0.0, T0.W,
; EG-NEXT: CNDE_INT * T1.W, PV.Z, PV.Y, 0.0,
; EG-NEXT: CNDE_INT T1.Y, PV.W, PV.Z, 0.0,
; EG-NEXT: SUB_INT * T2.W, T0.X, T2.W,
@@ -449,170 +448,168 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(ptr addrspace(1) %out, <4 x
;
; EG-LABEL: fp_to_uint_v4f32_to_v4i64:
; EG: ; %bb.0:
-; EG-NEXT: ALU 101, @6, KC0[CB0:0-32], KC1[]
-; EG-NEXT: ALU 54, @108, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T2.X, 1
+; EG-NEXT: ALU 99, @6, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 54, @106, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 6:
; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T1.W, KC0[4].X, literal.x, PV.W,
-; EG-NEXT: AND_INT * T2.W, KC0[4].X, literal.y,
+; EG-NEXT: BFE_UINT T1.W, KC0[3].Z, literal.x, PV.W,
+; EG-NEXT: AND_INT * T2.W, KC0[3].Z, literal.y,
; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38)
-; EG-NEXT: OR_INT T0.Z, PS, literal.x,
-; EG-NEXT: BFE_UINT T2.W, KC0[3].Z, literal.y, T0.W,
-; EG-NEXT: ADD_INT * T3.W, PV.W, literal.z,
-; EG-NEXT: 8388608(1.175494e-38), 23(3.222986e-44)
-; EG-NEXT: -150(nan), 0(0.000000e+00)
-; EG-NEXT: ADD_INT T0.Y, PV.W, literal.x,
-; EG-NEXT: AND_INT T1.Z, PS, literal.y,
-; EG-NEXT: NOT_INT T4.W, PS,
-; EG-NEXT: LSHR * T5.W, PV.Z, 1,
-; EG-NEXT: -127(nan), 31(4.344025e-44)
+; EG-NEXT: OR_INT T2.W, PS, literal.x,
+; EG-NEXT: ADD_INT * T3.W, PV.W, literal.y,
+; EG-NEXT: 8388608(1.175494e-38), -150(nan)
; EG-NEXT: ADD_INT T0.X, T1.W, literal.x,
-; EG-NEXT: BIT_ALIGN_INT T1.Y, 0.0, PS, PV.W,
-; EG-NEXT: AND_INT T2.Z, T3.W, literal.y, BS:VEC_201
-; EG-NEXT: LSHL T3.W, T0.Z, PV.Z,
-; EG-NEXT: SUB_INT * T1.W, literal.z, T1.W,
-; EG-NEXT: -127(nan), 32(4.484155e-44)
-; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.X, PS, literal.x,
-; EG-NEXT: BIT_ALIGN_INT T2.Y, 0.0, T0.Z, PS,
-; EG-NEXT: AND_INT T0.Z, KC0[3].Z, literal.y,
-; EG-NEXT: CNDE_INT T1.W, PV.Z, PV.Y, PV.W,
-; EG-NEXT: SETGT_INT * T4.W, PV.X, literal.z,
+; EG-NEXT: BFE_UINT T0.Y, KC0[4].X, literal.y, T0.W,
+; EG-NEXT: AND_INT T0.Z, PS, literal.z,
+; EG-NEXT: NOT_INT T4.W, PS,
+; EG-NEXT: LSHR * T5.W, PV.W, 1,
+; EG-NEXT: -127(nan), 23(3.222986e-44)
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: BIT_ALIGN_INT T1.X, 0.0, PS, PV.W,
+; EG-NEXT: AND_INT T1.Y, T3.W, literal.x,
+; EG-NEXT: LSHL T0.Z, T2.W, PV.Z, BS:VEC_120/SCL_212
+; EG-NEXT: AND_INT T3.W, KC0[4].X, literal.y,
+; EG-NEXT: ADD_INT * T4.W, PV.Y, literal.z,
; EG-NEXT: 32(4.484155e-44), 8388607(1.175494e-38)
+; EG-NEXT: -150(nan), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.Y, PS, literal.x,
+; EG-NEXT: OR_INT T1.Z, PV.W, literal.y,
+; EG-NEXT: CNDE_INT T3.W, PV.Y, PV.X, PV.Z,
+; EG-NEXT: SETGT_INT * T5.W, T0.X, literal.z,
+; EG-NEXT: 31(4.344025e-44), 8388608(1.175494e-38)
; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT T2.X, PS, 0.0, PV.W,
-; EG-NEXT: OR_INT T1.Y, PV.Z, literal.x,
-; EG-NEXT: ADD_INT T0.Z, T2.W, literal.y,
-; EG-NEXT: CNDE_INT T1.W, PV.X, PV.Y, 0.0,
-; EG-NEXT: CNDE_INT * T3.W, T2.Z, T3.W, 0.0,
-; EG-NEXT: 8388608(1.175494e-38), -150(nan)
-; EG-NEXT: CNDE_INT T1.X, T4.W, PV.W, PS,
-; EG-NEXT: ASHR T2.Y, KC0[4].X, literal.x,
-; EG-NEXT: AND_INT T1.Z, PV.Z, literal.x,
-; EG-NEXT: NOT_INT T1.W, PV.Z,
-; EG-NEXT: LSHR * T3.W, PV.Y, 1,
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BIT_ALIGN_INT T3.X, 0.0, PS, PV.W,
-; EG-NEXT: LSHL T3.Y, T1.Y, PV.Z,
-; EG-NEXT: XOR_INT T1.Z, PV.X, PV.Y,
-; EG-NEXT: XOR_INT T1.W, T2.X, PV.Y,
-; EG-NEXT: SUB_INT * T2.W, literal.x, T2.W,
-; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.X, T0.Z, literal.x,
-; EG-NEXT: AND_INT T4.Y, PS, literal.x,
-; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, T1.Y, PS, BS:VEC_021/SCL_122
-; EG-NEXT: SUB_INT T1.W, PV.W, T2.Y,
-; EG-NEXT: SUBB_UINT * T2.W, PV.Z, T2.Y,
-; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: SUB_INT T2.X, PV.W, PS,
-; EG-NEXT: CNDE_INT T1.Y, PV.Y, PV.Z, 0.0,
-; EG-NEXT: CNDE_INT T0.Z, PV.X, T3.Y, 0.0,
-; EG-NEXT: CNDE_INT T1.W, PV.X, T3.X, T3.Y, BS:VEC_021/SCL_122
-; EG-NEXT: SETGT_INT * T2.W, T0.Y, literal.x,
+; EG-NEXT: CNDE_INT T3.Y, PS, 0.0, PV.W,
+; EG-NEXT: SUB_INT T2.Z, literal.x, T1.W,
+; EG-NEXT: LSHL T1.W, PV.Z, PV.Y,
+; EG-NEXT: AND_INT * T3.W, T4.W, literal.y,
+; EG-NEXT: 150(2.101948e-43), 32(4.484155e-44)
+; EG-NEXT: CNDE_INT T1.X, PS, PV.W, 0.0,
+; EG-NEXT: AND_INT T2.Y, PV.Z, literal.x,
+; EG-NEXT: SUB_INT T3.Z, literal.y, T0.Y,
+; EG-NEXT: NOT_INT T4.W, T4.W,
+; EG-NEXT: LSHR * T6.W, T1.Z, 1,
+; EG-NEXT: 32(4.484155e-44), 150(2.101948e-43)
+; EG-NEXT: BIT_ALIGN_INT T2.X, 0.0, T2.W, T2.Z,
+; EG-NEXT: ADD_INT T0.Y, T0.Y, literal.x,
+; EG-NEXT: BIT_ALIGN_INT T2.Z, 0.0, PS, PV.W,
+; EG-NEXT: BIT_ALIGN_INT T2.W, 0.0, T1.Z, PV.Z,
+; EG-NEXT: AND_INT * T4.W, PV.Z, literal.y,
+; EG-NEXT: -127(nan), 32(4.484155e-44)
+; EG-NEXT: CNDE_INT T3.X, PS, PV.W, 0.0,
+; EG-NEXT: CNDE_INT T4.Y, T3.W, PV.Z, T1.W,
+; EG-NEXT: SETGT_INT T1.Z, PV.Y, literal.x,
+; EG-NEXT: CNDE_INT T1.W, T1.Y, T0.Z, 0.0,
+; EG-NEXT: CNDE_INT * T2.W, T2.Y, PV.X, 0.0,
; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T1.X, KC0[3].W, literal.x, T0.W,
-; EG-NEXT: AND_INT T3.Y, KC0[3].W, literal.y,
-; EG-NEXT: CNDE_INT T2.Z, PS, 0.0, PV.W,
-; EG-NEXT: CNDE_INT T1.W, PS, PV.Y, PV.Z,
-; EG-NEXT: ASHR * T2.W, KC0[3].Z, literal.z,
-; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38)
+; EG-NEXT: CNDE_INT T2.X, T5.W, PS, PV.W,
+; EG-NEXT: ASHR T1.Y, KC0[3].Z, literal.x,
+; EG-NEXT: CNDE_INT T0.Z, PV.Z, 0.0, PV.Y,
+; EG-NEXT: CNDE_INT T1.W, PV.Z, PV.X, T1.X,
+; EG-NEXT: ASHR * T2.W, KC0[4].X, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T3.X, KC0[3].Y, literal.x, T0.W,
-; EG-NEXT: XOR_INT T1.Y, PV.W, PS,
+; EG-NEXT: XOR_INT T2.Y, PV.W, PS,
; EG-NEXT: XOR_INT T0.Z, PV.Z, PS,
-; EG-NEXT: OR_INT T0.W, PV.Y, literal.y,
-; EG-NEXT: SUB_INT * T1.W, literal.z, PV.X,
-; EG-NEXT: 23(3.222986e-44), 8388608(1.175494e-38)
+; EG-NEXT: XOR_INT T1.W, PV.X, PV.Y,
+; EG-NEXT: XOR_INT * T3.W, T3.Y, PV.Y,
+; EG-NEXT: SUB_INT T3.Y, PS, T1.Y,
+; EG-NEXT: SUBB_UINT T1.Z, PV.W, T1.Y,
+; EG-NEXT: SUB_INT T3.W, PV.Z, T2.W,
+; EG-NEXT: SUBB_UINT * T4.W, PV.Y, T2.W,
+; EG-NEXT: SUB_INT T4.Y, PV.W, PS,
+; EG-NEXT: SUB_INT T0.Z, PV.Y, PV.Z,
+; EG-NEXT: BFE_UINT T3.W, KC0[3].Y, literal.x, T0.W,
+; EG-NEXT: AND_INT * T4.W, KC0[3].Y, literal.y,
+; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38)
+; EG-NEXT: SETGT_INT T0.X, 0.0, T0.X,
+; EG-NEXT: ADD_INT T3.Y, PV.W, literal.x,
+; EG-NEXT: OR_INT T1.Z, PS, literal.y,
+; EG-NEXT: BFE_UINT T0.W, KC0[3].W, literal.z, T0.W,
+; EG-NEXT: ADD_INT * T4.W, PV.W, literal.w,
+; EG-NEXT: -127(nan), 8388608(1.175494e-38)
+; EG-NEXT: 23(3.222986e-44), -150(nan)
+; EG-NEXT: AND_INT T1.X, KC0[3].W, literal.x,
+; EG-NEXT: ADD_INT T5.Y, PV.W, literal.y,
+; EG-NEXT: SUB_INT T2.Z, literal.z, T3.W,
+; EG-NEXT: NOT_INT T3.W, PS,
+; EG-NEXT: LSHR * T5.W, PV.Z, 1,
+; EG-NEXT: 8388607(1.175494e-38), -150(nan)
; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00)
-; EG-NEXT: AND_INT T4.X, KC0[3].Y, literal.x,
-; EG-NEXT: AND_INT T3.Y, PS, literal.y,
-; EG-NEXT: BIT_ALIGN_INT T2.Z, 0.0, PV.W, PS,
-; EG-NEXT: SUB_INT T1.W, PV.Z, T2.W,
-; EG-NEXT: SUBB_UINT * T3.W, PV.Y, T2.W,
-; EG-NEXT: 8388607(1.175494e-38), 32(4.484155e-44)
-; EG-NEXT: SUB_INT T5.X, PV.W, PS,
-; EG-NEXT: SETGT_INT T0.Y, 0.0, T0.Y,
-; EG-NEXT: CNDE_INT T0.Z, PV.Y, PV.Z, 0.0,
-; EG-NEXT: OR_INT T1.W, PV.X, literal.x,
-; EG-NEXT: ADD_INT * T3.W, T3.X, literal.y,
-; EG-NEXT: 8388608(1.175494e-38), -150(nan)
-; EG-NEXT: ADD_INT T4.X, T3.X, literal.x,
-; EG-NEXT: SUB_INT T3.Y, literal.y, T3.X,
-; EG-NEXT: AND_INT T2.Z, PS, literal.z,
-; EG-NEXT: NOT_INT T4.W, PS,
-; EG-NEXT: LSHR * T5.W, PV.W, 1,
-; EG-NEXT: -127(nan), 150(2.101948e-43)
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BIT_ALIGN_INT T3.X, 0.0, PS, PV.W,
-; EG-NEXT: LSHL T4.Y, T1.W, PV.Z,
-; EG-NEXT: AND_INT T2.Z, T3.W, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: BIT_ALIGN_INT T1.W, 0.0, T1.W, PV.Y, BS:VEC_021/SCL_122
-; EG-NEXT: AND_INT * T3.W, PV.Y, literal.x,
+; EG-NEXT: BIT_ALIGN_INT T2.X, 0.0, PS, PV.W,
+; EG-NEXT: AND_INT T6.Y, PV.Z, literal.x,
+; EG-NEXT: AND_INT T3.Z, PV.Y, literal.y,
+; EG-NEXT: OR_INT T3.W, PV.X, literal.z,
+; EG-NEXT: AND_INT * T5.W, T4.W, literal.y,
+; EG-NEXT: 32(4.484155e-44), 31(4.344025e-44)
+; EG-NEXT: 8388608(1.175494e-38), 0(0.000000e+00)
+; EG-NEXT: BIT_ALIGN_INT T1.X, 0.0, T1.Z, T2.Z,
+; EG-NEXT: LSHL T7.Y, T1.Z, PS,
+; EG-NEXT: AND_INT T1.Z, T4.W, literal.x,
+; EG-NEXT: LSHL T4.W, PV.W, PV.Z,
+; EG-NEXT: AND_INT * T5.W, T5.Y, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: ADD_INT T6.X, T1.X, literal.x,
-; EG-NEXT: CNDE_INT T3.Y, PS, PV.W, 0.0,
-; EG-NEXT: CNDE_INT * T3.Z, PV.Z, PV.Y, 0.0,
-; EG-NEXT: -150(nan), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 108:
-; EG-NEXT: CNDE_INT T1.W, T2.Z, T3.X, T4.Y,
-; EG-NEXT: SETGT_INT * T3.W, T4.X, literal.x,
+; EG-NEXT: CNDE_INT T3.X, PS, PV.W, 0.0,
+; EG-NEXT: CNDE_INT T8.Y, PV.Z, PV.Y, 0.0,
+; EG-NEXT: CNDE_INT * T2.Z, T6.Y, PV.X, 0.0,
+; EG-NEXT: ALU clause starting at 106:
+; EG-NEXT: CNDE_INT T6.W, T1.Z, T2.X, T7.Y, BS:VEC_021/SCL_122
+; EG-NEXT: SETGT_INT * T7.W, T3.Y, literal.x,
; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT T3.X, PS, 0.0, PV.W,
-; EG-NEXT: CNDE_INT T3.Y, PS, T3.Y, T3.Z,
-; EG-NEXT: AND_INT T2.Z, T6.X, literal.x,
-; EG-NEXT: NOT_INT T1.W, T6.X,
-; EG-NEXT: LSHR * T3.W, T0.W, 1,
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: ASHR T7.X, KC0[3].Y, literal.x,
-; EG-NEXT: ADD_INT T4.Y, T1.X, literal.y,
-; EG-NEXT: BIT_ALIGN_INT T3.Z, 0.0, PS, PV.W,
-; EG-NEXT: LSHL T0.W, T0.W, PV.Z,
-; EG-NEXT: AND_INT * T1.W, T6.X, literal.z,
+; EG-NEXT: CNDE_INT T1.X, PS, 0.0, PV.W,
+; EG-NEXT: CNDE_INT T6.Y, PS, T2.Z, T8.Y,
+; EG-NEXT: SUB_INT T1.Z, literal.x, T0.W,
+; EG-NEXT: NOT_INT T6.W, T5.Y,
+; EG-NEXT: LSHR * T7.W, T3.W, 1,
+; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00)
+; EG-NEXT: ASHR T2.X, KC0[3].Y, literal.x,
+; EG-NEXT: ADD_INT T5.Y, T0.W, literal.y,
+; EG-NEXT: BIT_ALIGN_INT T2.Z, 0.0, PS, PV.W,
+; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, T3.W, PV.Z,
+; EG-NEXT: AND_INT * T3.W, PV.Z, literal.z,
; EG-NEXT: 31(4.344025e-44), -127(nan)
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT T1.X, PS, PV.W, 0.0,
-; EG-NEXT: CNDE_INT T5.Y, PS, PV.Z, PV.W,
-; EG-NEXT: SETGT_INT T2.Z, PV.Y, literal.x,
-; EG-NEXT: XOR_INT T0.W, T3.Y, PV.X,
-; EG-NEXT: XOR_INT * T1.W, T3.X, PV.X,
+; EG-NEXT: CNDE_INT T4.X, PS, PV.W, 0.0,
+; EG-NEXT: CNDE_INT T7.Y, T5.W, PV.Z, T4.W,
+; EG-NEXT: SETGT_INT T1.Z, PV.Y, literal.x,
+; EG-NEXT: XOR_INT T0.W, T6.Y, PV.X,
+; EG-NEXT: XOR_INT * T3.W, T1.X, PV.X,
; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: SUB_INT T3.X, PS, T7.X,
-; EG-NEXT: SUBB_UINT T3.Y, PV.W, T7.X,
-; EG-NEXT: CNDE_INT T3.Z, PV.Z, 0.0, PV.Y,
-; EG-NEXT: CNDE_INT T1.W, PV.Z, T0.Z, PV.X,
-; EG-NEXT: ASHR * T3.W, KC0[3].W, literal.x,
+; EG-NEXT: SUB_INT T1.X, PS, T2.X,
+; EG-NEXT: SUBB_UINT T6.Y, PV.W, T2.X,
+; EG-NEXT: CNDE_INT T2.Z, PV.Z, 0.0, PV.Y,
+; EG-NEXT: CNDE_INT T3.W, PV.Z, PV.X, T3.X,
+; EG-NEXT: ASHR * T4.W, KC0[3].W, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: XOR_INT T1.X, PV.W, PS,
-; EG-NEXT: XOR_INT T5.Y, PV.Z, PS,
-; EG-NEXT: SUB_INT T0.Z, PV.X, PV.Y,
-; EG-NEXT: SETGT_INT T1.W, 0.0, T4.X, BS:VEC_021/SCL_122
-; EG-NEXT: CNDE_INT * T6.W, T0.Y, T5.X, 0.0,
-; EG-NEXT: SETGT_INT T0.X, 0.0, T0.X,
+; EG-NEXT: XOR_INT T3.X, PV.W, PS,
+; EG-NEXT: XOR_INT T7.Y, PV.Z, PS,
+; EG-NEXT: SUB_INT T1.Z, PV.X, PV.Y,
+; EG-NEXT: SETGT_INT T3.W, 0.0, T3.Y,
+; EG-NEXT: CNDE_INT * T6.W, T0.X, T0.Z, 0.0,
+; EG-NEXT: SETGT_INT T1.X, 0.0, T0.Y,
; EG-NEXT: CNDE_INT T6.Y, PV.W, PV.Z, 0.0,
-; EG-NEXT: SUB_INT T0.Z, T1.Y, T2.W, BS:VEC_021/SCL_122
-; EG-NEXT: SUB_INT T2.W, PV.Y, T3.W,
-; EG-NEXT: SUBB_UINT * T4.W, PV.X, T3.W,
-; EG-NEXT: SUB_INT T3.X, PV.W, PS,
-; EG-NEXT: SETGT_INT T1.Y, 0.0, T4.Y,
-; EG-NEXT: CNDE_INT T6.Z, T0.Y, PV.Z, 0.0,
-; EG-NEXT: SUB_INT T0.W, T0.W, T7.X, BS:VEC_021/SCL_122
-; EG-NEXT: CNDE_INT * T4.W, PV.X, T2.X, 0.0,
-; EG-NEXT: CNDE_INT T6.X, T1.W, PV.W, 0.0,
-; EG-NEXT: CNDE_INT T4.Y, PV.Y, PV.X, 0.0,
-; EG-NEXT: SUB_INT T0.W, T1.Z, T2.Y,
-; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
+; EG-NEXT: SUB_INT T0.Z, T1.W, T1.Y, BS:VEC_021/SCL_122
+; EG-NEXT: SUB_INT T1.W, PV.Y, T4.W,
+; EG-NEXT: SUBB_UINT * T5.W, PV.X, T4.W,
+; EG-NEXT: SUB_INT T4.X, PV.W, PS,
+; EG-NEXT: SETGT_INT T0.Y, 0.0, T5.Y, BS:VEC_021/SCL_122
+; EG-NEXT: CNDE_INT T6.Z, T0.X, PV.Z, 0.0,
+; EG-NEXT: SUB_INT T0.W, T0.W, T2.X,
+; EG-NEXT: CNDE_INT * T1.W, PV.X, T4.Y, 0.0,
+; EG-NEXT: CNDE_INT T6.X, T3.W, PV.W, 0.0,
+; EG-NEXT: CNDE_INT T1.Y, PV.Y, PV.X, 0.0,
+; EG-NEXT: SUB_INT T0.W, T2.Y, T2.W,
+; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT T4.Z, T0.X, PV.W, 0.0,
-; EG-NEXT: SUB_INT * T0.W, T1.X, T3.W, BS:VEC_120/SCL_212
-; EG-NEXT: CNDE_INT T4.X, T1.Y, PV.W, 0.0,
+; EG-NEXT: CNDE_INT T1.Z, T1.X, PV.W, 0.0,
+; EG-NEXT: SUB_INT * T0.W, T3.X, T4.W, BS:VEC_120/SCL_212
+; EG-NEXT: CNDE_INT T1.X, T0.Y, PV.W, 0.0,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR * T0.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T2.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%conv = fptoui <4 x float> %x to <4 x i64>
store <4 x i64> %conv, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/kernel_code_t_recurse.ll b/llvm/test/CodeGen/AMDGPU/kernel_code_t_recurse.ll
new file mode 100644
index 0000000..cdd6e88
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/kernel_code_t_recurse.ll
@@ -0,0 +1,24 @@
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d < %s | FileCheck %s
+
+; CHECK-LABEL: non_kernel_recursion:
+define void @non_kernel_recursion(i32 %val) #2 {
+ %cmp = icmp eq i32 %val, 0
+ br i1 %cmp, label %ret, label %call
+
+call:
+ %val.sub1 = sub i32 %val, 1
+ call void @non_kernel_recursion(i32 %val.sub1)
+ br label %ret
+
+ret:
+ ret void
+}
+
+; CHECK-LABEL: kernel_caller_recursion:
+; CHECK: .amd_kernel_code_t
+; CHECK: is_dynamic_callstack = 1
+; CHECK: .end_amd_kernel_code_t
+define amdgpu_kernel void @kernel_caller_recursion(i32 %n) #0 {
+ call void @non_kernel_recursion(i32 %n)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll
new file mode 100644
index 0000000..4927c2f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefix=SDAG
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefix=GFX9-GISEL
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefix=SDAG
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefix=GFX10-GISEL
+
+define amdgpu_ps void @test(ptr addrspace(1) inreg %ptr) {
+; SDAG-LABEL: test:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s2, src_pops_exiting_wave_id
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mov_b32_e32 v1, s2
+; SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-NEXT: s_endpgm
+;
+; GFX9-GISEL-LABEL: test:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_mov_b32 s2, src_pops_exiting_wave_id
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: test:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_mov_b32 s2, src_pops_exiting_wave_id
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT: s_endpgm
+ %id = call i32 @llvm.amdgcn.pops.exiting.wave.id()
+ store i32 %id, ptr addrspace(1) %ptr
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index 7a04507..3a86787 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -228,23 +228,23 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) {
; R600-NEXT: MUL_IEEE * T2.W, PS, literal.z,
; R600-NEXT: -127(nan), 254(3.559298e-43)
; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T3.X, T1.X, literal.x,
-; R600-NEXT: MUL_IEEE T0.Y, PS, literal.y,
+; R600-NEXT: MUL_IEEE T3.X, PS, literal.x,
+; R600-NEXT: MUL_IEEE T0.Y, T1.X, literal.y,
; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Z, T0.Z,
; R600-NEXT: CNDE_INT T3.W, PV.Y, PV.X, T0.X,
; R600-NEXT: SETGT_INT * T4.W, T0.Z, literal.z,
-; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31)
+; R600-NEXT: 209715200(1.972152e-31), 2130706432(1.701412e+38)
; R600-NEXT: 127(1.779649e-43), 0(0.000000e+00)
; R600-NEXT: CNDE_INT T0.Z, PS, PV.Z, PV.W,
-; R600-NEXT: CNDE_INT T0.W, T0.W, PV.Y, T2.W,
-; R600-NEXT: MUL_IEEE * T2.W, PV.X, literal.x,
+; R600-NEXT: MUL_IEEE T3.W, PV.Y, literal.x,
+; R600-NEXT: CNDE_INT * T0.W, T0.W, PV.X, T2.W,
; R600-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; R600-NEXT: CNDE_INT T1.Z, T1.Y, T3.X, PS,
-; R600-NEXT: CNDE_INT T0.W, T1.W, PV.W, T1.X,
+; R600-NEXT: CNDE_INT T1.Z, T1.W, PS, T1.X,
+; R600-NEXT: CNDE_INT T0.W, T1.Y, T0.Y, PV.W,
; R600-NEXT: LSHL * T1.W, PV.Z, literal.x,
; R600-NEXT: 23(3.222986e-44), 0(0.000000e+00)
; R600-NEXT: ADD_INT T1.W, PS, literal.x,
-; R600-NEXT: CNDE_INT * T0.W, T4.W, PV.W, PV.Z,
+; R600-NEXT: CNDE_INT * T0.W, T4.W, PV.Z, PV.W,
; R600-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00)
; R600-NEXT: MUL_IEEE T0.W, PS, PV.W,
; R600-NEXT: SETGT * T1.W, literal.x, KC0[2].Z,
@@ -258,65 +258,63 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) {
;
; CM-LABEL: s_exp_f32:
; CM: ; %bb.0:
-; CM-NEXT: ALU 64, @4, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 62, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: AND_INT * T0.W, KC0[2].Z, literal.x,
; CM-NEXT: -4096(nan), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T0.Z, PV.W, literal.x,
; CM-NEXT: ADD * T1.W, KC0[2].Z, -PV.W,
-; CM-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T1.Z, PV.W, literal.x,
-; CM-NEXT: RNDNE * T2.W, PV.Z,
-; CM-NEXT: 967029397(3.122284e-04), 0(0.000000e+00)
-; CM-NEXT: TRUNC T2.Z, PV.W,
+; CM-NEXT: MUL_IEEE T0.Z, PV.W, literal.x,
+; CM-NEXT: MUL_IEEE * T2.W, T0.W, literal.y,
+; CM-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00)
+; CM-NEXT: RNDNE T1.Z, PV.W,
; CM-NEXT: MULADD_IEEE * T1.W, T1.W, literal.x, PV.Z,
; CM-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
-; CM-NEXT: MULADD_IEEE T0.Y, T0.W, literal.x, PV.W,
-; CM-NEXT: ADD T0.Z, T0.Z, -T2.W,
-; CM-NEXT: FLT_TO_INT * T0.W, PV.Z,
+; CM-NEXT: MULADD_IEEE T0.Z, T0.W, literal.x, PV.W,
+; CM-NEXT: ADD * T0.W, T2.W, -PV.Z, BS:VEC_120/SCL_212
; CM-NEXT: 967029397(3.122284e-04), 0(0.000000e+00)
-; CM-NEXT: MIN_INT T1.Z, PV.W, literal.x,
-; CM-NEXT: ADD * T1.W, PV.Z, PV.Y,
+; CM-NEXT: TRUNC T1.Z, T1.Z,
+; CM-NEXT: ADD * T0.W, PV.W, PV.Z,
+; CM-NEXT: EXP_IEEE T0.X, T0.W,
+; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W,
+; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W,
+; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W,
+; CM-NEXT: FLT_TO_INT T0.Z, T1.Z,
+; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.x,
+; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
+; CM-NEXT: MUL_IEEE T0.Y, PV.W, literal.x,
+; CM-NEXT: MAX_INT T1.Z, PV.Z, literal.y,
+; CM-NEXT: MIN_INT * T1.W, PV.Z, literal.z,
+; CM-NEXT: 209715200(1.972152e-31), -330(nan)
; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00)
-; CM-NEXT: EXP_IEEE T0.X, T1.W,
-; CM-NEXT: EXP_IEEE T0.Y (MASKED), T1.W,
-; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W,
-; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W,
-; CM-NEXT: MUL_IEEE T0.Y, PV.X, literal.x,
-; CM-NEXT: ADD_INT T0.Z, T1.Z, literal.y,
-; CM-NEXT: MAX_INT * T1.W, T0.W, literal.z,
-; CM-NEXT: 2130706432(1.701412e+38), -254(nan)
-; CM-NEXT: -330(nan), 0(0.000000e+00)
-; CM-NEXT: ADD_INT T1.X, T0.W, literal.x,
-; CM-NEXT: ADD_INT T1.Y, PV.W, literal.y,
-; CM-NEXT: ADD_INT T1.Z, T0.W, literal.z,
-; CM-NEXT: SETGT_UINT * T1.W, T0.W, literal.w,
-; CM-NEXT: -127(nan), 204(2.858649e-43)
+; CM-NEXT: ADD_INT T1.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T1.Y, PV.Z, literal.y,
+; CM-NEXT: ADD_INT T1.Z, T0.Z, literal.z,
+; CM-NEXT: SETGT_UINT * T1.W, T0.Z, literal.w,
+; CM-NEXT: -254(nan), 204(2.858649e-43)
; CM-NEXT: 102(1.429324e-43), -229(nan)
-; CM-NEXT: SETGT_UINT T2.X, T0.W, literal.x,
-; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Y, PV.Z,
-; CM-NEXT: SETGT_INT T1.Z, T0.W, literal.y,
-; CM-NEXT: MUL_IEEE * T2.W, T0.X, literal.z,
-; CM-NEXT: 254(3.559298e-43), -127(nan)
-; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T3.X, PV.W, literal.x,
-; CM-NEXT: CNDE_INT T1.Y, PV.Z, PV.Y, T0.W,
-; CM-NEXT: CNDE_INT T0.Z, PV.X, T1.X, T0.Z,
-; CM-NEXT: SETGT_INT * T0.W, T0.W, literal.y,
-; CM-NEXT: 209715200(1.972152e-31), 127(1.779649e-43)
+; CM-NEXT: ADD_INT T2.X, T0.Z, literal.x,
+; CM-NEXT: SETGT_UINT T2.Y, T0.Z, literal.y,
+; CM-NEXT: CNDE_INT T1.Z, PV.W, PV.Y, PV.Z,
+; CM-NEXT: SETGT_INT * T2.W, T0.Z, literal.x,
+; CM-NEXT: -127(nan), 254(3.559298e-43)
+; CM-NEXT: MUL_IEEE T3.X, T0.X, literal.x,
+; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Z, T0.Z,
+; CM-NEXT: CNDE_INT T1.Z, PV.Y, PV.X, T1.X,
+; CM-NEXT: SETGT_INT * T3.W, T0.Z, literal.y,
+; CM-NEXT: 2130706432(1.701412e+38), 127(1.779649e-43)
; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Y, PV.Z,
-; CM-NEXT: CNDE_INT T0.Z, T1.W, PV.X, T2.W,
-; CM-NEXT: MUL_IEEE * T1.W, T0.Y, literal.x,
+; CM-NEXT: MUL_IEEE T0.Z, PV.X, literal.x,
+; CM-NEXT: CNDE_INT * T0.W, T1.W, T0.Y, T0.W,
; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T0.Y, T2.X, T0.Y, PV.W,
-; CM-NEXT: CNDE_INT T0.Z, T1.Z, PV.Z, T0.X,
-; CM-NEXT: LSHL * T1.W, PV.Y, literal.x,
+; CM-NEXT: CNDE_INT T0.Y, T2.W, PV.W, T0.X,
+; CM-NEXT: CNDE_INT T0.Z, T2.Y, T3.X, PV.Z,
+; CM-NEXT: LSHL * T0.W, PV.Y, literal.x,
; CM-NEXT: 23(3.222986e-44), 0(0.000000e+00)
; CM-NEXT: ADD_INT T1.Z, PV.W, literal.x,
-; CM-NEXT: CNDE_INT * T0.W, T0.W, PV.Z, PV.Y,
+; CM-NEXT: CNDE_INT * T0.W, T3.W, PV.Y, PV.Z,
; CM-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00)
; CM-NEXT: MUL_IEEE T0.Z, PV.W, PV.Z,
; CM-NEXT: SETGT * T0.W, literal.x, KC0[2].Z,
@@ -610,105 +608,105 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
; R600-NEXT: AND_INT * T0.W, KC0[3].X, literal.x,
; R600-NEXT: -4096(nan), 0(0.000000e+00)
; R600-NEXT: ADD * T1.W, KC0[3].X, -PV.W,
-; R600-NEXT: AND_INT T0.Z, KC0[2].W, literal.x,
-; R600-NEXT: MUL_IEEE T2.W, PV.W, literal.y,
-; R600-NEXT: MUL_IEEE * T3.W, T0.W, literal.z,
-; R600-NEXT: -4096(nan), 967029397(3.122284e-04)
-; R600-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
-; R600-NEXT: RNDNE T1.Z, PS,
+; R600-NEXT: MUL_IEEE T2.W, PV.W, literal.x,
+; R600-NEXT: MUL_IEEE * T3.W, T0.W, literal.y,
+; R600-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00)
+; R600-NEXT: RNDNE T0.Z, PS,
; R600-NEXT: MULADD_IEEE T1.W, T1.W, literal.x, PV.W,
-; R600-NEXT: ADD * T2.W, KC0[2].W, -PV.Z,
-; R600-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T0.Y, PS, literal.x,
-; R600-NEXT: MUL_IEEE T2.Z, T0.Z, literal.y,
+; R600-NEXT: AND_INT * T2.W, KC0[2].W, literal.y,
+; R600-NEXT: 1069064192(1.442383e+00), -4096(nan)
+; R600-NEXT: ADD T1.Z, KC0[2].W, -PS,
; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.x, PV.W,
; R600-NEXT: ADD * T1.W, T3.W, -PV.Z,
+; R600-NEXT: 967029397(3.122284e-04), 0(0.000000e+00)
+; R600-NEXT: ADD T2.Z, PS, PV.W,
+; R600-NEXT: MUL_IEEE T0.W, PV.Z, literal.x,
+; R600-NEXT: MUL_IEEE * T1.W, T2.W, literal.y,
; R600-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00)
-; R600-NEXT: ADD T3.Z, PS, PV.W,
-; R600-NEXT: RNDNE T0.W, PV.Z,
-; R600-NEXT: MULADD_IEEE * T1.W, T2.W, literal.x, PV.Y, BS:VEC_021/SCL_122
-; R600-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
-; R600-NEXT: TRUNC T0.Y, T1.Z,
-; R600-NEXT: MULADD_IEEE T0.Z, T0.Z, literal.x, PS, BS:VEC_120/SCL_212
-; R600-NEXT: ADD T1.W, T2.Z, -PV.W, BS:VEC_201
+; R600-NEXT: RNDNE T0.Y, PS,
+; R600-NEXT: MULADD_IEEE T1.Z, T1.Z, literal.x, PV.W,
+; R600-NEXT: TRUNC T0.W, T0.Z, BS:VEC_120/SCL_212
; R600-NEXT: EXP_IEEE * T0.X, PV.Z,
-; R600-NEXT: 967029397(3.122284e-04), 0(0.000000e+00)
-; R600-NEXT: ADD T0.Z, PV.W, PV.Z,
-; R600-NEXT: FLT_TO_INT T1.W, PV.Y,
-; R600-NEXT: MUL_IEEE * T2.W, PS, literal.x,
-; R600-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T1.Z, PS, literal.x,
-; R600-NEXT: SETGT_UINT T3.W, PV.W, literal.y,
-; R600-NEXT: EXP_IEEE * T0.Y, PV.Z,
-; R600-NEXT: 2130706432(1.701412e+38), 254(3.559298e-43)
-; R600-NEXT: CNDE_INT T1.X, PV.W, T2.W, PV.Z,
-; R600-NEXT: MUL_IEEE T1.Y, PS, literal.x,
-; R600-NEXT: MAX_INT T0.Z, T1.W, literal.y,
-; R600-NEXT: MIN_INT T2.W, T1.W, literal.z,
-; R600-NEXT: TRUNC * T0.W, T0.W,
+; R600-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
+; R600-NEXT: FLT_TO_INT T1.Y, PV.W,
+; R600-NEXT: MUL_IEEE T0.Z, PS, literal.x,
+; R600-NEXT: MULADD_IEEE T0.W, T2.W, literal.y, PV.Z,
+; R600-NEXT: ADD * T1.W, T1.W, -PV.Y,
+; R600-NEXT: 209715200(1.972152e-31), 967029397(3.122284e-04)
+; R600-NEXT: ADD T1.Z, PS, PV.W,
+; R600-NEXT: MUL_IEEE T0.W, PV.Z, literal.x,
+; R600-NEXT: SETGT_UINT * T1.W, PV.Y, literal.y,
+; R600-NEXT: 209715200(1.972152e-31), -229(nan)
+; R600-NEXT: CNDE_INT T0.Z, PS, PV.W, T0.Z,
+; R600-NEXT: SETGT_INT T0.W, T1.Y, literal.x,
+; R600-NEXT: EXP_IEEE * T1.X, PV.Z,
+; R600-NEXT: -127(nan), 0(0.000000e+00)
+; R600-NEXT: CNDE_INT T0.Z, PV.W, PV.Z, T0.X,
+; R600-NEXT: MAX_INT T2.W, T1.Y, literal.x,
+; R600-NEXT: MUL_IEEE * T3.W, PS, literal.y,
+; R600-NEXT: -330(nan), 209715200(1.972152e-31)
+; R600-NEXT: MUL_IEEE T2.X, PS, literal.x,
+; R600-NEXT: ADD_INT T2.Y, PV.W, literal.y,
+; R600-NEXT: ADD_INT T1.Z, T1.Y, literal.z,
+; R600-NEXT: MIN_INT T2.W, T1.Y, literal.w,
+; R600-NEXT: TRUNC * T4.W, T0.Y,
+; R600-NEXT: 209715200(1.972152e-31), 204(2.858649e-43)
+; R600-NEXT: 102(1.429324e-43), 381(5.338947e-43)
+; R600-NEXT: FLT_TO_INT T3.X, PS,
+; R600-NEXT: ADD_INT T0.Y, PV.W, literal.x,
+; R600-NEXT: ADD_INT T2.Z, T1.Y, literal.y,
+; R600-NEXT: SETGT_UINT T2.W, T1.Y, literal.z,
+; R600-NEXT: CNDE_INT * T1.W, T1.W, PV.Y, PV.Z,
+; R600-NEXT: -254(nan), -127(nan)
+; R600-NEXT: 254(3.559298e-43), 0(0.000000e+00)
+; R600-NEXT: MUL_IEEE T4.X, T1.X, literal.x,
+; R600-NEXT: MUL_IEEE T2.Y, T0.X, literal.x, BS:VEC_120/SCL_212
+; R600-NEXT: CNDE_INT T1.Z, T0.W, PS, T1.Y,
+; R600-NEXT: CNDE_INT T0.W, PV.W, PV.Z, PV.Y,
+; R600-NEXT: MAX_INT * T1.W, PV.X, literal.y,
; R600-NEXT: 2130706432(1.701412e+38), -330(nan)
-; R600-NEXT: 381(5.338947e-43), 0(0.000000e+00)
-; R600-NEXT: FLT_TO_INT T2.X, PS,
-; R600-NEXT: ADD_INT T2.Y, PV.W, literal.x,
-; R600-NEXT: ADD_INT T0.Z, PV.Z, literal.y,
-; R600-NEXT: ADD_INT T0.W, T1.W, literal.z,
-; R600-NEXT: SETGT_UINT * T2.W, T1.W, literal.w,
-; R600-NEXT: -254(nan), 204(2.858649e-43)
-; R600-NEXT: 102(1.429324e-43), -229(nan)
-; R600-NEXT: ADD_INT T3.X, T1.W, literal.x,
-; R600-NEXT: CNDE_INT T3.Y, PS, PV.Z, PV.W,
-; R600-NEXT: SETGT_INT T0.Z, T1.W, literal.x,
-; R600-NEXT: MUL_IEEE T0.W, T0.X, literal.y,
-; R600-NEXT: MUL_IEEE * T4.W, T0.Y, literal.y,
-; R600-NEXT: -127(nan), 209715200(1.972152e-31)
-; R600-NEXT: MUL_IEEE T4.X, PS, literal.x,
-; R600-NEXT: MUL_IEEE T4.Y, PV.W, literal.x,
-; R600-NEXT: CNDE_INT T1.Z, PV.Z, PV.Y, T1.W,
-; R600-NEXT: CNDE_INT T3.W, T3.W, PV.X, T2.Y,
-; R600-NEXT: MAX_INT * T5.W, T2.X, literal.y,
-; R600-NEXT: 209715200(1.972152e-31), -330(nan)
-; R600-NEXT: SETGT_INT T3.X, T1.W, literal.x,
-; R600-NEXT: ADD_INT T2.Y, PS, literal.y,
-; R600-NEXT: ADD_INT T2.Z, T2.X, literal.z,
-; R600-NEXT: SETGT_UINT * T1.W, T2.X, literal.w,
+; R600-NEXT: SETGT_INT T0.X, T1.Y, literal.x,
+; R600-NEXT: ADD_INT T0.Y, PS, literal.y,
+; R600-NEXT: ADD_INT T2.Z, T3.X, literal.z,
+; R600-NEXT: SETGT_UINT * T1.W, T3.X, literal.w,
; R600-NEXT: 127(1.779649e-43), 204(2.858649e-43)
; R600-NEXT: 102(1.429324e-43), -229(nan)
-; R600-NEXT: MIN_INT * T5.W, T2.X, literal.x,
+; R600-NEXT: MIN_INT * T4.W, T3.X, literal.x,
; R600-NEXT: 381(5.338947e-43), 0(0.000000e+00)
; R600-NEXT: ADD_INT T5.X, PV.W, literal.x,
-; R600-NEXT: ADD_INT T3.Y, T2.X, literal.y,
-; R600-NEXT: SETGT_UINT T3.Z, T2.X, literal.z,
-; R600-NEXT: CNDE_INT T5.W, T1.W, T2.Y, T2.Z,
-; R600-NEXT: SETGT_INT * T6.W, T2.X, literal.y,
+; R600-NEXT: ADD_INT T1.Y, T3.X, literal.y,
+; R600-NEXT: SETGT_UINT T3.Z, T3.X, literal.z,
+; R600-NEXT: CNDE_INT T4.W, T1.W, T0.Y, T2.Z,
+; R600-NEXT: SETGT_INT * T5.W, T3.X, literal.y,
; R600-NEXT: -254(nan), -127(nan)
; R600-NEXT: 254(3.559298e-43), 0(0.000000e+00)
-; R600-NEXT: CNDE_INT T6.X, PS, PV.W, T2.X,
-; R600-NEXT: CNDE_INT T2.Y, PV.Z, PV.Y, PV.X,
-; R600-NEXT: SETGT_INT T2.Z, T2.X, literal.x, BS:VEC_120/SCL_212
-; R600-NEXT: CNDE_INT T3.W, T3.X, T1.Z, T3.W, BS:VEC_021/SCL_122
-; R600-NEXT: CNDE_INT * T0.W, T2.W, T4.Y, T0.W,
-; R600-NEXT: 127(1.779649e-43), 0(0.000000e+00)
-; R600-NEXT: CNDE_INT T0.X, T0.Z, PS, T0.X,
-; R600-NEXT: LSHL T3.Y, PV.W, literal.x,
-; R600-NEXT: CNDE_INT T0.Z, PV.Z, PV.X, PV.Y,
-; R600-NEXT: CNDE_INT T0.W, T1.W, T4.X, T4.W,
-; R600-NEXT: MUL_IEEE * T1.W, T1.Y, literal.y,
+; R600-NEXT: CNDE_INT T6.X, PS, PV.W, T3.X,
+; R600-NEXT: CNDE_INT T0.Y, PV.Z, PV.Y, PV.X,
+; R600-NEXT: SETGT_INT T2.Z, T3.X, literal.x,
+; R600-NEXT: CNDE_INT T0.W, T0.X, T1.Z, T0.W, BS:VEC_120/SCL_212
+; R600-NEXT: MUL_IEEE * T4.W, T2.Y, literal.y,
+; R600-NEXT: 127(1.779649e-43), 2130706432(1.701412e+38)
+; R600-NEXT: CNDE_INT T3.X, T2.W, T2.Y, PS, BS:VEC_120/SCL_212
+; R600-NEXT: LSHL T1.Y, PV.W, literal.x,
+; R600-NEXT: CNDE_INT T1.Z, PV.Z, PV.X, PV.Y,
+; R600-NEXT: MUL_IEEE T0.W, T4.X, literal.y,
+; R600-NEXT: CNDE_INT * T1.W, T1.W, T2.X, T3.W,
; R600-NEXT: 23(3.222986e-44), 2130706432(1.701412e+38)
-; R600-NEXT: CNDE_INT T2.X, T3.Z, T1.Y, PS,
-; R600-NEXT: CNDE_INT T0.Y, T6.W, PV.W, T0.Y,
-; R600-NEXT: LSHL T0.Z, PV.Z, literal.x,
+; R600-NEXT: CNDE_INT T1.X, T5.W, PS, T1.X, BS:VEC_021/SCL_122
+; R600-NEXT: CNDE_INT T0.Y, T3.Z, T4.X, PV.W, BS:VEC_201
+; R600-NEXT: LSHL T1.Z, PV.Z, literal.x,
; R600-NEXT: ADD_INT T0.W, PV.Y, literal.y,
-; R600-NEXT: CNDE_INT * T1.W, T3.X, PV.X, T1.X,
+; R600-NEXT: CNDE_INT * T1.W, T0.X, T0.Z, PV.X,
; R600-NEXT: 23(3.222986e-44), 1065353216(1.000000e+00)
; R600-NEXT: MUL_IEEE T1.Y, PS, PV.W,
-; R600-NEXT: SETGT T1.Z, literal.x, KC0[3].X,
+; R600-NEXT: SETGT T0.Z, literal.x, KC0[3].X,
; R600-NEXT: ADD_INT * T0.W, PV.Z, literal.y,
; R600-NEXT: -1026650416(-1.032789e+02), 1065353216(1.000000e+00)
; R600-NEXT: ALU clause starting at 101:
-; R600-NEXT: CNDE_INT * T1.W, T2.Z, T0.Y, T2.X,
+; R600-NEXT: CNDE_INT * T1.W, T2.Z, T1.X, T0.Y,
; R600-NEXT: MUL_IEEE T0.Y, PV.W, T0.W,
-; R600-NEXT: SETGT T0.Z, literal.x, KC0[2].W,
-; R600-NEXT: CNDE T0.W, T1.Z, T1.Y, 0.0,
+; R600-NEXT: SETGT T1.Z, literal.x, KC0[2].W,
+; R600-NEXT: CNDE T0.W, T0.Z, T1.Y, 0.0,
; R600-NEXT: SETGT * T1.W, KC0[3].X, literal.y,
; R600-NEXT: -1026650416(-1.032789e+02), 1118925336(8.872284e+01)
; R600-NEXT: CNDE T1.Y, PS, PV.W, literal.x,
@@ -721,118 +719,116 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
;
; CM-LABEL: s_exp_v2f32:
; CM: ; %bb.0:
-; CM-NEXT: ALU 100, @4, KC0[CB0:0-32], KC1[]
-; CM-NEXT: ALU 18, @105, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 98, @4, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 18, @103, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT: CF_END
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: AND_INT * T0.W, KC0[2].W, literal.x,
; CM-NEXT: -4096(nan), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T0.Z, PV.W, literal.x,
; CM-NEXT: ADD * T1.W, KC0[2].W, -PV.W,
+; CM-NEXT: MUL_IEEE T0.Y, PV.W, literal.x,
+; CM-NEXT: MUL_IEEE T0.Z, T0.W, literal.y,
+; CM-NEXT: AND_INT * T2.W, KC0[3].X, literal.z,
+; CM-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00)
+; CM-NEXT: -4096(nan), 0(0.000000e+00)
+; CM-NEXT: ADD T1.Y, KC0[3].X, -PV.W,
+; CM-NEXT: RNDNE T1.Z, PV.Z,
+; CM-NEXT: MULADD_IEEE * T1.W, T1.W, literal.x, PV.Y,
; CM-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T1.Z, PV.W, literal.x,
-; CM-NEXT: RNDNE * T2.W, PV.Z,
-; CM-NEXT: 967029397(3.122284e-04), 0(0.000000e+00)
-; CM-NEXT: TRUNC T0.Y, PV.W,
-; CM-NEXT: AND_INT T2.Z, KC0[3].X, literal.x,
-; CM-NEXT: MULADD_IEEE * T1.W, T1.W, literal.y, PV.Z,
-; CM-NEXT: -4096(nan), 1069064192(1.442383e+00)
; CM-NEXT: MULADD_IEEE T0.X, T0.W, literal.x, PV.W,
-; CM-NEXT: MUL_IEEE T1.Y, PV.Z, literal.y,
-; CM-NEXT: FLT_TO_INT T1.Z, PV.Y,
-; CM-NEXT: ADD * T0.W, KC0[3].X, -PV.Z,
+; CM-NEXT: ADD T0.Y, T0.Z, -PV.Z,
+; CM-NEXT: MUL_IEEE T0.Z, PV.Y, literal.x,
+; CM-NEXT: MUL_IEEE * T0.W, T2.W, literal.y, BS:VEC_120/SCL_212
; CM-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00)
-; CM-NEXT: ADD T1.X, T0.Z, -T2.W,
-; CM-NEXT: MUL_IEEE T0.Y, PV.W, literal.x,
-; CM-NEXT: MAX_INT T0.Z, PV.Z, literal.y,
-; CM-NEXT: RNDNE * T1.W, PV.Y,
-; CM-NEXT: 967029397(3.122284e-04), -330(nan)
-; CM-NEXT: TRUNC T2.X, PV.W,
-; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.x,
-; CM-NEXT: MULADD_IEEE T0.Z, T0.W, literal.y, PV.Y,
-; CM-NEXT: ADD * T0.W, PV.X, T0.X,
-; CM-NEXT: 204(2.858649e-43), 1069064192(1.442383e+00)
-; CM-NEXT: EXP_IEEE T0.X, T0.W,
-; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W,
-; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W,
-; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W,
-; CM-NEXT: ADD_INT T1.X, T1.Z, literal.x,
-; CM-NEXT: MULADD_IEEE T0.Y, T2.Z, literal.y, T0.Z, BS:VEC_102/SCL_221
-; CM-NEXT: ADD T0.Z, T1.Y, -T1.W,
-; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.z,
-; CM-NEXT: 102(1.429324e-43), 967029397(3.122284e-04)
-; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; CM-NEXT: SETGT_UINT T3.X, T1.Z, literal.x,
-; CM-NEXT: MUL_IEEE T1.Y, PV.W, literal.y,
-; CM-NEXT: SETGT_UINT T2.Z, T1.Z, literal.z,
-; CM-NEXT: ADD * T1.W, PV.Z, PV.Y,
-; CM-NEXT: -229(nan), 2130706432(1.701412e+38)
-; CM-NEXT: 254(3.559298e-43), 0(0.000000e+00)
+; CM-NEXT: TRUNC T1.X, T1.Z,
+; CM-NEXT: RNDNE T2.Y, PV.W,
+; CM-NEXT: MULADD_IEEE T0.Z, T1.Y, literal.x, PV.Z,
+; CM-NEXT: ADD * T1.W, PV.Y, PV.X,
+; CM-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
+; CM-NEXT: EXP_IEEE T0.X, T1.W,
+; CM-NEXT: EXP_IEEE T0.Y (MASKED), T1.W,
+; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W,
+; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W,
+; CM-NEXT: MULADD_IEEE T2.X, T2.W, literal.x, T0.Z,
+; CM-NEXT: ADD T0.Y, T0.W, -T2.Y, BS:VEC_120/SCL_212
+; CM-NEXT: FLT_TO_INT T0.Z, T1.X,
+; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.y,
+; CM-NEXT: 967029397(3.122284e-04), 209715200(1.972152e-31)
+; CM-NEXT: MUL_IEEE T1.X, PV.W, literal.x,
+; CM-NEXT: SETGT_UINT T1.Y, PV.Z, literal.y,
+; CM-NEXT: TRUNC T1.Z, T2.Y,
+; CM-NEXT: ADD * T1.W, PV.Y, PV.X,
+; CM-NEXT: 209715200(1.972152e-31), -229(nan)
; CM-NEXT: EXP_IEEE T0.X (MASKED), T1.W,
; CM-NEXT: EXP_IEEE T0.Y, T1.W,
; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W,
; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W,
-; CM-NEXT: CNDE_INT T4.X, T2.Z, T0.W, T1.Y,
-; CM-NEXT: CNDE_INT T1.Y, T3.X, T2.Y, T1.X,
-; CM-NEXT: FLT_TO_INT T0.Z, T2.X, BS:VEC_120/SCL_212
-; CM-NEXT: MUL_IEEE * T0.W, PV.Y, literal.x,
-; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; CM-NEXT: SETGT_INT T1.X, T1.Z, literal.x,
-; CM-NEXT: MUL_IEEE T2.Y, T0.X, literal.y,
-; CM-NEXT: MUL_IEEE T3.Z, PV.W, literal.z,
-; CM-NEXT: SETGT_UINT * T1.W, PV.Z, literal.w,
-; CM-NEXT: -127(nan), 209715200(1.972152e-31)
-; CM-NEXT: 2130706432(1.701412e+38), 254(3.559298e-43)
-; CM-NEXT: CNDE_INT T2.X, PV.W, T0.W, PV.Z,
+; CM-NEXT: FLT_TO_INT T2.X, T1.Z,
+; CM-NEXT: MUL_IEEE T2.Y, PV.Y, literal.x,
+; CM-NEXT: CNDE_INT T1.Z, T1.Y, T1.X, T0.W,
+; CM-NEXT: SETGT_INT * T0.W, T0.Z, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT: 209715200(1.972152e-31), -127(nan)
+; CM-NEXT: CNDE_INT T1.X, PV.W, PV.Z, T0.X,
; CM-NEXT: MUL_IEEE T3.Y, PV.Y, literal.x,
-; CM-NEXT: CNDE_INT T3.Z, PV.X, T1.Y, T1.Z,
-; CM-NEXT: MAX_INT * T0.W, T0.Z, literal.y,
-; CM-NEXT: 209715200(1.972152e-31), -330(nan)
-; CM-NEXT: ADD_INT T5.X, PV.W, literal.x,
-; CM-NEXT: ADD_INT T1.Y, T0.Z, literal.y,
-; CM-NEXT: SETGT_UINT T4.Z, T0.Z, literal.z,
-; CM-NEXT: MUL_IEEE * T0.W, T0.Y, literal.w,
+; CM-NEXT: SETGT_UINT T1.Z, PV.X, literal.y,
+; CM-NEXT: MAX_INT * T1.W, T0.Z, literal.z,
+; CM-NEXT: 209715200(1.972152e-31), -229(nan)
+; CM-NEXT: -330(nan), 0(0.000000e+00)
+; CM-NEXT: ADD_INT T3.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T4.Y, T0.Z, literal.y,
+; CM-NEXT: CNDE_INT T2.Z, PV.Z, PV.Y, T2.Y,
+; CM-NEXT: SETGT_INT * T1.W, T2.X, literal.z,
; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43)
-; CM-NEXT: -229(nan), 209715200(1.972152e-31)
-; CM-NEXT: MUL_IEEE T6.X, PV.W, literal.x,
-; CM-NEXT: MIN_INT T4.Y, T0.Z, literal.y,
-; CM-NEXT: CNDE_INT T5.Z, PV.Z, PV.X, PV.Y,
-; CM-NEXT: SETGT_INT * T2.W, T0.Z, literal.z,
-; CM-NEXT: 209715200(1.972152e-31), 381(5.338947e-43)
-; CM-NEXT: -127(nan), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T5.X, PV.W, PV.Z, T0.Z,
-; CM-NEXT: MIN_INT T1.Y, T1.Z, literal.x,
-; CM-NEXT: ADD_INT T5.Z, PV.Y, literal.y,
-; CM-NEXT: ADD_INT * T3.W, T0.Z, literal.z, BS:VEC_120/SCL_212
-; CM-NEXT: 381(5.338947e-43), -254(nan)
; CM-NEXT: -127(nan), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T7.X, T1.W, PV.W, PV.Z,
-; CM-NEXT: SETGT_INT T4.Y, T0.Z, literal.x,
-; CM-NEXT: ADD_INT T0.Z, PV.Y, literal.y,
-; CM-NEXT: ADD_INT * T1.W, T1.Z, literal.z, BS:VEC_120/SCL_212
+; CM-NEXT: CNDE_INT T4.X, PV.W, PV.Z, T0.Y,
+; CM-NEXT: MUL_IEEE T2.Y, T0.X, literal.x,
+; CM-NEXT: MAX_INT T2.Z, T2.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT: CNDE_INT * T2.W, T1.Y, PV.X, PV.Y,
+; CM-NEXT: 2130706432(1.701412e+38), -330(nan)
+; CM-NEXT: CNDE_INT T0.X, T0.W, PV.W, T0.Z,
+; CM-NEXT: ADD_INT T1.Y, PV.Z, literal.x,
+; CM-NEXT: ADD_INT T2.Z, T2.X, literal.y,
+; CM-NEXT: MIN_INT * T0.W, T2.X, literal.z,
+; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43)
+; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00)
+; CM-NEXT: ADD_INT T3.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T3.Y, T2.X, literal.y,
+; CM-NEXT: SETGT_UINT T3.Z, T2.X, literal.z,
+; CM-NEXT: CNDE_INT * T0.W, T1.Z, PV.Y, PV.Z,
+; CM-NEXT: -254(nan), -127(nan)
+; CM-NEXT: 254(3.559298e-43), 0(0.000000e+00)
+; CM-NEXT: MUL_IEEE T5.X, T0.Y, literal.x,
+; CM-NEXT: CNDE_INT T0.Y, T1.W, PV.W, T2.X,
+; CM-NEXT: CNDE_INT T1.Z, PV.Z, PV.Y, PV.X,
+; CM-NEXT: MIN_INT * T0.W, T0.Z, literal.y,
+; CM-NEXT: 2130706432(1.701412e+38), 381(5.338947e-43)
+; CM-NEXT: SETGT_INT T2.X, T2.X, literal.x,
+; CM-NEXT: ADD_INT T1.Y, PV.W, literal.y,
+; CM-NEXT: ADD_INT T2.Z, T0.Z, literal.z,
+; CM-NEXT: SETGT_UINT * T0.W, T0.Z, literal.w,
; CM-NEXT: 127(1.779649e-43), -254(nan)
-; CM-NEXT: -127(nan), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T8.X, T2.Z, PV.W, PV.Z,
-; CM-NEXT: SETGT_INT T1.Y, T1.Z, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT: CNDE_INT T0.Z, PV.Y, T5.X, PV.X,
-; CM-NEXT: CNDE_INT * T0.W, T4.Z, T6.X, T0.W, BS:VEC_201
-; CM-NEXT: 127(1.779649e-43), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T5.X, T2.W, PV.W, T0.Y,
+; CM-NEXT: -127(nan), 254(3.559298e-43)
+; CM-NEXT: CNDE_INT T3.X, PV.W, PV.Z, PV.Y,
+; CM-NEXT: SETGT_INT T1.Y, T0.Z, literal.x,
+; CM-NEXT: CNDE_INT T0.Z, PV.X, T0.Y, T1.Z,
+; CM-NEXT: MUL_IEEE * T1.W, T5.X, literal.y,
+; CM-NEXT: 127(1.779649e-43), 2130706432(1.701412e+38)
+; CM-NEXT: CNDE_INT T5.X, T3.Z, T5.X, PV.W,
; CM-NEXT: LSHL T0.Y, PV.Z, literal.x,
-; CM-NEXT: CNDE_INT T0.Z, PV.Y, T3.Z, PV.X,
-; CM-NEXT: CNDE_INT * T0.W, T3.X, T3.Y, T2.Y, BS:VEC_201
-; CM-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T0.X, T1.X, PV.W, T0.X,
+; CM-NEXT: CNDE_INT T0.Z, PV.Y, T0.X, PV.X, BS:VEC_021/SCL_122
+; CM-NEXT: MUL_IEEE * T1.W, T2.Y, literal.y,
+; CM-NEXT: 23(3.222986e-44), 2130706432(1.701412e+38)
+; CM-NEXT: CNDE_INT T0.X, T0.W, T2.Y, PV.W,
; CM-NEXT: LSHL T2.Y, PV.Z, literal.x,
; CM-NEXT: ADD_INT * T0.Z, PV.Y, literal.y,
; CM-NEXT: 23(3.222986e-44), 1065353216(1.000000e+00)
-; CM-NEXT: ALU clause starting at 105:
-; CM-NEXT: CNDE_INT * T0.W, T4.Y, T5.X, T2.X,
-; CM-NEXT: MUL_IEEE T1.X, PV.W, T0.Z,
+; CM-NEXT: ALU clause starting at 103:
+; CM-NEXT: CNDE_INT * T0.W, T2.X, T4.X, T5.X,
+; CM-NEXT: MUL_IEEE T2.X, PV.W, T0.Z,
; CM-NEXT: SETGT T0.Y, literal.x, KC0[3].X,
; CM-NEXT: ADD_INT T0.Z, T2.Y, literal.y,
-; CM-NEXT: CNDE_INT * T0.W, T1.Y, T0.X, T4.X, BS:VEC_120/SCL_212
+; CM-NEXT: CNDE_INT * T0.W, T1.Y, T1.X, T0.X, BS:VEC_120/SCL_212
; CM-NEXT: -1026650416(-1.032789e+02), 1065353216(1.000000e+00)
; CM-NEXT: MUL_IEEE T0.X, PV.W, PV.Z,
; CM-NEXT: SETGT T1.Y, literal.x, KC0[2].W,
@@ -1215,8 +1211,8 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
;
; R600-LABEL: s_exp_v3f32:
; R600: ; %bb.0:
-; R600-NEXT: ALU 100, @6, KC0[CB0:0-32], KC1[]
-; R600-NEXT: ALU 69, @107, KC0[CB0:0-32], KC1[]
+; R600-NEXT: ALU 99, @6, KC0[CB0:0-32], KC1[]
+; R600-NEXT: ALU 69, @106, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT: CF_END
@@ -1224,69 +1220,68 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; R600-NEXT: ALU clause starting at 6:
; R600-NEXT: AND_INT * T0.W, KC0[3].Y, literal.x,
; R600-NEXT: -4096(nan), 0(0.000000e+00)
-; R600-NEXT: ADD T1.W, KC0[3].Y, -PV.W,
-; R600-NEXT: MUL_IEEE * T2.W, PV.W, literal.x,
+; R600-NEXT: MUL_IEEE T1.W, PV.W, literal.x,
+; R600-NEXT: ADD * T2.W, KC0[3].Y, -PV.W,
; R600-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
-; R600-NEXT: RNDNE T3.W, PS,
-; R600-NEXT: MUL_IEEE * T4.W, PV.W, literal.x,
+; R600-NEXT: RNDNE * T3.W, PV.W,
+; R600-NEXT: TRUNC T4.W, PV.W,
+; R600-NEXT: MUL_IEEE * T5.W, T2.W, literal.x,
; R600-NEXT: 967029397(3.122284e-04), 0(0.000000e+00)
-; R600-NEXT: MULADD_IEEE T1.W, T1.W, literal.x, PS,
-; R600-NEXT: TRUNC * T4.W, PV.W,
+; R600-NEXT: MULADD_IEEE T2.W, T2.W, literal.x, PS,
+; R600-NEXT: FLT_TO_INT * T4.W, PV.W,
; R600-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
-; R600-NEXT: FLT_TO_INT T0.Z, PS,
-; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.x, PV.W,
-; R600-NEXT: ADD * T1.W, T2.W, -T3.W,
-; R600-NEXT: 967029397(3.122284e-04), 0(0.000000e+00)
-; R600-NEXT: ADD T0.W, PS, PV.W,
-; R600-NEXT: MAX_INT * T1.W, PV.Z, literal.x,
-; R600-NEXT: -330(nan), 0(0.000000e+00)
-; R600-NEXT: ADD_INT T0.Y, PS, literal.x,
-; R600-NEXT: ADD_INT T1.Z, T0.Z, literal.y,
-; R600-NEXT: SETGT_UINT T1.W, T0.Z, literal.z,
-; R600-NEXT: EXP_IEEE * T0.X, PV.W,
+; R600-NEXT: MAX_INT T0.Z, PS, literal.x,
+; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.y, PV.W,
+; R600-NEXT: ADD * T1.W, T1.W, -T3.W,
+; R600-NEXT: -330(nan), 967029397(3.122284e-04)
+; R600-NEXT: ADD T0.Y, PS, PV.W,
+; R600-NEXT: ADD_INT T0.Z, PV.Z, literal.x,
+; R600-NEXT: ADD_INT T0.W, T4.W, literal.y,
+; R600-NEXT: SETGT_UINT * T1.W, T4.W, literal.z,
; R600-NEXT: 204(2.858649e-43), 102(1.429324e-43)
; R600-NEXT: -229(nan), 0(0.000000e+00)
-; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Y, PV.Z,
-; R600-NEXT: SETGT_INT T0.W, T0.Z, literal.x,
-; R600-NEXT: MUL_IEEE * T2.W, PS, literal.y,
-; R600-NEXT: -127(nan), 209715200(1.972152e-31)
-; R600-NEXT: MUL_IEEE T0.Y, PS, literal.x,
-; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Z, T0.Z,
-; R600-NEXT: MIN_INT T3.W, T0.Z, literal.y,
-; R600-NEXT: AND_INT * T4.W, KC0[3].W, literal.z,
-; R600-NEXT: 209715200(1.972152e-31), 381(5.338947e-43)
-; R600-NEXT: -4096(nan), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T1.X, T0.X, literal.x,
-; R600-NEXT: ADD T1.Y, KC0[3].W, -PS,
-; R600-NEXT: ADD_INT T2.Z, PV.W, literal.y,
-; R600-NEXT: ADD_INT T3.W, T0.Z, literal.z,
-; R600-NEXT: SETGT_UINT * T5.W, T0.Z, literal.w,
-; R600-NEXT: 2130706432(1.701412e+38), -254(nan)
+; R600-NEXT: CNDE_INT T0.Z, PS, PV.Z, PV.W,
+; R600-NEXT: SETGT_INT T0.W, T4.W, literal.x,
+; R600-NEXT: EXP_IEEE * T0.X, PV.Y,
+; R600-NEXT: -127(nan), 0(0.000000e+00)
+; R600-NEXT: MUL_IEEE T1.X, PS, literal.x,
+; R600-NEXT: CNDE_INT T0.Y, PV.W, PV.Z, T4.W,
+; R600-NEXT: MIN_INT T0.Z, T4.W, literal.y,
+; R600-NEXT: AND_INT T2.W, KC0[3].W, literal.z,
+; R600-NEXT: MUL_IEEE * T3.W, PS, literal.w,
+; R600-NEXT: 2130706432(1.701412e+38), 381(5.338947e-43)
+; R600-NEXT: -4096(nan), 209715200(1.972152e-31)
+; R600-NEXT: MUL_IEEE T2.X, PS, literal.x,
+; R600-NEXT: ADD T1.Y, KC0[3].W, -PV.W,
+; R600-NEXT: ADD_INT T0.Z, PV.Z, literal.y,
+; R600-NEXT: ADD_INT T5.W, T4.W, literal.z,
+; R600-NEXT: SETGT_UINT * T6.W, T4.W, literal.w,
+; R600-NEXT: 209715200(1.972152e-31), -254(nan)
; R600-NEXT: -127(nan), 254(3.559298e-43)
-; R600-NEXT: CNDE_INT T2.X, PS, PV.W, PV.Z,
-; R600-NEXT: SETGT_INT T2.Y, T0.Z, literal.x,
+; R600-NEXT: CNDE_INT T3.X, PS, PV.W, PV.Z,
+; R600-NEXT: SETGT_INT T2.Y, T4.W, literal.x,
; R600-NEXT: MUL_IEEE T0.Z, PV.Y, literal.y,
-; R600-NEXT: MUL_IEEE T3.W, T4.W, literal.z,
-; R600-NEXT: MUL_IEEE * T6.W, PV.X, literal.w,
+; R600-NEXT: MUL_IEEE * T4.W, T2.W, literal.z, BS:VEC_120/SCL_212
; R600-NEXT: 127(1.779649e-43), 967029397(3.122284e-04)
-; R600-NEXT: 1069064192(1.442383e+00), 2130706432(1.701412e+38)
-; R600-NEXT: CNDE_INT T1.X, T5.W, T1.X, PS, BS:VEC_120/SCL_212
-; R600-NEXT: RNDNE T3.Y, PV.W,
-; R600-NEXT: MULADD_IEEE T0.Z, T1.Y, literal.x, PV.Z,
-; R600-NEXT: CNDE_INT T5.W, PV.Y, T1.Z, PV.X,
-; R600-NEXT: CNDE_INT * T1.W, T1.W, T0.Y, T2.W,
; R600-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
-; R600-NEXT: CNDE_INT T0.X, T0.W, PS, T0.X,
+; R600-NEXT: CNDE_INT * T1.W, T1.W, T2.X, T3.W,
+; R600-NEXT: CNDE_INT T0.X, T0.W, PV.W, T0.X, BS:VEC_021/SCL_122
+; R600-NEXT: RNDNE T3.Y, T4.W, BS:VEC_120/SCL_212
+; R600-NEXT: MULADD_IEEE T0.Z, T1.Y, literal.x, T0.Z,
+; R600-NEXT: CNDE_INT T0.W, T2.Y, T0.Y, T3.X, BS:VEC_120/SCL_212
+; R600-NEXT: MUL_IEEE * T1.W, T1.X, literal.y,
+; R600-NEXT: 1069064192(1.442383e+00), 2130706432(1.701412e+38)
+; R600-NEXT: CNDE_INT T1.X, T6.W, T1.X, PS,
; R600-NEXT: LSHL T0.Y, PV.W, literal.x,
; R600-NEXT: AND_INT T1.Z, KC0[3].Z, literal.y,
-; R600-NEXT: MULADD_IEEE T0.W, T4.W, literal.z, PV.Z, BS:VEC_120/SCL_212
-; R600-NEXT: ADD * T1.W, T3.W, -PV.Y,
+; R600-NEXT: MULADD_IEEE T0.W, T2.W, literal.z, PV.Z, BS:VEC_120/SCL_212
+; R600-NEXT: ADD * T1.W, T4.W, -PV.Y,
; R600-NEXT: 23(3.222986e-44), -4096(nan)
; R600-NEXT: 967029397(3.122284e-04), 0(0.000000e+00)
; R600-NEXT: ADD T1.Y, PS, PV.W,
; R600-NEXT: MUL_IEEE T0.Z, PV.Z, literal.x,
; R600-NEXT: ADD_INT T0.W, PV.Y, literal.y,
-; R600-NEXT: CNDE_INT * T1.W, T2.Y, PV.X, T1.X,
+; R600-NEXT: CNDE_INT * T1.W, T2.Y, T0.X, PV.X,
; R600-NEXT: 1069064192(1.442383e+00), 1065353216(1.000000e+00)
; R600-NEXT: MUL_IEEE T0.X, PS, PV.W,
; R600-NEXT: ADD T0.Y, KC0[3].Z, -T1.Z,
@@ -1300,12 +1295,12 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; R600-NEXT: MUL_IEEE * T1.W, PS, literal.z,
; R600-NEXT: -1026650416(-1.032789e+02), 967029397(3.122284e-04)
; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T3.X, T1.X, literal.x,
-; R600-NEXT: MUL_IEEE T2.Y, PS, literal.y,
+; R600-NEXT: MUL_IEEE T3.X, PS, literal.x,
+; R600-NEXT: MUL_IEEE T2.Y, T1.X, literal.y,
; R600-NEXT: MULADD_IEEE T4.Z, T0.Y, literal.z, PV.W,
; R600-NEXT: FLT_TO_INT T0.W, PV.Z,
; R600-NEXT: MIN_INT * T2.W, PV.Y, literal.w,
-; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31)
+; R600-NEXT: 209715200(1.972152e-31), 2130706432(1.701412e+38)
; R600-NEXT: 1069064192(1.442383e+00), 381(5.338947e-43)
; R600-NEXT: ADD_INT T4.X, PS, literal.x,
; R600-NEXT: MAX_INT T0.Y, PV.W, literal.y,
@@ -1323,7 +1318,7 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; R600-NEXT: 102(1.429324e-43), -229(nan)
; R600-NEXT: ADD_INT * T6.X, T0.W, literal.x,
; R600-NEXT: -127(nan), 0(0.000000e+00)
-; R600-NEXT: ALU clause starting at 107:
+; R600-NEXT: ALU clause starting at 106:
; R600-NEXT: SETGT_UINT T0.Y, T0.W, literal.x,
; R600-NEXT: CNDE_INT T0.Z, T3.W, T0.Z, T2.W, BS:VEC_102/SCL_221
; R600-NEXT: SETGT_INT T2.W, T0.W, literal.y,
@@ -1339,25 +1334,25 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; R600-NEXT: SETGT_UINT T5.X, T1.Y, literal.x,
; R600-NEXT: CNDE_INT T4.Y, PS, PV.Z, PV.W,
; R600-NEXT: MAX_INT T0.Z, T1.Y, literal.y,
-; R600-NEXT: MUL_IEEE T4.W, T1.Z, literal.z,
-; R600-NEXT: MUL_IEEE * T5.W, PV.Y, literal.w,
+; R600-NEXT: MUL_IEEE T4.W, PV.Y, literal.z,
+; R600-NEXT: MUL_IEEE * T5.W, T1.Z, literal.w,
; R600-NEXT: 254(3.559298e-43), -330(nan)
-; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31)
-; R600-NEXT: CNDE_INT T6.X, T3.W, PS, T3.Y, BS:VEC_021/SCL_122
-; R600-NEXT: MUL_IEEE T3.Y, PV.W, literal.x,
+; R600-NEXT: 209715200(1.972152e-31), 2130706432(1.701412e+38)
+; R600-NEXT: MUL_IEEE T6.X, PS, literal.x,
+; R600-NEXT: CNDE_INT T3.Y, T3.W, PV.W, T3.Y, BS:VEC_021/SCL_122
; R600-NEXT: ADD_INT T0.Z, PV.Z, literal.y,
; R600-NEXT: ADD_INT T3.W, T1.Y, literal.z,
-; R600-NEXT: SETGT_UINT * T5.W, T1.Y, literal.w,
+; R600-NEXT: SETGT_UINT * T4.W, T1.Y, literal.w,
; R600-NEXT: 2130706432(1.701412e+38), 204(2.858649e-43)
; R600-NEXT: 102(1.429324e-43), -229(nan)
; R600-NEXT: CNDE_INT T8.X, PS, PV.Z, PV.W,
; R600-NEXT: SETGT_INT T5.Y, T1.Y, literal.x,
-; R600-NEXT: CNDE_INT T0.Z, T0.Y, T4.W, PV.Y, BS:VEC_120/SCL_212
-; R600-NEXT: CNDE_INT T2.W, T2.W, PV.X, T1.Z,
+; R600-NEXT: CNDE_INT T0.Z, T2.W, PV.Y, T1.Z,
+; R600-NEXT: CNDE_INT T2.W, T0.Y, T5.W, PV.X, BS:VEC_120/SCL_212
; R600-NEXT: LSHL * T3.W, T4.Y, literal.y,
; R600-NEXT: -127(nan), 23(3.222986e-44)
; R600-NEXT: ADD_INT T6.X, PS, literal.x,
-; R600-NEXT: CNDE_INT T0.Y, T0.W, PV.W, PV.Z,
+; R600-NEXT: CNDE_INT T0.Y, T0.W, PV.Z, PV.W,
; R600-NEXT: CNDE_INT T0.Z, PV.Y, PV.X, T1.Y,
; R600-NEXT: CNDE_INT T0.W, T5.X, T7.X, T4.X,
; R600-NEXT: SETGT_INT * T2.W, T1.Y, literal.y,
@@ -1365,18 +1360,18 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; R600-NEXT: CNDE_INT T4.X, PS, PV.Z, PV.W,
; R600-NEXT: MUL_IEEE T0.Y, PV.Y, PV.X,
; R600-NEXT: SETGT T0.Z, literal.x, KC0[3].Z,
-; R600-NEXT: CNDE_INT T0.W, T5.W, T2.Y, T1.W,
-; R600-NEXT: MUL_IEEE * T1.W, T3.X, literal.y,
+; R600-NEXT: MUL_IEEE T0.W, T2.Y, literal.y,
+; R600-NEXT: CNDE_INT * T1.W, T4.W, T3.X, T1.W,
; R600-NEXT: -1026650416(-1.032789e+02), 2130706432(1.701412e+38)
-; R600-NEXT: CNDE_INT T3.X, T5.X, T3.X, PS,
-; R600-NEXT: CNDE_INT T1.Y, T5.Y, PV.W, T1.X,
+; R600-NEXT: CNDE_INT T1.X, T5.Y, PS, T1.X,
+; R600-NEXT: CNDE_INT T1.Y, T5.X, T2.Y, PV.W,
; R600-NEXT: CNDE T0.Z, PV.Z, PV.Y, 0.0,
; R600-NEXT: SETGT T0.W, KC0[3].Z, literal.x,
; R600-NEXT: LSHL * T1.W, PV.X, literal.y,
; R600-NEXT: 1118925336(8.872284e+01), 23(3.222986e-44)
-; R600-NEXT: ADD_INT T1.X, PS, literal.x,
+; R600-NEXT: ADD_INT T3.X, PS, literal.x,
; R600-NEXT: CNDE T0.Y, PV.W, PV.Z, literal.y,
-; R600-NEXT: CNDE_INT T0.Z, T2.W, PV.Y, PV.X,
+; R600-NEXT: CNDE_INT T0.Z, T2.W, PV.X, PV.Y,
; R600-NEXT: CNDE T0.W, T2.X, T0.X, 0.0,
; R600-NEXT: SETGT * T1.W, KC0[3].Y, literal.z,
; R600-NEXT: 1065353216(1.000000e+00), 2139095040(INF)
@@ -1397,197 +1392,193 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
;
; CM-LABEL: s_exp_v3f32:
; CM: ; %bb.0:
-; CM-NEXT: ALU 102, @6, KC0[CB0:0-32], KC1[]
-; CM-NEXT: ALU 80, @109, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T0.X
+; CM-NEXT: ALU 101, @6, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 77, @108, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T3.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 6:
; CM-NEXT: AND_INT * T0.W, KC0[3].Y, literal.x,
; CM-NEXT: -4096(nan), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T0.Z, PV.W, literal.x,
; CM-NEXT: ADD * T1.W, KC0[3].Y, -PV.W,
-; CM-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T1.Z, PV.W, literal.x,
-; CM-NEXT: RNDNE * T2.W, PV.Z,
-; CM-NEXT: 967029397(3.122284e-04), 0(0.000000e+00)
-; CM-NEXT: TRUNC T2.Z, PV.W,
+; CM-NEXT: MUL_IEEE T0.Z, PV.W, literal.x,
+; CM-NEXT: MUL_IEEE * T2.W, T0.W, literal.y,
+; CM-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00)
+; CM-NEXT: RNDNE T1.Z, PV.W,
; CM-NEXT: MULADD_IEEE * T1.W, T1.W, literal.x, PV.Z,
; CM-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
-; CM-NEXT: MULADD_IEEE T0.Y, T0.W, literal.x, PV.W,
-; CM-NEXT: ADD T0.Z, T0.Z, -T2.W,
-; CM-NEXT: FLT_TO_INT * T0.W, PV.Z,
+; CM-NEXT: MULADD_IEEE T0.Z, T0.W, literal.x, PV.W,
+; CM-NEXT: ADD * T0.W, T2.W, -PV.Z, BS:VEC_120/SCL_212
; CM-NEXT: 967029397(3.122284e-04), 0(0.000000e+00)
-; CM-NEXT: MIN_INT T1.Z, PV.W, literal.x,
-; CM-NEXT: ADD * T1.W, PV.Z, PV.Y,
+; CM-NEXT: TRUNC T1.Z, T1.Z,
+; CM-NEXT: ADD * T0.W, PV.W, PV.Z,
+; CM-NEXT: EXP_IEEE T0.X, T0.W,
+; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W,
+; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W,
+; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W,
+; CM-NEXT: FLT_TO_INT T0.Z, T1.Z,
+; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.x,
+; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
+; CM-NEXT: MUL_IEEE T0.Y, PV.W, literal.x,
+; CM-NEXT: MAX_INT T1.Z, PV.Z, literal.y,
+; CM-NEXT: MIN_INT * T1.W, PV.Z, literal.z,
+; CM-NEXT: 209715200(1.972152e-31), -330(nan)
; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00)
-; CM-NEXT: EXP_IEEE T0.X, T1.W,
-; CM-NEXT: EXP_IEEE T0.Y (MASKED), T1.W,
-; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W,
-; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W,
-; CM-NEXT: MUL_IEEE T0.Y, PV.X, literal.x,
-; CM-NEXT: ADD_INT T0.Z, T1.Z, literal.y,
-; CM-NEXT: MAX_INT * T1.W, T0.W, literal.z,
-; CM-NEXT: 2130706432(1.701412e+38), -254(nan)
-; CM-NEXT: -330(nan), 0(0.000000e+00)
-; CM-NEXT: ADD_INT T1.X, T0.W, literal.x,
-; CM-NEXT: ADD_INT T1.Y, PV.W, literal.y,
-; CM-NEXT: ADD_INT T1.Z, T0.W, literal.z,
-; CM-NEXT: SETGT_UINT * T1.W, T0.W, literal.w,
-; CM-NEXT: -127(nan), 204(2.858649e-43)
+; CM-NEXT: ADD_INT T1.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T1.Y, PV.Z, literal.y,
+; CM-NEXT: ADD_INT T1.Z, T0.Z, literal.z,
+; CM-NEXT: SETGT_UINT * T1.W, T0.Z, literal.w,
+; CM-NEXT: -254(nan), 204(2.858649e-43)
; CM-NEXT: 102(1.429324e-43), -229(nan)
-; CM-NEXT: SETGT_UINT T2.X, T0.W, literal.x,
-; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Y, PV.Z,
-; CM-NEXT: SETGT_INT T1.Z, T0.W, literal.y,
-; CM-NEXT: MUL_IEEE * T2.W, T0.X, literal.z,
-; CM-NEXT: 254(3.559298e-43), -127(nan)
-; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T3.X, PV.W, literal.x,
-; CM-NEXT: CNDE_INT T1.Y, PV.Z, PV.Y, T0.W,
-; CM-NEXT: CNDE_INT T0.Z, PV.X, T1.X, T0.Z,
-; CM-NEXT: SETGT_INT * T0.W, T0.W, literal.y,
-; CM-NEXT: 209715200(1.972152e-31), 127(1.779649e-43)
+; CM-NEXT: ADD_INT T2.X, T0.Z, literal.x,
+; CM-NEXT: SETGT_UINT T2.Y, T0.Z, literal.y,
+; CM-NEXT: CNDE_INT T1.Z, PV.W, PV.Y, PV.Z,
+; CM-NEXT: SETGT_INT * T2.W, T0.Z, literal.x,
+; CM-NEXT: -127(nan), 254(3.559298e-43)
+; CM-NEXT: MUL_IEEE T3.X, T0.X, literal.x,
+; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Z, T0.Z,
+; CM-NEXT: CNDE_INT T1.Z, PV.Y, PV.X, T1.X,
+; CM-NEXT: SETGT_INT * T3.W, T0.Z, literal.y,
+; CM-NEXT: 2130706432(1.701412e+38), 127(1.779649e-43)
; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Y, PV.Z,
-; CM-NEXT: CNDE_INT T0.Z, T1.W, PV.X, T2.W,
-; CM-NEXT: MUL_IEEE * T1.W, T0.Y, literal.x,
+; CM-NEXT: MUL_IEEE T0.Z, PV.X, literal.x,
+; CM-NEXT: CNDE_INT * T0.W, T1.W, T0.Y, T0.W,
; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T1.X, T2.X, T0.Y, PV.W,
-; CM-NEXT: CNDE_INT T0.Y, T1.Z, PV.Z, T0.X,
+; CM-NEXT: CNDE_INT T0.X, T2.W, PV.W, T0.X,
+; CM-NEXT: CNDE_INT T0.Y, T2.Y, T3.X, PV.Z,
; CM-NEXT: LSHL T0.Z, PV.Y, literal.x,
-; CM-NEXT: AND_INT * T1.W, KC0[3].Z, literal.y,
+; CM-NEXT: AND_INT * T0.W, KC0[3].Z, literal.y,
; CM-NEXT: 23(3.222986e-44), -4096(nan)
-; CM-NEXT: MUL_IEEE T0.X, PV.W, literal.x,
; CM-NEXT: ADD T1.Y, KC0[3].Z, -PV.W,
-; CM-NEXT: ADD_INT T0.Z, PV.Z, literal.y,
-; CM-NEXT: CNDE_INT * T0.W, T0.W, PV.Y, PV.X,
-; CM-NEXT: 1069064192(1.442383e+00), 1065353216(1.000000e+00)
-; CM-NEXT: MUL_IEEE T0.Y, PV.W, PV.Z,
-; CM-NEXT: MUL_IEEE T0.Z, PV.Y, literal.x,
-; CM-NEXT: RNDNE * T0.W, PV.X,
-; CM-NEXT: 967029397(3.122284e-04), 0(0.000000e+00)
+; CM-NEXT: ADD_INT T0.Z, PV.Z, literal.x,
+; CM-NEXT: CNDE_INT * T1.W, T3.W, PV.X, PV.Y,
+; CM-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00)
+; CM-NEXT: MUL_IEEE T0.X, PV.W, PV.Z,
+; CM-NEXT: MUL_IEEE T0.Y, PV.Y, literal.x,
+; CM-NEXT: MUL_IEEE T0.Z, T0.W, literal.y,
+; CM-NEXT: AND_INT * T1.W, KC0[3].W, literal.z,
+; CM-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00)
+; CM-NEXT: -4096(nan), 0(0.000000e+00)
; CM-NEXT: SETGT T1.X, literal.x, KC0[3].Y,
-; CM-NEXT: TRUNC T2.Y, PV.W,
-; CM-NEXT: AND_INT T1.Z, KC0[3].W, literal.y,
-; CM-NEXT: MULADD_IEEE * T2.W, T1.Y, literal.z, PV.Z,
-; CM-NEXT: -1026650416(-1.032789e+02), -4096(nan)
-; CM-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
-; CM-NEXT: MULADD_IEEE T2.X, T1.W, literal.x, PV.W,
-; CM-NEXT: MUL_IEEE T1.Y, PV.Z, literal.y,
-; CM-NEXT: FLT_TO_INT T0.Z, PV.Y,
-; CM-NEXT: ADD * T1.W, KC0[3].W, -PV.Z,
+; CM-NEXT: ADD T2.Y, KC0[3].W, -PV.W,
+; CM-NEXT: RNDNE T1.Z, PV.Z,
+; CM-NEXT: MULADD_IEEE * T2.W, T1.Y, literal.y, PV.Y,
+; CM-NEXT: -1026650416(-1.032789e+02), 1069064192(1.442383e+00)
+; CM-NEXT: MULADD_IEEE T2.X, T0.W, literal.x, PV.W,
+; CM-NEXT: ADD T0.Y, T0.Z, -PV.Z,
+; CM-NEXT: MUL_IEEE T0.Z, PV.Y, literal.x,
+; CM-NEXT: MUL_IEEE * T0.W, T1.W, literal.y, BS:VEC_120/SCL_212
; CM-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00)
-; CM-NEXT: ADD T0.X, T0.X, -T0.W,
-; CM-NEXT: MUL_IEEE T2.Y, PV.W, literal.x,
-; CM-NEXT: MAX_INT T2.Z, PV.Z, literal.y,
-; CM-NEXT: RNDNE * T0.W, PV.Y,
-; CM-NEXT: 967029397(3.122284e-04), -330(nan)
-; CM-NEXT: TRUNC T3.X, PV.W,
-; CM-NEXT: ADD_INT T3.Y, PV.Z, literal.x,
-; CM-NEXT: MULADD_IEEE T2.Z, T1.W, literal.y, PV.Y,
-; CM-NEXT: ADD * T1.W, PV.X, T2.X,
-; CM-NEXT: 204(2.858649e-43), 1069064192(1.442383e+00)
-; CM-NEXT: EXP_IEEE T0.X, T1.W,
-; CM-NEXT: EXP_IEEE T0.Y (MASKED), T1.W,
-; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W,
-; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W,
-; CM-NEXT: ADD_INT T2.X, T0.Z, literal.x,
-; CM-NEXT: MULADD_IEEE T2.Y, T1.Z, literal.y, T2.Z, BS:VEC_102/SCL_221
-; CM-NEXT: ADD T1.Z, T1.Y, -T0.W,
-; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.z,
-; CM-NEXT: 102(1.429324e-43), 967029397(3.122284e-04)
-; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; CM-NEXT: SETGT_UINT T4.X, T0.Z, literal.x,
-; CM-NEXT: MUL_IEEE T1.Y, PV.W, literal.y,
-; CM-NEXT: SETGT_UINT T2.Z, T0.Z, literal.z,
-; CM-NEXT: ADD * T1.W, PV.Z, PV.Y,
-; CM-NEXT: -229(nan), 2130706432(1.701412e+38)
-; CM-NEXT: 254(3.559298e-43), 0(0.000000e+00)
+; CM-NEXT: TRUNC T3.X, T1.Z,
+; CM-NEXT: RNDNE T1.Y, PV.W,
+; CM-NEXT: MULADD_IEEE T0.Z, T2.Y, literal.x, PV.Z,
+; CM-NEXT: ADD * T2.W, PV.Y, PV.X,
+; CM-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
+; CM-NEXT: EXP_IEEE T0.X (MASKED), T2.W,
+; CM-NEXT: EXP_IEEE T0.Y, T2.W,
+; CM-NEXT: EXP_IEEE T0.Z (MASKED), T2.W,
+; CM-NEXT: EXP_IEEE * T0.W (MASKED), T2.W,
+; CM-NEXT: MULADD_IEEE T2.X, T1.W, literal.x, T0.Z,
+; CM-NEXT: ADD T2.Y, T0.W, -T1.Y, BS:VEC_120/SCL_212
+; CM-NEXT: FLT_TO_INT T0.Z, T3.X,
+; CM-NEXT: MUL_IEEE * T0.W, PV.Y, literal.y,
+; CM-NEXT: 967029397(3.122284e-04), 209715200(1.972152e-31)
+; CM-NEXT: MUL_IEEE T3.X, PV.W, literal.x,
+; CM-NEXT: SETGT_UINT T3.Y, PV.Z, literal.y,
+; CM-NEXT: TRUNC T1.Z, T1.Y,
+; CM-NEXT: ADD * T1.W, PV.Y, PV.X,
+; CM-NEXT: 209715200(1.972152e-31), -229(nan)
; CM-NEXT: EXP_IEEE T1.X (MASKED), T1.W,
-; CM-NEXT: EXP_IEEE T1.Y (MASKED), T1.W,
-; CM-NEXT: EXP_IEEE T1.Z, T1.W,
+; CM-NEXT: EXP_IEEE T1.Y, T1.W,
+; CM-NEXT: EXP_IEEE T1.Z (MASKED), T1.W,
; CM-NEXT: EXP_IEEE * T1.W (MASKED), T1.W,
-; CM-NEXT: ALU clause starting at 109:
-; CM-NEXT: CNDE_INT T5.X, T2.Z, T0.W, T1.Y,
-; CM-NEXT: CNDE_INT T1.Y, T4.X, T3.Y, T2.X,
-; CM-NEXT: FLT_TO_INT T3.Z, T3.X, BS:VEC_120/SCL_212
-; CM-NEXT: MUL_IEEE * T0.W, T1.Z, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; CM-NEXT: SETGT_INT T2.X, T0.Z, literal.x,
-; CM-NEXT: MUL_IEEE T2.Y, T0.X, literal.y,
-; CM-NEXT: MUL_IEEE T4.Z, PV.W, literal.z,
-; CM-NEXT: SETGT_UINT * T1.W, PV.Z, literal.w,
-; CM-NEXT: -127(nan), 209715200(1.972152e-31)
-; CM-NEXT: 2130706432(1.701412e+38), 254(3.559298e-43)
-; CM-NEXT: CNDE_INT T3.X, PV.W, T0.W, PV.Z,
-; CM-NEXT: MUL_IEEE T3.Y, PV.Y, literal.x,
-; CM-NEXT: CNDE_INT T4.Z, PV.X, T1.Y, T0.Z,
-; CM-NEXT: MAX_INT * T0.W, T3.Z, literal.y,
-; CM-NEXT: 209715200(1.972152e-31), -330(nan)
-; CM-NEXT: ADD_INT T6.X, PV.W, literal.x,
-; CM-NEXT: ADD_INT T1.Y, T3.Z, literal.y,
-; CM-NEXT: SETGT_UINT T5.Z, T3.Z, literal.z,
-; CM-NEXT: MUL_IEEE * T0.W, T1.Z, literal.w, BS:VEC_120/SCL_212
+; CM-NEXT: FLT_TO_INT T2.X, T1.Z,
+; CM-NEXT: MUL_IEEE T2.Y, PV.Y, literal.x,
+; CM-NEXT: CNDE_INT T1.Z, T3.Y, T3.X, T0.W,
+; CM-NEXT: SETGT_INT * T0.W, T0.Z, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT: 209715200(1.972152e-31), -127(nan)
+; CM-NEXT: CNDE_INT T3.X, PV.W, PV.Z, T0.Y,
+; CM-NEXT: MUL_IEEE * T4.Y, PV.Y, literal.x,
+; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
+; CM-NEXT: ALU clause starting at 108:
+; CM-NEXT: SETGT_UINT T1.Z, T2.X, literal.x,
+; CM-NEXT: MAX_INT * T1.W, T0.Z, literal.y,
+; CM-NEXT: -229(nan), -330(nan)
+; CM-NEXT: ADD_INT T4.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T5.Y, T0.Z, literal.y,
+; CM-NEXT: CNDE_INT T2.Z, PV.Z, T4.Y, T2.Y,
+; CM-NEXT: SETGT_INT * T1.W, T2.X, literal.z,
; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43)
-; CM-NEXT: -229(nan), 209715200(1.972152e-31)
-; CM-NEXT: MUL_IEEE T7.X, PV.W, literal.x,
-; CM-NEXT: MIN_INT T4.Y, T3.Z, literal.y,
-; CM-NEXT: CNDE_INT T6.Z, PV.Z, PV.X, PV.Y,
-; CM-NEXT: SETGT_INT * T2.W, T3.Z, literal.z,
-; CM-NEXT: 209715200(1.972152e-31), 381(5.338947e-43)
; CM-NEXT: -127(nan), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T6.X, PV.W, PV.Z, T3.Z,
-; CM-NEXT: MIN_INT T1.Y, T0.Z, literal.x,
-; CM-NEXT: ADD_INT T6.Z, PV.Y, literal.y,
-; CM-NEXT: ADD_INT * T3.W, T3.Z, literal.z, BS:VEC_120/SCL_212
-; CM-NEXT: 381(5.338947e-43), -254(nan)
-; CM-NEXT: -127(nan), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T8.X, T1.W, PV.W, PV.Z,
-; CM-NEXT: SETGT_INT T4.Y, T3.Z, literal.x,
-; CM-NEXT: ADD_INT T3.Z, PV.Y, literal.y,
-; CM-NEXT: ADD_INT * T1.W, T0.Z, literal.z, BS:VEC_120/SCL_212
+; CM-NEXT: CNDE_INT T5.X, PV.W, PV.Z, T1.Y,
+; CM-NEXT: MUL_IEEE T0.Y, T0.Y, literal.x,
+; CM-NEXT: MAX_INT T2.Z, T2.X, literal.y,
+; CM-NEXT: CNDE_INT * T2.W, T3.Y, PV.X, PV.Y, BS:VEC_120/SCL_212
+; CM-NEXT: 2130706432(1.701412e+38), -330(nan)
+; CM-NEXT: CNDE_INT T4.X, T0.W, PV.W, T0.Z,
+; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.x,
+; CM-NEXT: ADD_INT T2.Z, T2.X, literal.y,
+; CM-NEXT: MIN_INT * T0.W, T2.X, literal.z,
+; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43)
+; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00)
+; CM-NEXT: ADD_INT T6.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T3.Y, T2.X, literal.y,
+; CM-NEXT: SETGT_UINT T3.Z, T2.X, literal.z,
+; CM-NEXT: CNDE_INT * T0.W, T1.Z, PV.Y, PV.Z,
+; CM-NEXT: -254(nan), -127(nan)
+; CM-NEXT: 254(3.559298e-43), 0(0.000000e+00)
+; CM-NEXT: MUL_IEEE T7.X, T1.Y, literal.x,
+; CM-NEXT: CNDE_INT T1.Y, T1.W, PV.W, T2.X,
+; CM-NEXT: CNDE_INT T1.Z, PV.Z, PV.Y, PV.X,
+; CM-NEXT: MIN_INT * T0.W, T0.Z, literal.y,
+; CM-NEXT: 2130706432(1.701412e+38), 381(5.338947e-43)
+; CM-NEXT: SETGT_INT T2.X, T2.X, literal.x,
+; CM-NEXT: ADD_INT T2.Y, PV.W, literal.y,
+; CM-NEXT: ADD_INT T2.Z, T0.Z, literal.z,
+; CM-NEXT: SETGT_UINT * T0.W, T0.Z, literal.w,
; CM-NEXT: 127(1.779649e-43), -254(nan)
-; CM-NEXT: -127(nan), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T9.X, T2.Z, PV.W, PV.Z,
-; CM-NEXT: SETGT_INT T1.Y, T0.Z, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT: CNDE_INT T0.Z, PV.Y, T6.X, PV.X,
-; CM-NEXT: CNDE_INT * T0.W, T5.Z, T7.X, T0.W, BS:VEC_201
-; CM-NEXT: 127(1.779649e-43), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T6.X, T2.W, PV.W, T1.Z,
-; CM-NEXT: LSHL T5.Y, PV.Z, literal.x,
-; CM-NEXT: CNDE_INT T0.Z, PV.Y, T4.Z, PV.X,
-; CM-NEXT: CNDE_INT * T0.W, T4.X, T3.Y, T2.Y,
-; CM-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T0.X, T2.X, PV.W, T0.X,
-; CM-NEXT: LSHL T2.Y, PV.Z, literal.x,
+; CM-NEXT: -127(nan), 254(3.559298e-43)
+; CM-NEXT: CNDE_INT T6.X, PV.W, PV.Z, PV.Y,
+; CM-NEXT: SETGT_INT T2.Y, T0.Z, literal.x,
+; CM-NEXT: CNDE_INT T0.Z, PV.X, T1.Y, T1.Z,
+; CM-NEXT: MUL_IEEE * T1.W, T7.X, literal.y,
+; CM-NEXT: 127(1.779649e-43), 2130706432(1.701412e+38)
+; CM-NEXT: CNDE_INT T7.X, T3.Z, T7.X, PV.W,
+; CM-NEXT: LSHL T1.Y, PV.Z, literal.x,
+; CM-NEXT: CNDE_INT T0.Z, PV.Y, T4.X, PV.X, BS:VEC_021/SCL_122
+; CM-NEXT: MUL_IEEE * T1.W, T0.Y, literal.y,
+; CM-NEXT: 23(3.222986e-44), 2130706432(1.701412e+38)
+; CM-NEXT: CNDE_INT T4.X, T0.W, T0.Y, PV.W,
+; CM-NEXT: LSHL T0.Y, PV.Z, literal.x,
; CM-NEXT: ADD_INT T0.Z, PV.Y, literal.y,
-; CM-NEXT: CNDE_INT * T0.W, T4.Y, PV.X, T3.X, BS:VEC_021/SCL_122
+; CM-NEXT: CNDE_INT * T0.W, T2.X, T5.X, PV.X,
; CM-NEXT: 23(3.222986e-44), 1065353216(1.000000e+00)
; CM-NEXT: MUL_IEEE T2.X, PV.W, PV.Z,
-; CM-NEXT: SETGT T3.Y, literal.x, KC0[3].W,
+; CM-NEXT: SETGT T1.Y, literal.x, KC0[3].W,
; CM-NEXT: ADD_INT T0.Z, PV.Y, literal.y,
-; CM-NEXT: CNDE_INT * T0.W, T1.Y, PV.X, T5.X,
+; CM-NEXT: CNDE_INT * T0.W, T2.Y, T3.X, PV.X,
; CM-NEXT: -1026650416(-1.032789e+02), 1065353216(1.000000e+00)
-; CM-NEXT: MUL_IEEE T0.X, PV.W, PV.Z,
-; CM-NEXT: SETGT T1.Y, literal.x, KC0[3].Z,
+; CM-NEXT: MUL_IEEE T3.X, PV.W, PV.Z,
+; CM-NEXT: SETGT T0.Y, literal.x, KC0[3].Z,
; CM-NEXT: CNDE T0.Z, PV.Y, PV.X, 0.0,
; CM-NEXT: SETGT * T0.W, KC0[3].W, literal.y,
; CM-NEXT: -1026650416(-1.032789e+02), 1118925336(8.872284e+01)
; CM-NEXT: CNDE T2.X, PV.W, PV.Z, literal.x,
-; CM-NEXT: CNDE T1.Y, PV.Y, PV.X, 0.0,
+; CM-NEXT: CNDE T0.Y, PV.Y, PV.X, 0.0,
; CM-NEXT: SETGT T0.Z, KC0[3].Z, literal.y,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
; CM-NEXT: 2139095040(INF), 1118925336(8.872284e+01)
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT: LSHR T0.X, PV.W, literal.x,
-; CM-NEXT: CNDE T1.Y, PV.Z, PV.Y, literal.y,
-; CM-NEXT: CNDE T0.Z, T1.X, T0.Y, 0.0,
+; CM-NEXT: LSHR T3.X, PV.W, literal.x,
+; CM-NEXT: CNDE T0.Y, PV.Z, PV.Y, literal.y,
+; CM-NEXT: CNDE T0.Z, T1.X, T0.X, 0.0,
; CM-NEXT: SETGT * T0.W, KC0[3].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 2139095040(INF)
; CM-NEXT: 1118925336(8.872284e+01), 0(0.000000e+00)
-; CM-NEXT: CNDE * T1.X, PV.W, PV.Z, literal.x,
+; CM-NEXT: CNDE * T0.X, PV.W, PV.Z, literal.x,
; CM-NEXT: 2139095040(INF), 0(0.000000e+00)
-; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
+; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%result = call <3 x float> @llvm.exp.v3f32(<3 x float> %in)
store <3 x float> %result, ptr addrspace(1) %out
@@ -2050,227 +2041,224 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; R600-LABEL: s_exp_v4f32:
; R600: ; %bb.0:
; R600-NEXT: ALU 98, @6, KC0[CB0:0-32], KC1[]
-; R600-NEXT: ALU 98, @105, KC0[CB0:0-32], KC1[]
-; R600-NEXT: ALU 24, @204, KC0[CB0:0-32], KC1[]
+; R600-NEXT: ALU 95, @105, KC0[CB0:0-32], KC1[]
+; R600-NEXT: ALU 24, @201, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 6:
; R600-NEXT: AND_INT * T0.W, KC0[3].Z, literal.x,
; R600-NEXT: -4096(nan), 0(0.000000e+00)
-; R600-NEXT: ADD T1.W, KC0[3].Z, -PV.W,
-; R600-NEXT: MUL_IEEE * T2.W, PV.W, literal.x,
+; R600-NEXT: ADD * T1.W, KC0[3].Z, -PV.W,
+; R600-NEXT: MUL_IEEE T2.W, PV.W, literal.x,
+; R600-NEXT: MUL_IEEE * T3.W, T0.W, literal.y,
+; R600-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00)
+; R600-NEXT: RNDNE T4.W, PS,
+; R600-NEXT: MULADD_IEEE * T1.W, T1.W, literal.x, PV.W, BS:VEC_021/SCL_122
; R600-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
-; R600-NEXT: RNDNE T3.W, PS,
-; R600-NEXT: MUL_IEEE * T4.W, PV.W, literal.x,
+; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.x, PS,
+; R600-NEXT: ADD * T1.W, T3.W, -PV.W,
; R600-NEXT: 967029397(3.122284e-04), 0(0.000000e+00)
-; R600-NEXT: MULADD_IEEE T1.W, T1.W, literal.x, PS,
-; R600-NEXT: TRUNC * T4.W, PV.W,
-; R600-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
-; R600-NEXT: FLT_TO_INT T0.Z, PS,
-; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.x, PV.W,
-; R600-NEXT: ADD * T1.W, T2.W, -T3.W,
-; R600-NEXT: 967029397(3.122284e-04), 0(0.000000e+00)
-; R600-NEXT: ADD T1.Z, PS, PV.W,
-; R600-NEXT: MAX_INT T0.W, PV.Z, literal.x,
-; R600-NEXT: MIN_INT * T1.W, PV.Z, literal.y,
-; R600-NEXT: -330(nan), 381(5.338947e-43)
-; R600-NEXT: ADD_INT T0.X, PS, literal.x,
-; R600-NEXT: ADD_INT T0.Y, PV.W, literal.y,
-; R600-NEXT: ADD_INT T2.Z, T0.Z, literal.z,
-; R600-NEXT: SETGT_UINT T0.W, T0.Z, literal.w,
-; R600-NEXT: EXP_IEEE * T1.X, PV.Z,
-; R600-NEXT: -254(nan), 204(2.858649e-43)
-; R600-NEXT: 102(1.429324e-43), -229(nan)
-; R600-NEXT: ADD_INT T2.X, T0.Z, literal.x,
-; R600-NEXT: SETGT_UINT T1.Y, T0.Z, literal.y,
-; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Y, PV.Z,
-; R600-NEXT: SETGT_INT T1.W, T0.Z, literal.x,
-; R600-NEXT: MUL_IEEE * T2.W, PS, literal.z,
-; R600-NEXT: -127(nan), 254(3.559298e-43)
-; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T3.X, T1.X, literal.x,
-; R600-NEXT: MUL_IEEE T0.Y, PS, literal.y,
-; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Z, T0.Z,
-; R600-NEXT: CNDE_INT T3.W, PV.Y, PV.X, T0.X,
-; R600-NEXT: SETGT_INT * T4.W, T0.Z, literal.z,
-; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31)
-; R600-NEXT: 127(1.779649e-43), 0(0.000000e+00)
-; R600-NEXT: AND_INT T2.Y, KC0[4].X, literal.x,
-; R600-NEXT: CNDE_INT T0.Z, PS, PV.Z, PV.W,
-; R600-NEXT: CNDE_INT T0.W, T0.W, PV.Y, T2.W,
-; R600-NEXT: MUL_IEEE * T2.W, PV.X, literal.y,
-; R600-NEXT: -4096(nan), 2130706432(1.701412e+38)
-; R600-NEXT: CNDE_INT T0.X, T1.Y, T3.X, PS,
-; R600-NEXT: CNDE_INT T0.Y, T1.W, PV.W, T1.X,
-; R600-NEXT: LSHL T0.Z, PV.Z, literal.x,
-; R600-NEXT: ADD T0.W, KC0[4].X, -PV.Y,
-; R600-NEXT: MUL_IEEE * T1.W, PV.Y, literal.y,
-; R600-NEXT: 23(3.222986e-44), 1069064192(1.442383e+00)
-; R600-NEXT: RNDNE T1.Y, PS,
-; R600-NEXT: MUL_IEEE T1.Z, PV.W, literal.x,
-; R600-NEXT: ADD_INT T2.W, PV.Z, literal.y,
-; R600-NEXT: CNDE_INT * T3.W, T4.W, PV.Y, PV.X,
-; R600-NEXT: 967029397(3.122284e-04), 1065353216(1.000000e+00)
-; R600-NEXT: MUL_IEEE T0.Y, PS, PV.W,
-; R600-NEXT: AND_INT T0.Z, KC0[3].W, literal.x,
-; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.y, PV.Z,
-; R600-NEXT: TRUNC * T2.W, PV.Y,
-; R600-NEXT: -4096(nan), 1069064192(1.442383e+00)
-; R600-NEXT: SETGT T0.X, literal.x, KC0[3].Z,
-; R600-NEXT: FLT_TO_INT T3.Y, PS,
-; R600-NEXT: MULADD_IEEE T1.Z, T2.Y, literal.y, PV.W,
-; R600-NEXT: ADD T0.W, T1.W, -T1.Y,
-; R600-NEXT: MUL_IEEE * T1.W, PV.Z, literal.z,
-; R600-NEXT: -1026650416(-1.032789e+02), 967029397(3.122284e-04)
-; R600-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
-; R600-NEXT: RNDNE T1.X, PS,
-; R600-NEXT: AND_INT T1.Y, KC0[3].Y, literal.x,
-; R600-NEXT: ADD T1.Z, PV.W, PV.Z,
-; R600-NEXT: MAX_INT T0.W, PV.Y, literal.y,
-; R600-NEXT: MIN_INT * T2.W, PV.Y, literal.z,
-; R600-NEXT: -4096(nan), -330(nan)
+; R600-NEXT: ADD T0.W, PS, PV.W,
+; R600-NEXT: TRUNC * T1.W, T4.W,
+; R600-NEXT: FLT_TO_INT T1.W, PS,
+; R600-NEXT: EXP_IEEE * T0.X, PV.W,
+; R600-NEXT: MUL_IEEE T0.Z, PS, literal.x,
+; R600-NEXT: MAX_INT T0.W, PV.W, literal.y,
+; R600-NEXT: MIN_INT * T2.W, PV.W, literal.z,
+; R600-NEXT: 209715200(1.972152e-31), -330(nan)
; R600-NEXT: 381(5.338947e-43), 0(0.000000e+00)
-; R600-NEXT: ADD_INT T2.X, PS, literal.x,
-; R600-NEXT: ADD_INT T2.Y, PV.W, literal.y,
-; R600-NEXT: ADD_INT T2.Z, T3.Y, literal.z,
-; R600-NEXT: SETGT_UINT T0.W, T3.Y, literal.w,
-; R600-NEXT: EXP_IEEE * T1.Z, PV.Z,
-; R600-NEXT: -254(nan), 204(2.858649e-43)
-; R600-NEXT: 102(1.429324e-43), -229(nan)
-; R600-NEXT: ADD_INT T3.X, T3.Y, literal.x,
-; R600-NEXT: SETGT_UINT T4.Y, T3.Y, literal.y,
-; R600-NEXT: CNDE_INT T2.Z, PV.W, PV.Y, PV.Z,
-; R600-NEXT: SETGT_INT T2.W, T3.Y, literal.x,
-; R600-NEXT: MUL_IEEE * T3.W, PS, literal.z,
+; R600-NEXT: ADD_INT T1.X, PS, literal.x,
+; R600-NEXT: AND_INT T0.Y, KC0[4].X, literal.y,
+; R600-NEXT: ADD_INT T1.Z, PV.W, literal.z,
+; R600-NEXT: ADD_INT * T0.W, T1.W, literal.w,
+; R600-NEXT: -254(nan), -4096(nan)
+; R600-NEXT: 204(2.858649e-43), 102(1.429324e-43)
+; R600-NEXT: SETGT_UINT * T2.W, T1.W, literal.x,
+; R600-NEXT: -229(nan), 0(0.000000e+00)
+; R600-NEXT: ADD_INT T2.X, T1.W, literal.x,
+; R600-NEXT: SETGT_UINT T1.Y, T1.W, literal.y,
+; R600-NEXT: CNDE_INT T1.Z, PV.W, T1.Z, T0.W,
+; R600-NEXT: SETGT_INT T0.W, T1.W, literal.x,
+; R600-NEXT: ADD * T3.W, KC0[4].X, -T0.Y,
; R600-NEXT: -127(nan), 254(3.559298e-43)
-; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T4.X, T1.Z, literal.x,
-; R600-NEXT: MUL_IEEE T2.Y, PS, literal.y,
-; R600-NEXT: CNDE_INT T2.Z, PV.W, PV.Z, T3.Y,
-; R600-NEXT: CNDE_INT T4.W, PV.Y, PV.X, T2.X,
-; R600-NEXT: SETGT_INT * T5.W, T3.Y, literal.z,
-; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31)
+; R600-NEXT: MUL_IEEE T3.X, PS, literal.x,
+; R600-NEXT: MUL_IEEE T2.Y, T0.Y, literal.y,
+; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Z, T1.W,
+; R600-NEXT: CNDE_INT T4.W, PV.Y, PV.X, T1.X,
+; R600-NEXT: SETGT_INT * T1.W, T1.W, literal.z,
+; R600-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00)
; R600-NEXT: 127(1.779649e-43), 0(0.000000e+00)
-; R600-NEXT: ADD T2.X, KC0[3].W, -T0.Z,
-; R600-NEXT: CNDE_INT T3.Y, PS, PV.Z, PV.W,
-; R600-NEXT: CNDE_INT * T2.Z, T0.W, PV.Y, T3.W,
-; R600-NEXT: ALU clause starting at 105:
-; R600-NEXT: MUL_IEEE T0.W, T4.X, literal.x,
-; R600-NEXT: ADD * T3.W, KC0[3].Y, -T1.Y,
+; R600-NEXT: CNDE_INT T1.X, PS, PV.Z, PV.W,
+; R600-NEXT: RNDNE T3.Y, PV.Y,
+; R600-NEXT: MULADD_IEEE T1.Z, T3.W, literal.x, PV.X,
+; R600-NEXT: MUL_IEEE T3.W, T0.Z, literal.y,
+; R600-NEXT: MUL_IEEE * T4.W, T0.X, literal.z,
+; R600-NEXT: 1069064192(1.442383e+00), 209715200(1.972152e-31)
; R600-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
+; R600-NEXT: MUL_IEEE T2.X, PS, literal.x,
+; R600-NEXT: CNDE_INT T4.Y, T2.W, PV.W, T0.Z,
+; R600-NEXT: MULADD_IEEE T0.Z, T0.Y, literal.y, PV.Z,
+; R600-NEXT: ADD T2.W, T2.Y, -PV.Y, BS:VEC_120/SCL_212
+; R600-NEXT: AND_INT * T3.W, KC0[3].Y, literal.z,
+; R600-NEXT: 2130706432(1.701412e+38), 967029397(3.122284e-04)
+; R600-NEXT: -4096(nan), 0(0.000000e+00)
; R600-NEXT: MUL_IEEE T3.X, PS, literal.x,
-; R600-NEXT: MUL_IEEE T2.Y, T1.Y, literal.y,
-; R600-NEXT: CNDE_INT T3.Z, T4.Y, T4.X, PV.W, BS:VEC_120/SCL_212
-; R600-NEXT: CNDE_INT T0.W, T2.W, T2.Z, T1.Z,
-; R600-NEXT: LSHL * T2.W, T3.Y, literal.z,
-; R600-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00)
-; R600-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; R600-NEXT: ADD_INT T4.X, PS, literal.x,
-; R600-NEXT: CNDE_INT T3.Y, T5.W, PV.W, PV.Z,
-; R600-NEXT: RNDNE T1.Z, PV.Y,
-; R600-NEXT: MULADD_IEEE T0.W, T3.W, literal.y, PV.X, BS:VEC_120/SCL_212
-; R600-NEXT: MUL_IEEE * T2.W, T2.X, literal.z,
+; R600-NEXT: ADD T0.Y, PV.W, PV.Z,
+; R600-NEXT: CNDE_INT T0.Z, T0.W, PV.Y, T0.X, BS:VEC_021/SCL_122
+; R600-NEXT: CNDE_INT T0.W, T1.Y, T4.W, PV.X,
+; R600-NEXT: LSHL * T2.W, T1.X, literal.y,
+; R600-NEXT: 1069064192(1.442383e+00), 23(3.222986e-44)
+; R600-NEXT: AND_INT T0.X, KC0[3].W, literal.x,
+; R600-NEXT: TRUNC T1.Y, T3.Y,
+; R600-NEXT: ADD_INT T1.Z, PS, literal.y,
+; R600-NEXT: CNDE_INT T0.W, T1.W, PV.Z, PV.W,
+; R600-NEXT: EXP_IEEE * T0.Y, PV.Y,
+; R600-NEXT: -4096(nan), 1065353216(1.000000e+00)
+; R600-NEXT: MUL_IEEE T1.X, PV.W, PV.Z,
+; R600-NEXT: FLT_TO_INT T1.Y, PV.Y,
+; R600-NEXT: MUL_IEEE T0.Z, PS, literal.x,
+; R600-NEXT: ADD T0.W, KC0[3].W, -PV.X,
+; R600-NEXT: RNDNE * T1.W, T3.X,
+; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
+; R600-NEXT: SETGT T2.X, literal.x, KC0[3].Z,
+; R600-NEXT: TRUNC T2.Y, PS,
+; R600-NEXT: MUL_IEEE T1.Z, PV.W, literal.y,
+; R600-NEXT: MUL_IEEE T2.W, PV.Z, literal.z,
+; R600-NEXT: MAX_INT * T4.W, PV.Y, literal.w,
+; R600-NEXT: -1026650416(-1.032789e+02), 967029397(3.122284e-04)
+; R600-NEXT: 209715200(1.972152e-31), -330(nan)
+; R600-NEXT: ADD T4.X, KC0[3].Y, -T3.W,
+; R600-NEXT: ADD_INT T3.Y, PS, literal.x,
+; R600-NEXT: ADD_INT T2.Z, T1.Y, literal.y,
+; R600-NEXT: SETGT_UINT T4.W, T1.Y, literal.z,
+; R600-NEXT: MIN_INT * T5.W, T1.Y, literal.w,
+; R600-NEXT: 204(2.858649e-43), 102(1.429324e-43)
+; R600-NEXT: -229(nan), 381(5.338947e-43)
+; R600-NEXT: ADD_INT T5.X, PS, literal.x,
+; R600-NEXT: ADD_INT T4.Y, T1.Y, literal.y,
+; R600-NEXT: SETGT_UINT T3.Z, T1.Y, literal.z,
+; R600-NEXT: CNDE_INT T5.W, PV.W, PV.Y, PV.Z,
+; R600-NEXT: SETGT_INT * T6.W, T1.Y, literal.y,
+; R600-NEXT: -254(nan), -127(nan)
+; R600-NEXT: 254(3.559298e-43), 0(0.000000e+00)
+; R600-NEXT: MUL_IEEE T6.X, T0.Y, literal.x,
+; R600-NEXT: CNDE_INT T3.Y, PS, PV.W, T1.Y,
+; R600-NEXT: CNDE_INT * T2.Z, PV.Z, PV.Y, PV.X,
+; R600-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
+; R600-NEXT: ALU clause starting at 105:
+; R600-NEXT: SETGT_INT T5.W, T1.Y, literal.x,
+; R600-NEXT: MUL_IEEE * T7.W, T4.X, literal.y,
+; R600-NEXT: 127(1.779649e-43), 967029397(3.122284e-04)
+; R600-NEXT: MUL_IEEE T5.X, T0.X, literal.x,
+; R600-NEXT: MULADD_IEEE T1.Y, T4.X, literal.x, PS, BS:VEC_120/SCL_212
+; R600-NEXT: CNDE_INT T2.Z, PV.W, T3.Y, T2.Z,
+; R600-NEXT: MUL_IEEE T7.W, T6.X, literal.y, BS:VEC_201
+; R600-NEXT: CNDE_INT * T2.W, T4.W, T2.W, T0.Z,
+; R600-NEXT: 1069064192(1.442383e+00), 2130706432(1.701412e+38)
+; R600-NEXT: CNDE_INT T4.X, T6.W, PS, T0.Y,
+; R600-NEXT: CNDE_INT T0.Y, T3.Z, T6.X, PV.W,
+; R600-NEXT: LSHL T0.Z, PV.Z, literal.x,
+; R600-NEXT: MULADD_IEEE T2.W, T3.W, literal.y, PV.Y, BS:VEC_201
+; R600-NEXT: ADD * T1.W, T3.X, -T1.W,
+; R600-NEXT: 23(3.222986e-44), 967029397(3.122284e-04)
+; R600-NEXT: ADD T3.X, PS, PV.W,
+; R600-NEXT: ADD_INT T1.Y, PV.Z, literal.x,
+; R600-NEXT: CNDE_INT T0.Z, T5.W, PV.X, PV.Y,
+; R600-NEXT: RNDNE T1.W, T5.X,
+; R600-NEXT: MULADD_IEEE * T0.W, T0.W, literal.y, T1.Z, BS:VEC_021/SCL_122
; R600-NEXT: 1065353216(1.000000e+00), 1069064192(1.442383e+00)
-; R600-NEXT: 967029397(3.122284e-04), 0(0.000000e+00)
-; R600-NEXT: MULADD_IEEE T2.X, T2.X, literal.x, PS,
-; R600-NEXT: MULADD_IEEE T1.Y, T1.Y, literal.y, PV.W,
-; R600-NEXT: ADD T2.Z, T2.Y, -PV.Z, BS:VEC_120/SCL_212
-; R600-NEXT: MUL_IEEE T0.W, PV.Y, PV.X,
-; R600-NEXT: SETGT * T2.W, literal.z, KC0[4].X,
-; R600-NEXT: 1069064192(1.442383e+00), 967029397(3.122284e-04)
-; R600-NEXT: -1026650416(-1.032789e+02), 0(0.000000e+00)
-; R600-NEXT: CNDE T3.X, PS, PV.W, 0.0,
-; R600-NEXT: ADD T1.Y, PV.Z, PV.Y,
-; R600-NEXT: TRUNC T1.Z, T1.Z,
-; R600-NEXT: MULADD_IEEE T0.W, T0.Z, literal.x, PV.X, BS:VEC_120/SCL_212
-; R600-NEXT: ADD * T1.W, T1.W, -T1.X,
-; R600-NEXT: 967029397(3.122284e-04), 0(0.000000e+00)
-; R600-NEXT: SETGT T2.X, KC0[4].X, literal.x,
-; R600-NEXT: ADD T2.Y, PS, PV.W,
-; R600-NEXT: FLT_TO_INT T0.Z, PV.Z,
-; R600-NEXT: TRUNC T0.W, T1.X,
-; R600-NEXT: EXP_IEEE * T1.X, PV.Y,
-; R600-NEXT: 1118925336(8.872284e+01), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T4.X, PS, literal.x,
-; R600-NEXT: FLT_TO_INT T1.Y, PV.W,
-; R600-NEXT: MAX_INT T1.Z, PV.Z, literal.y,
-; R600-NEXT: MUL_IEEE T0.W, PS, literal.z,
-; R600-NEXT: EXP_IEEE * T1.W, PV.Y,
-; R600-NEXT: 2130706432(1.701412e+38), -330(nan)
+; R600-NEXT: MULADD_IEEE T0.X, T0.X, literal.x, PS,
+; R600-NEXT: ADD T0.Y, T5.X, -PV.W, BS:VEC_120/SCL_212
+; R600-NEXT: MUL_IEEE T0.Z, PV.Z, PV.Y,
+; R600-NEXT: SETGT T0.W, literal.y, KC0[4].X,
+; R600-NEXT: EXP_IEEE * T1.Y, PV.X,
+; R600-NEXT: 967029397(3.122284e-04), -1026650416(-1.032789e+02)
+; R600-NEXT: CNDE T3.X, PV.W, PV.Z, 0.0,
+; R600-NEXT: ADD T0.Y, PV.Y, PV.X,
+; R600-NEXT: FLT_TO_INT T0.Z, T2.Y,
+; R600-NEXT: TRUNC T0.W, T1.W,
+; R600-NEXT: MUL_IEEE * T1.W, PS, literal.x,
; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T5.X, PV.W, literal.x,
-; R600-NEXT: MUL_IEEE T2.Y, PS, literal.x,
-; R600-NEXT: ADD_INT T1.Z, PV.Z, literal.y,
-; R600-NEXT: ADD_INT T2.W, T0.Z, literal.z,
-; R600-NEXT: MAX_INT * T3.W, PV.Y, literal.w,
-; R600-NEXT: 209715200(1.972152e-31), 204(2.858649e-43)
-; R600-NEXT: 102(1.429324e-43), -330(nan)
-; R600-NEXT: SETGT_UINT T6.X, T0.Z, literal.x,
-; R600-NEXT: ADD_INT T3.Y, PS, literal.y,
-; R600-NEXT: ADD_INT T2.Z, T1.Y, literal.z,
-; R600-NEXT: SETGT_UINT T3.W, T1.Y, literal.x,
-; R600-NEXT: MIN_INT * T4.W, T1.Y, literal.w,
+; R600-NEXT: SETGT T0.X, KC0[4].X, literal.x,
+; R600-NEXT: MUL_IEEE T2.Y, PS, literal.y,
+; R600-NEXT: FLT_TO_INT T1.Z, PV.W,
+; R600-NEXT: MAX_INT T0.W, PV.Z, literal.z,
+; R600-NEXT: EXP_IEEE * T0.Y, PV.Y,
+; R600-NEXT: 1118925336(8.872284e+01), 209715200(1.972152e-31)
+; R600-NEXT: -330(nan), 0(0.000000e+00)
+; R600-NEXT: MUL_IEEE T4.X, T1.Y, literal.x,
+; R600-NEXT: MUL_IEEE T3.Y, PS, literal.y,
+; R600-NEXT: ADD_INT T2.Z, PV.W, literal.z,
+; R600-NEXT: ADD_INT * T0.W, T0.Z, literal.w,
+; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31)
+; R600-NEXT: 204(2.858649e-43), 102(1.429324e-43)
+; R600-NEXT: MAX_INT * T2.W, T1.Z, literal.x,
+; R600-NEXT: -330(nan), 0(0.000000e+00)
+; R600-NEXT: SETGT_UINT T5.X, T0.Z, literal.x,
+; R600-NEXT: ADD_INT T4.Y, PV.W, literal.y,
+; R600-NEXT: ADD_INT T3.Z, T1.Z, literal.z, BS:VEC_120/SCL_212
+; R600-NEXT: SETGT_UINT T2.W, T1.Z, literal.x, BS:VEC_120/SCL_212
+; R600-NEXT: MIN_INT * T3.W, T1.Z, literal.w,
; R600-NEXT: -229(nan), 204(2.858649e-43)
; R600-NEXT: 102(1.429324e-43), 381(5.338947e-43)
-; R600-NEXT: ADD_INT T7.X, PS, literal.x,
-; R600-NEXT: ADD_INT T4.Y, T1.Y, literal.y,
-; R600-NEXT: SETGT_UINT T3.Z, T1.Y, literal.z,
-; R600-NEXT: CNDE_INT T4.W, PV.W, PV.Y, PV.Z,
-; R600-NEXT: SETGT_INT * T5.W, T1.Y, literal.y,
+; R600-NEXT: ADD_INT T6.X, PS, literal.x,
+; R600-NEXT: ADD_INT T5.Y, T1.Z, literal.y,
+; R600-NEXT: SETGT_UINT T4.Z, T1.Z, literal.z,
+; R600-NEXT: CNDE_INT T3.W, PV.W, PV.Y, PV.Z,
+; R600-NEXT: SETGT_INT * T4.W, T1.Z, literal.y,
; R600-NEXT: -254(nan), -127(nan)
; R600-NEXT: 254(3.559298e-43), 0(0.000000e+00)
-; R600-NEXT: CNDE_INT T8.X, PS, PV.W, T1.Y,
-; R600-NEXT: CNDE_INT T3.Y, PV.Z, PV.Y, PV.X,
-; R600-NEXT: SETGT_INT T2.Z, T1.Y, literal.x,
-; R600-NEXT: CNDE_INT T2.W, T6.X, T1.Z, T2.W,
-; R600-NEXT: SETGT_INT * T4.W, T0.Z, literal.y,
+; R600-NEXT: CNDE_INT T7.X, PS, PV.W, T1.Z, BS:VEC_021/SCL_122
+; R600-NEXT: CNDE_INT T4.Y, PV.Z, PV.Y, PV.X,
+; R600-NEXT: SETGT_INT T1.Z, T1.Z, literal.x, BS:VEC_120/SCL_212
+; R600-NEXT: CNDE_INT T0.W, T5.X, T2.Z, T0.W, BS:VEC_102/SCL_221
+; R600-NEXT: SETGT_INT * T3.W, T0.Z, literal.y,
; R600-NEXT: 127(1.779649e-43), -127(nan)
-; R600-NEXT: CNDE_INT T7.X, PS, PV.W, T0.Z,
-; R600-NEXT: CNDE_INT T1.Y, PV.Z, PV.X, PV.Y,
-; R600-NEXT: MIN_INT T1.Z, T0.Z, literal.x,
-; R600-NEXT: MUL_IEEE T2.W, T1.W, literal.y,
-; R600-NEXT: MUL_IEEE * T6.W, T2.Y, literal.z,
-; R600-NEXT: 381(5.338947e-43), 2130706432(1.701412e+38)
-; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; R600-NEXT: CNDE_INT T8.X, T3.W, PS, T2.Y,
-; R600-NEXT: MUL_IEEE T2.Y, PV.W, literal.x,
-; R600-NEXT: ADD_INT T1.Z, PV.Z, literal.y,
-; R600-NEXT: ADD_INT T3.W, T0.Z, literal.z,
-; R600-NEXT: SETGT_UINT * T6.W, T0.Z, literal.w,
+; R600-NEXT: CNDE_INT T6.X, PS, PV.W, T0.Z,
+; R600-NEXT: CNDE_INT T4.Y, PV.Z, PV.X, PV.Y,
+; R600-NEXT: MIN_INT T2.Z, T0.Z, literal.x,
+; R600-NEXT: MUL_IEEE T0.W, T3.Y, literal.y,
+; R600-NEXT: MUL_IEEE * T5.W, T0.Y, literal.z,
+; R600-NEXT: 381(5.338947e-43), 209715200(1.972152e-31)
+; R600-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
+; R600-NEXT: MUL_IEEE T7.X, PS, literal.x,
+; R600-NEXT: CNDE_INT T3.Y, T2.W, PV.W, T3.Y,
+; R600-NEXT: ADD_INT T2.Z, PV.Z, literal.y,
+; R600-NEXT: ADD_INT T0.W, T0.Z, literal.z,
+; R600-NEXT: SETGT_UINT * T2.W, T0.Z, literal.w,
; R600-NEXT: 2130706432(1.701412e+38), -254(nan)
; R600-NEXT: -127(nan), 254(3.559298e-43)
-; R600-NEXT: CNDE_INT T9.X, PS, PV.W, PV.Z,
-; R600-NEXT: SETGT_INT T3.Y, T0.Z, literal.x,
-; R600-NEXT: CNDE_INT T0.Z, T3.Z, T2.W, PV.Y, BS:VEC_120/SCL_212
-; R600-NEXT: CNDE_INT T1.W, T5.W, PV.X, T1.W, BS:VEC_021/SCL_122
-; R600-NEXT: LSHL * T2.W, T1.Y, literal.y,
+; R600-NEXT: CNDE_INT T8.X, PS, PV.W, PV.Z,
+; R600-NEXT: SETGT_INT T5.Y, T0.Z, literal.x,
+; R600-NEXT: CNDE_INT T0.Z, T4.W, PV.Y, T0.Y, BS:VEC_021/SCL_122
+; R600-NEXT: CNDE_INT T0.W, T4.Z, T5.W, PV.X, BS:VEC_120/SCL_212
+; R600-NEXT: LSHL * T4.W, T4.Y, literal.y,
; R600-NEXT: 127(1.779649e-43), 23(3.222986e-44)
-; R600-NEXT: ADD_INT T8.X, PS, literal.x,
-; R600-NEXT: CNDE_INT T1.Y, T2.Z, PV.W, PV.Z,
-; R600-NEXT: CNDE_INT T0.Z, PV.Y, T7.X, PV.X,
-; R600-NEXT: CNDE_INT * T0.W, T6.X, T5.X, T0.W, BS:VEC_021/SCL_122
-; R600-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE * T1.W, T4.X, literal.x,
-; R600-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; R600-NEXT: CNDE_INT T4.X, T6.W, T4.X, PV.W,
-; R600-NEXT: CNDE_INT * T2.Y, T4.W, T0.W, T1.X, BS:VEC_120/SCL_212
-; R600-NEXT: ALU clause starting at 204:
+; R600-NEXT: ADD_INT T7.X, PS, literal.x,
+; R600-NEXT: CNDE_INT T0.Y, T1.Z, PV.Z, PV.W,
+; R600-NEXT: CNDE_INT T0.Z, PV.Y, T6.X, PV.X,
+; R600-NEXT: MUL_IEEE T0.W, T4.X, literal.y,
+; R600-NEXT: CNDE_INT * T1.W, T5.X, T2.Y, T1.W,
+; R600-NEXT: 1065353216(1.000000e+00), 2130706432(1.701412e+38)
+; R600-NEXT: CNDE_INT T5.X, T3.W, PS, T1.Y,
+; R600-NEXT: CNDE_INT * T1.Y, T2.W, T4.X, PV.W, BS:VEC_120/SCL_212
+; R600-NEXT: ALU clause starting at 201:
; R600-NEXT: LSHL T0.Z, T0.Z, literal.x,
-; R600-NEXT: MUL_IEEE T0.W, T1.Y, T8.X,
+; R600-NEXT: MUL_IEEE T0.W, T0.Y, T7.X,
; R600-NEXT: SETGT * T1.W, literal.y, KC0[3].W,
; R600-NEXT: 23(3.222986e-44), -1026650416(-1.032789e+02)
-; R600-NEXT: CNDE T1.X, PS, PV.W, 0.0,
-; R600-NEXT: SETGT T1.Y, KC0[3].W, literal.x,
+; R600-NEXT: CNDE T4.X, PS, PV.W, 0.0,
+; R600-NEXT: SETGT T0.Y, KC0[3].W, literal.x,
; R600-NEXT: ADD_INT T0.Z, PV.Z, literal.y,
-; R600-NEXT: CNDE_INT T0.W, T3.Y, T2.Y, T4.X, BS:VEC_120/SCL_212
-; R600-NEXT: CNDE * T1.W, T2.X, T3.X, literal.z,
+; R600-NEXT: CNDE_INT T0.W, T5.Y, T5.X, T1.Y, BS:VEC_102/SCL_221
+; R600-NEXT: CNDE * T1.W, T0.X, T3.X, literal.z,
; R600-NEXT: 1118925336(8.872284e+01), 1065353216(1.000000e+00)
; R600-NEXT: 2139095040(INF), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T2.X, PV.W, PV.Z,
+; R600-NEXT: MUL_IEEE T0.X, PV.W, PV.Z,
; R600-NEXT: SETGT T2.Y, literal.x, KC0[3].Y,
; R600-NEXT: CNDE T1.Z, PV.Y, PV.X, literal.y,
-; R600-NEXT: CNDE T0.W, T0.X, T0.Y, 0.0,
+; R600-NEXT: CNDE T0.W, T2.X, T1.X, 0.0,
; R600-NEXT: SETGT * T2.W, KC0[3].Z, literal.z,
; R600-NEXT: -1026650416(-1.032789e+02), 2139095040(INF)
; R600-NEXT: 1118925336(8.872284e+01), 0(0.000000e+00)
@@ -2285,8 +2273,8 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; CM-LABEL: s_exp_v4f32:
; CM: ; %bb.0:
; CM-NEXT: ALU 97, @6, KC0[CB0:0-32], KC1[]
-; CM-NEXT: ALU 100, @104, KC0[CB0:0-32], KC1[]
-; CM-NEXT: ALU 36, @205, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 97, @104, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 35, @202, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
@@ -2305,224 +2293,220 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; CM-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
; CM-NEXT: MULADD_IEEE T0.X, T0.W, literal.x, PV.W,
; CM-NEXT: ADD T0.Y, T0.Z, -PV.Z,
-; CM-NEXT: MUL_IEEE T0.Z, PV.Y, literal.x,
-; CM-NEXT: MUL_IEEE * T0.W, T2.W, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT: MUL_IEEE T0.Z, T2.W, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT: MUL_IEEE * T0.W, PV.Y, literal.x,
; CM-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00)
; CM-NEXT: TRUNC T1.X, T1.Z,
-; CM-NEXT: RNDNE T2.Y, PV.W,
-; CM-NEXT: MULADD_IEEE T0.Z, T1.Y, literal.x, PV.Z,
-; CM-NEXT: ADD * T1.W, PV.Y, PV.X,
+; CM-NEXT: MULADD_IEEE T1.Y, T1.Y, literal.x, PV.W,
+; CM-NEXT: RNDNE T1.Z, PV.Z,
+; CM-NEXT: ADD * T0.W, PV.Y, PV.X,
; CM-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
+; CM-NEXT: EXP_IEEE T0.X, T0.W,
+; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W,
+; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W,
+; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W,
+; CM-NEXT: TRUNC T2.X, T1.Z,
+; CM-NEXT: MULADD_IEEE T0.Y, T2.W, literal.x, T1.Y,
+; CM-NEXT: FLT_TO_INT T2.Z, T1.X,
+; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.y,
+; CM-NEXT: 967029397(3.122284e-04), 209715200(1.972152e-31)
+; CM-NEXT: ADD T1.X, T0.Z, -T1.Z,
+; CM-NEXT: MUL_IEEE T1.Y, PV.W, literal.x,
+; CM-NEXT: MAX_INT T0.Z, PV.Z, literal.y,
+; CM-NEXT: MIN_INT * T1.W, PV.Z, literal.z,
+; CM-NEXT: 209715200(1.972152e-31), -330(nan)
+; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00)
+; CM-NEXT: ADD_INT T3.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.y,
+; CM-NEXT: ADD_INT T0.Z, T2.Z, literal.z,
+; CM-NEXT: SETGT_UINT * T1.W, T2.Z, literal.w,
+; CM-NEXT: -254(nan), 204(2.858649e-43)
+; CM-NEXT: 102(1.429324e-43), -229(nan)
+; CM-NEXT: ADD_INT T4.X, T2.Z, literal.x,
+; CM-NEXT: SETGT_UINT T3.Y, T2.Z, literal.y,
+; CM-NEXT: CNDE_INT T0.Z, PV.W, PV.Y, PV.Z,
+; CM-NEXT: SETGT_INT * T2.W, T2.Z, literal.x,
+; CM-NEXT: -127(nan), 254(3.559298e-43)
+; CM-NEXT: MUL_IEEE T5.X, T0.X, literal.x,
+; CM-NEXT: CNDE_INT T2.Y, PV.W, PV.Z, T2.Z,
+; CM-NEXT: CNDE_INT T0.Z, PV.Y, PV.X, T3.X,
+; CM-NEXT: SETGT_INT * T3.W, T2.Z, literal.y,
+; CM-NEXT: 2130706432(1.701412e+38), 127(1.779649e-43)
+; CM-NEXT: AND_INT T3.X, KC0[3].Z, literal.x,
+; CM-NEXT: CNDE_INT T2.Y, PV.W, PV.Y, PV.Z,
+; CM-NEXT: MUL_IEEE T0.Z, PV.X, literal.y,
+; CM-NEXT: CNDE_INT * T0.W, T1.W, T1.Y, T0.W,
+; CM-NEXT: -4096(nan), 2130706432(1.701412e+38)
+; CM-NEXT: CNDE_INT T0.X, T2.W, PV.W, T0.X,
+; CM-NEXT: CNDE_INT T1.Y, T3.Y, T5.X, PV.Z,
+; CM-NEXT: LSHL T0.Z, PV.Y, literal.x,
+; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.y,
+; CM-NEXT: 23(3.222986e-44), 1069064192(1.442383e+00)
+; CM-NEXT: RNDNE T4.X, PV.W,
+; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.x,
+; CM-NEXT: CNDE_INT T0.Z, T3.W, PV.X, PV.Y,
+; CM-NEXT: ADD * T1.W, T1.X, T0.Y,
+; CM-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00)
; CM-NEXT: EXP_IEEE T0.X, T1.W,
; CM-NEXT: EXP_IEEE T0.Y (MASKED), T1.W,
; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W,
; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W,
-; CM-NEXT: MULADD_IEEE T2.X, T2.W, literal.x, T0.Z,
-; CM-NEXT: ADD T0.Y, T0.W, -T2.Y, BS:VEC_120/SCL_212
-; CM-NEXT: FLT_TO_INT T0.Z, T1.X,
-; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.y,
-; CM-NEXT: 967029397(3.122284e-04), 209715200(1.972152e-31)
-; CM-NEXT: MUL_IEEE T1.X, PV.W, literal.x,
+; CM-NEXT: MUL_IEEE T1.X, T0.Z, T2.Y,
+; CM-NEXT: TRUNC T0.Y, T4.X,
+; CM-NEXT: FLT_TO_INT T0.Z, T2.X, BS:VEC_120/SCL_212
+; CM-NEXT: MUL_IEEE * T1.W, PV.X, literal.x,
+; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
+; CM-NEXT: MUL_IEEE T2.X, PV.W, literal.x,
; CM-NEXT: MUL_IEEE T1.Y, T0.X, literal.y,
; CM-NEXT: MAX_INT T1.Z, PV.Z, literal.z,
-; CM-NEXT: MIN_INT * T1.W, PV.Z, literal.w,
+; CM-NEXT: MIN_INT * T2.W, PV.Z, literal.w,
; CM-NEXT: 209715200(1.972152e-31), 2130706432(1.701412e+38)
; CM-NEXT: -330(nan), 381(5.338947e-43)
-; CM-NEXT: ADD_INT T3.X, PV.W, literal.x,
-; CM-NEXT: ADD_INT T3.Y, PV.Z, literal.y,
+; CM-NEXT: ADD_INT T5.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.y,
; CM-NEXT: ADD_INT T1.Z, T0.Z, literal.z,
-; CM-NEXT: SETGT_UINT * T1.W, T0.Z, literal.w,
+; CM-NEXT: SETGT_UINT * T2.W, T0.Z, literal.w,
; CM-NEXT: -254(nan), 204(2.858649e-43)
; CM-NEXT: 102(1.429324e-43), -229(nan)
-; CM-NEXT: ADD_INT T4.X, T0.Z, literal.x,
-; CM-NEXT: SETGT_UINT T4.Y, T0.Z, literal.y,
+; CM-NEXT: ADD_INT T6.X, T0.Z, literal.x,
+; CM-NEXT: SETGT_UINT T3.Y, T0.Z, literal.y,
; CM-NEXT: CNDE_INT T1.Z, PV.W, PV.Y, PV.Z,
-; CM-NEXT: SETGT_INT * T2.W, T0.Z, literal.x,
+; CM-NEXT: SETGT_INT * T3.W, T0.Z, literal.x,
; CM-NEXT: -127(nan), 254(3.559298e-43)
-; CM-NEXT: CNDE_INT T5.X, PV.W, PV.Z, T0.Z,
-; CM-NEXT: CNDE_INT T3.Y, PV.Y, PV.X, T3.X,
-; CM-NEXT: SETGT_INT T0.Z, T0.Z, literal.x,
-; CM-NEXT: MUL_IEEE * T3.W, T1.Y, literal.y,
-; CM-NEXT: 127(1.779649e-43), 2130706432(1.701412e+38)
-; CM-NEXT: CNDE_INT T3.X, T4.Y, T1.Y, PV.W,
-; CM-NEXT: AND_INT T1.Y, KC0[3].Z, literal.x,
-; CM-NEXT: CNDE_INT T1.Z, PV.Z, PV.X, PV.Y,
-; CM-NEXT: CNDE_INT * T0.W, T1.W, T1.X, T0.W,
-; CM-NEXT: -4096(nan), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T0.X, T2.W, PV.W, T0.X,
-; CM-NEXT: LSHL T3.Y, PV.Z, literal.x,
-; CM-NEXT: TRUNC T1.Z, T2.Y,
-; CM-NEXT: ADD * T0.W, KC0[3].Z, -PV.Y,
-; CM-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T1.X, PV.W, literal.x,
-; CM-NEXT: FLT_TO_INT T2.Y, PV.Z,
-; CM-NEXT: ADD_INT T1.Z, PV.Y, literal.y,
-; CM-NEXT: CNDE_INT * T1.W, T0.Z, PV.X, T3.X,
-; CM-NEXT: 967029397(3.122284e-04), 1065353216(1.000000e+00)
-; CM-NEXT: MUL_IEEE T0.X, PV.W, PV.Z,
-; CM-NEXT: MIN_INT T3.Y, PV.Y, literal.x,
-; CM-NEXT: MULADD_IEEE T0.Z, T0.W, literal.y, PV.X,
-; CM-NEXT: ADD * T0.W, T0.Y, T2.X,
-; CM-NEXT: 381(5.338947e-43), 1069064192(1.442383e+00)
-; CM-NEXT: EXP_IEEE T0.X (MASKED), T0.W,
-; CM-NEXT: EXP_IEEE T0.Y, T0.W,
-; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W,
-; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W,
-; CM-NEXT: MULADD_IEEE T1.X, T1.Y, literal.x, T0.Z,
-; CM-NEXT: MUL_IEEE T4.Y, PV.Y, literal.y,
-; CM-NEXT: ADD_INT T0.Z, T3.Y, literal.z, BS:VEC_120/SCL_212
-; CM-NEXT: MAX_INT * T0.W, T2.Y, literal.w, BS:VEC_201
-; CM-NEXT: 967029397(3.122284e-04), 2130706432(1.701412e+38)
-; CM-NEXT: -254(nan), -330(nan)
-; CM-NEXT: ADD_INT T2.X, T2.Y, literal.x,
-; CM-NEXT: ADD_INT T3.Y, PV.W, literal.y,
-; CM-NEXT: ADD_INT T1.Z, T2.Y, literal.z,
-; CM-NEXT: SETGT_UINT * T0.W, T2.Y, literal.w,
-; CM-NEXT: -127(nan), 204(2.858649e-43)
-; CM-NEXT: 102(1.429324e-43), -229(nan)
-; CM-NEXT: SETGT_UINT T3.X, T2.Y, literal.x,
-; CM-NEXT: CNDE_INT T3.Y, PV.W, PV.Y, PV.Z,
-; CM-NEXT: SETGT_INT T1.Z, T2.Y, literal.y,
-; CM-NEXT: MUL_IEEE * T1.W, T0.Y, literal.z, BS:VEC_120/SCL_212
-; CM-NEXT: 254(3.559298e-43), -127(nan)
-; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T4.X, PV.W, literal.x,
-; CM-NEXT: CNDE_INT * T3.Y, PV.Z, PV.Y, T2.Y,
-; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; CM-NEXT: ALU clause starting at 104:
-; CM-NEXT: CNDE_INT T0.Z, T3.X, T2.X, T0.Z,
-; CM-NEXT: SETGT_INT * T2.W, T2.Y, literal.x,
+; CM-NEXT: CNDE_INT T7.X, PV.W, PV.Z, T0.Z,
+; CM-NEXT: CNDE_INT T2.Y, PV.Y, PV.X, T5.X,
+; CM-NEXT: SETGT_INT * T0.Z, T0.Z, literal.x,
; CM-NEXT: 127(1.779649e-43), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T2.X, T1.Y, literal.x,
-; CM-NEXT: CNDE_INT T1.Y, PV.W, T3.Y, PV.Z,
-; CM-NEXT: CNDE_INT T0.Z, T0.W, T4.X, T1.W,
-; CM-NEXT: MUL_IEEE * T0.W, T4.Y, literal.y, BS:VEC_201
-; CM-NEXT: 1069064192(1.442383e+00), 2130706432(1.701412e+38)
-; CM-NEXT: AND_INT T4.X, KC0[4].X, literal.x,
-; CM-NEXT: CNDE_INT T2.Y, T3.X, T4.Y, PV.W,
-; CM-NEXT: CNDE_INT T0.Z, T1.Z, PV.Z, T0.Y,
-; CM-NEXT: LSHL * T0.W, PV.Y, literal.y,
-; CM-NEXT: -4096(nan), 23(3.222986e-44)
-; CM-NEXT: ADD_INT T3.X, PV.W, literal.x,
-; CM-NEXT: CNDE_INT T0.Y, T2.W, PV.Z, PV.Y,
-; CM-NEXT: MUL_IEEE T0.Z, PV.X, literal.y,
-; CM-NEXT: RNDNE * T0.W, T2.X,
-; CM-NEXT: 1065353216(1.000000e+00), 1069064192(1.442383e+00)
-; CM-NEXT: ADD T2.X, T2.X, -PV.W,
-; CM-NEXT: RNDNE T1.Y, PV.Z,
-; CM-NEXT: MUL_IEEE T1.Z, PV.Y, PV.X,
-; CM-NEXT: SETGT * T1.W, literal.x, KC0[3].W,
-; CM-NEXT: -1026650416(-1.032789e+02), 0(0.000000e+00)
-; CM-NEXT: CNDE T3.X, PV.W, PV.Z, 0.0,
-; CM-NEXT: TRUNC T0.Y, T0.W,
-; CM-NEXT: TRUNC T1.Z, PV.Y,
-; CM-NEXT: ADD * T0.W, PV.X, T1.X,
+; CM-NEXT: ALU clause starting at 104:
+; CM-NEXT: ADD * T4.W, KC0[3].Z, -T3.X,
+; CM-NEXT: MUL_IEEE T5.X, PV.W, literal.x,
+; CM-NEXT: CNDE_INT T2.Y, T0.Z, T7.X, T2.Y,
+; CM-NEXT: MUL_IEEE T1.Z, T1.Y, literal.y,
+; CM-NEXT: CNDE_INT * T1.W, T2.W, T2.X, T1.W, BS:VEC_021/SCL_122
+; CM-NEXT: 967029397(3.122284e-04), 2130706432(1.701412e+38)
+; CM-NEXT: CNDE_INT T0.X, T3.W, PV.W, T0.X,
+; CM-NEXT: CNDE_INT T1.Y, T3.Y, T1.Y, PV.Z,
+; CM-NEXT: LSHL T1.Z, PV.Y, literal.x,
+; CM-NEXT: MULADD_IEEE * T1.W, T4.W, literal.y, PV.X, BS:VEC_120/SCL_212
+; CM-NEXT: 23(3.222986e-44), 1069064192(1.442383e+00)
+; CM-NEXT: MULADD_IEEE T2.X, T3.X, literal.x, PV.W,
+; CM-NEXT: ADD T2.Y, T0.W, -T4.X,
+; CM-NEXT: ADD_INT T1.Z, PV.Z, literal.y,
+; CM-NEXT: CNDE_INT * T0.W, T0.Z, PV.X, PV.Y,
+; CM-NEXT: 967029397(3.122284e-04), 1065353216(1.000000e+00)
+; CM-NEXT: AND_INT T0.X, KC0[4].X, literal.x,
+; CM-NEXT: MUL_IEEE T1.Y, PV.W, PV.Z,
+; CM-NEXT: SETGT T0.Z, literal.y, KC0[3].W,
+; CM-NEXT: ADD * T0.W, PV.Y, PV.X,
+; CM-NEXT: -4096(nan), -1026650416(-1.032789e+02)
; CM-NEXT: EXP_IEEE T0.X (MASKED), T0.W,
; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W,
; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W,
; CM-NEXT: EXP_IEEE * T0.W, T0.W,
-; CM-NEXT: FLT_TO_INT T1.X, T1.Z,
-; CM-NEXT: FLT_TO_INT T0.Y, T0.Y,
-; CM-NEXT: MUL_IEEE T1.Z, PV.W, literal.x,
-; CM-NEXT: ADD * T1.W, KC0[4].X, -T4.X,
-; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T2.X, PV.W, literal.x,
-; CM-NEXT: MUL_IEEE T2.Y, T0.W, literal.y,
-; CM-NEXT: MUL_IEEE T2.Z, PV.Z, literal.z,
-; CM-NEXT: SETGT_UINT * T2.W, PV.Y, literal.w,
-; CM-NEXT: 967029397(3.122284e-04), 209715200(1.972152e-31)
-; CM-NEXT: 2130706432(1.701412e+38), 254(3.559298e-43)
-; CM-NEXT: CNDE_INT T5.X, PV.W, T1.Z, PV.Z,
-; CM-NEXT: MUL_IEEE T3.Y, PV.Y, literal.x,
-; CM-NEXT: MULADD_IEEE T1.Z, T1.W, literal.y, PV.X,
-; CM-NEXT: MAX_INT * T1.W, T1.X, literal.z,
-; CM-NEXT: 209715200(1.972152e-31), 1069064192(1.442383e+00)
-; CM-NEXT: -330(nan), 0(0.000000e+00)
-; CM-NEXT: ADD_INT T2.X, PV.W, literal.x,
-; CM-NEXT: ADD_INT T4.Y, T1.X, literal.y,
-; CM-NEXT: MULADD_IEEE T1.Z, T4.X, literal.z, PV.Z, BS:VEC_120/SCL_212
-; CM-NEXT: MAX_INT * T1.W, T0.Y, literal.w,
-; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43)
+; CM-NEXT: CNDE T2.X, T0.Z, T1.Y, 0.0,
+; CM-NEXT: ADD T1.Y, KC0[4].X, -T0.X,
+; CM-NEXT: FLT_TO_INT T0.Z, T0.Y,
+; CM-NEXT: MUL_IEEE * T1.W, PV.W, literal.x,
+; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
+; CM-NEXT: MUL_IEEE T3.X, PV.W, literal.x,
+; CM-NEXT: SETGT_UINT T0.Y, PV.Z, literal.y,
+; CM-NEXT: MUL_IEEE T1.Z, PV.Y, literal.z,
+; CM-NEXT: MUL_IEEE * T2.W, T0.X, literal.w,
+; CM-NEXT: 209715200(1.972152e-31), -229(nan)
+; CM-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00)
+; CM-NEXT: RNDNE T4.X, PV.W,
+; CM-NEXT: MULADD_IEEE T1.Y, T1.Y, literal.x, PV.Z,
+; CM-NEXT: CNDE_INT T1.Z, PV.Y, PV.X, T1.W,
+; CM-NEXT: SETGT_INT * T1.W, T0.Z, literal.y,
+; CM-NEXT: 1069064192(1.442383e+00), -127(nan)
+; CM-NEXT: CNDE_INT T3.X, PV.W, PV.Z, T0.W,
+; CM-NEXT: MULADD_IEEE T1.Y, T0.X, literal.x, PV.Y,
+; CM-NEXT: ADD T1.Z, T2.W, -PV.X,
+; CM-NEXT: MAX_INT * T2.W, T0.Z, literal.y,
; CM-NEXT: 967029397(3.122284e-04), -330(nan)
-; CM-NEXT: ADD T4.X, T0.Z, -T1.Y,
-; CM-NEXT: ADD_INT T1.Y, PV.W, literal.x,
-; CM-NEXT: ADD_INT T0.Z, T0.Y, literal.y,
-; CM-NEXT: SETGT_UINT * T1.W, T0.Y, literal.z,
+; CM-NEXT: ADD_INT T0.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T2.Y, T0.Z, literal.y,
+; CM-NEXT: TRUNC T2.Z, T4.X,
+; CM-NEXT: ADD * T2.W, PV.Z, PV.Y,
; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43)
-; CM-NEXT: -229(nan), 0(0.000000e+00)
-; CM-NEXT: SETGT_UINT T6.X, T1.X, literal.x,
-; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Y, PV.Z,
-; CM-NEXT: SETGT_INT T0.Z, T0.Y, literal.y,
-; CM-NEXT: ADD * T3.W, PV.X, T1.Z,
-; CM-NEXT: -229(nan), -127(nan)
-; CM-NEXT: EXP_IEEE T1.X (MASKED), T3.W,
-; CM-NEXT: EXP_IEEE T1.Y (MASKED), T3.W,
-; CM-NEXT: EXP_IEEE T1.Z, T3.W,
-; CM-NEXT: EXP_IEEE * T1.W (MASKED), T3.W,
-; CM-NEXT: CNDE_INT T4.X, T0.Z, T1.Y, T0.Y,
-; CM-NEXT: CNDE_INT T1.Y, T6.X, T2.X, T4.Y, BS:VEC_120/SCL_212
-; CM-NEXT: SETGT_INT T2.Z, T1.X, literal.x,
-; CM-NEXT: MUL_IEEE * T3.W, PV.Z, literal.y,
-; CM-NEXT: -127(nan), 209715200(1.972152e-31)
-; CM-NEXT: MUL_IEEE T2.X, T1.Z, literal.x,
-; CM-NEXT: MUL_IEEE T4.Y, PV.W, literal.y,
-; CM-NEXT: CNDE_INT T3.Z, PV.Z, PV.Y, T1.X,
-; CM-NEXT: MIN_INT * T4.W, T1.X, literal.z,
+; CM-NEXT: EXP_IEEE T1.X (MASKED), T2.W,
+; CM-NEXT: EXP_IEEE T1.Y, T2.W,
+; CM-NEXT: EXP_IEEE T1.Z (MASKED), T2.W,
+; CM-NEXT: EXP_IEEE * T1.W (MASKED), T2.W,
+; CM-NEXT: MUL_IEEE T4.X, T0.W, literal.x,
+; CM-NEXT: FLT_TO_INT T3.Y, T2.Z,
+; CM-NEXT: MUL_IEEE T1.Z, PV.Y, literal.y,
+; CM-NEXT: CNDE_INT * T0.W, T0.Y, T0.X, T2.Y,
; CM-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31)
+; CM-NEXT: CNDE_INT T0.X, T1.W, PV.W, T0.Z,
+; CM-NEXT: MUL_IEEE T0.Y, PV.Z, literal.x,
+; CM-NEXT: MAX_INT T2.Z, PV.Y, literal.y,
+; CM-NEXT: MIN_INT * T0.W, PV.Y, literal.z,
+; CM-NEXT: 209715200(1.972152e-31), -330(nan)
; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00)
-; CM-NEXT: MIN_INT T7.X, T0.Y, literal.x,
-; CM-NEXT: ADD_INT T1.Y, PV.W, literal.y,
-; CM-NEXT: ADD_INT T4.Z, T1.X, literal.z,
-; CM-NEXT: SETGT_UINT * T4.W, T1.X, literal.w,
-; CM-NEXT: 381(5.338947e-43), -254(nan)
+; CM-NEXT: ADD_INT T5.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.y,
+; CM-NEXT: ADD_INT T2.Z, T3.Y, literal.z,
+; CM-NEXT: SETGT_UINT * T0.W, T3.Y, literal.w,
+; CM-NEXT: -254(nan), 204(2.858649e-43)
+; CM-NEXT: 102(1.429324e-43), -229(nan)
+; CM-NEXT: ADD_INT T6.X, T3.Y, literal.x,
+; CM-NEXT: SETGT_UINT T4.Y, T3.Y, literal.y,
+; CM-NEXT: CNDE_INT T2.Z, PV.W, PV.Y, PV.Z,
+; CM-NEXT: SETGT_INT * T1.W, T3.Y, literal.x,
; CM-NEXT: -127(nan), 254(3.559298e-43)
-; CM-NEXT: CNDE_INT T8.X, PV.W, PV.Z, PV.Y,
-; CM-NEXT: SETGT_INT T1.Y, T1.X, literal.x,
-; CM-NEXT: ADD_INT T4.Z, PV.X, literal.y,
-; CM-NEXT: ADD_INT * T5.W, T0.Y, literal.z,
+; CM-NEXT: MUL_IEEE T7.X, T1.Y, literal.x,
+; CM-NEXT: CNDE_INT T2.Y, PV.W, PV.Z, T3.Y,
+; CM-NEXT: CNDE_INT T2.Z, PV.Y, PV.X, T5.X,
+; CM-NEXT: MIN_INT * T2.W, T0.Z, literal.y,
+; CM-NEXT: 2130706432(1.701412e+38), 381(5.338947e-43)
+; CM-NEXT: SETGT_INT T5.X, T3.Y, literal.x,
+; CM-NEXT: ADD_INT T3.Y, PV.W, literal.y,
+; CM-NEXT: ADD_INT T3.Z, T0.Z, literal.z,
+; CM-NEXT: SETGT_UINT * T2.W, T0.Z, literal.w,
; CM-NEXT: 127(1.779649e-43), -254(nan)
-; CM-NEXT: -127(nan), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T1.X, T2.W, PV.W, PV.Z,
-; CM-NEXT: CNDE_INT T5.Y, PV.Y, T3.Z, PV.X,
-; CM-NEXT: CNDE_INT T3.Z, T6.X, T4.Y, T3.W,
-; CM-NEXT: MUL_IEEE * T2.W, T2.X, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: -127(nan), 254(3.559298e-43)
+; CM-NEXT: CNDE_INT T6.X, PV.W, PV.Z, PV.Y,
+; CM-NEXT: CNDE_INT T2.Y, PV.X, T2.Y, T2.Z,
+; CM-NEXT: MUL_IEEE T2.Z, T7.X, literal.x,
+; CM-NEXT: CNDE_INT * T0.W, T0.W, T0.Y, T1.Z, BS:VEC_021/SCL_122
; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; CM-NEXT: SETGT_INT T6.X, T0.Y, literal.x,
-; CM-NEXT: CNDE_INT T0.Y, T4.W, T2.X, PV.W,
-; CM-NEXT: CNDE_INT * T1.Z, T2.Z, PV.Z, T1.Z,
-; CM-NEXT: 127(1.779649e-43), 0(0.000000e+00)
-; CM-NEXT: ALU clause starting at 205:
-; CM-NEXT: LSHL * T2.W, T5.Y, literal.x,
-; CM-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; CM-NEXT: ADD_INT T2.X, PV.W, literal.x,
-; CM-NEXT: CNDE_INT T0.Y, T1.Y, T1.Z, T0.Y,
-; CM-NEXT: CNDE_INT * T1.Z, T6.X, T4.X, T1.X,
+; CM-NEXT: SETGT_INT T8.X, T0.Z, literal.x,
+; CM-NEXT: CNDE_INT T0.Y, T1.W, PV.W, T1.Y,
+; CM-NEXT: CNDE_INT T0.Z, T4.Y, T7.X, PV.Z,
+; CM-NEXT: LSHL * T0.W, PV.Y, literal.y,
+; CM-NEXT: 127(1.779649e-43), 23(3.222986e-44)
+; CM-NEXT: ALU clause starting at 202:
+; CM-NEXT: ADD_INT T7.X, T0.W, literal.x,
+; CM-NEXT: CNDE_INT * T0.Y, T5.X, T0.Y, T0.Z,
; CM-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT * T1.W, T1.W, T3.Y, T2.Y,
-; CM-NEXT: CNDE_INT T1.X, T0.Z, PV.W, T0.W,
-; CM-NEXT: LSHL T1.Y, T1.Z, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT: MUL_IEEE T0.Z, T0.Y, T2.X,
+; CM-NEXT: CNDE_INT * T0.Z, T8.X, T0.X, T6.X,
+; CM-NEXT: MUL_IEEE * T0.W, T4.X, literal.x,
+; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
+; CM-NEXT: CNDE_INT T0.X, T2.W, T4.X, PV.W,
+; CM-NEXT: LSHL T1.Y, T0.Z, literal.x,
+; CM-NEXT: MUL_IEEE T0.Z, T0.Y, T7.X, BS:VEC_021/SCL_122
; CM-NEXT: SETGT * T0.W, literal.y, KC0[4].X,
; CM-NEXT: 23(3.222986e-44), -1026650416(-1.032789e+02)
-; CM-NEXT: CNDE T2.X, PV.W, PV.Z, 0.0,
+; CM-NEXT: CNDE T4.X, PV.W, PV.Z, 0.0,
; CM-NEXT: SETGT T0.Y, KC0[4].X, literal.x,
; CM-NEXT: ADD_INT T0.Z, PV.Y, literal.y,
-; CM-NEXT: CNDE_INT * T0.W, T6.X, PV.X, T5.X,
+; CM-NEXT: CNDE_INT * T0.W, T8.X, T3.X, PV.X,
; CM-NEXT: 1118925336(8.872284e+01), 1065353216(1.000000e+00)
-; CM-NEXT: SETGT T1.X, KC0[3].W, literal.x,
+; CM-NEXT: SETGT T0.X, KC0[3].W, literal.x,
; CM-NEXT: MUL_IEEE T1.Y, PV.W, PV.Z,
; CM-NEXT: SETGT T0.Z, literal.y, KC0[3].Z,
; CM-NEXT: CNDE * T0.W, PV.Y, PV.X, literal.z,
; CM-NEXT: 1118925336(8.872284e+01), -1026650416(-1.032789e+02)
; CM-NEXT: 2139095040(INF), 0(0.000000e+00)
-; CM-NEXT: SETGT T2.X, literal.x, KC0[3].Y,
+; CM-NEXT: SETGT T3.X, literal.x, KC0[3].Y,
; CM-NEXT: CNDE T0.Y, PV.Z, PV.Y, 0.0,
-; CM-NEXT: CNDE T0.Z, PV.X, T3.X, literal.y,
+; CM-NEXT: CNDE T0.Z, PV.X, T2.X, literal.y,
; CM-NEXT: SETGT * T1.W, KC0[3].Z, literal.z,
; CM-NEXT: -1026650416(-1.032789e+02), 2139095040(INF)
; CM-NEXT: 1118925336(8.872284e+01), 0(0.000000e+00)
; CM-NEXT: CNDE T0.Y, PV.W, PV.Y, literal.x,
-; CM-NEXT: CNDE T1.Z, PV.X, T0.X, 0.0,
+; CM-NEXT: CNDE T1.Z, PV.X, T1.X, 0.0,
; CM-NEXT: SETGT * T1.W, KC0[3].Y, literal.y,
; CM-NEXT: 2139095040(INF), 1118925336(8.872284e+01)
; CM-NEXT: CNDE * T0.X, PV.W, PV.Z, literal.x,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index 544c1de..a162949 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -230,23 +230,23 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) {
; R600-NEXT: MUL_IEEE * T2.W, PS, literal.z,
; R600-NEXT: -127(nan), 254(3.559298e-43)
; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T3.X, T1.X, literal.x,
-; R600-NEXT: MUL_IEEE T0.Y, PS, literal.y,
+; R600-NEXT: MUL_IEEE T3.X, PS, literal.x,
+; R600-NEXT: MUL_IEEE T0.Y, T1.X, literal.y,
; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Z, T0.Z,
; R600-NEXT: CNDE_INT T3.W, PV.Y, PV.X, T0.X,
; R600-NEXT: SETGT_INT * T4.W, T0.Z, literal.z,
-; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31)
+; R600-NEXT: 209715200(1.972152e-31), 2130706432(1.701412e+38)
; R600-NEXT: 127(1.779649e-43), 0(0.000000e+00)
; R600-NEXT: CNDE_INT T0.Z, PS, PV.Z, PV.W,
-; R600-NEXT: CNDE_INT T0.W, T0.W, PV.Y, T2.W,
-; R600-NEXT: MUL_IEEE * T2.W, PV.X, literal.x,
+; R600-NEXT: MUL_IEEE T3.W, PV.Y, literal.x,
+; R600-NEXT: CNDE_INT * T0.W, T0.W, PV.X, T2.W,
; R600-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; R600-NEXT: CNDE_INT T1.Z, T1.Y, T3.X, PS,
-; R600-NEXT: CNDE_INT T0.W, T1.W, PV.W, T1.X,
+; R600-NEXT: CNDE_INT T1.Z, T1.W, PS, T1.X,
+; R600-NEXT: CNDE_INT T0.W, T1.Y, T0.Y, PV.W,
; R600-NEXT: LSHL * T1.W, PV.Z, literal.x,
; R600-NEXT: 23(3.222986e-44), 0(0.000000e+00)
; R600-NEXT: ADD_INT T1.W, PS, literal.x,
-; R600-NEXT: CNDE_INT * T0.W, T4.W, PV.W, PV.Z,
+; R600-NEXT: CNDE_INT * T0.W, T4.W, PV.Z, PV.W,
; R600-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00)
; R600-NEXT: MUL_IEEE T0.W, PS, PV.W,
; R600-NEXT: SETGT * T1.W, literal.x, KC0[2].Z,
@@ -260,65 +260,63 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) {
;
; CM-LABEL: s_exp10_f32:
; CM: ; %bb.0:
-; CM-NEXT: ALU 64, @4, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 62, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: AND_INT * T0.W, KC0[2].Z, literal.x,
; CM-NEXT: -4096(nan), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T0.Z, PV.W, literal.x,
; CM-NEXT: ADD * T1.W, KC0[2].Z, -PV.W,
-; CM-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T1.Z, PV.W, literal.x,
-; CM-NEXT: RNDNE * T2.W, PV.Z,
-; CM-NEXT: 975668412(6.390323e-04), 0(0.000000e+00)
-; CM-NEXT: TRUNC T2.Z, PV.W,
+; CM-NEXT: MUL_IEEE T0.Z, PV.W, literal.x,
+; CM-NEXT: MUL_IEEE * T2.W, T0.W, literal.y,
+; CM-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00)
+; CM-NEXT: RNDNE T1.Z, PV.W,
; CM-NEXT: MULADD_IEEE * T1.W, T1.W, literal.x, PV.Z,
; CM-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
-; CM-NEXT: MULADD_IEEE T0.Y, T0.W, literal.x, PV.W,
-; CM-NEXT: ADD T0.Z, T0.Z, -T2.W,
-; CM-NEXT: FLT_TO_INT * T0.W, PV.Z,
+; CM-NEXT: MULADD_IEEE T0.Z, T0.W, literal.x, PV.W,
+; CM-NEXT: ADD * T0.W, T2.W, -PV.Z, BS:VEC_120/SCL_212
; CM-NEXT: 975668412(6.390323e-04), 0(0.000000e+00)
-; CM-NEXT: MIN_INT T1.Z, PV.W, literal.x,
-; CM-NEXT: ADD * T1.W, PV.Z, PV.Y,
+; CM-NEXT: TRUNC T1.Z, T1.Z,
+; CM-NEXT: ADD * T0.W, PV.W, PV.Z,
+; CM-NEXT: EXP_IEEE T0.X, T0.W,
+; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W,
+; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W,
+; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W,
+; CM-NEXT: FLT_TO_INT T0.Z, T1.Z,
+; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.x,
+; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
+; CM-NEXT: MUL_IEEE T0.Y, PV.W, literal.x,
+; CM-NEXT: MAX_INT T1.Z, PV.Z, literal.y,
+; CM-NEXT: MIN_INT * T1.W, PV.Z, literal.z,
+; CM-NEXT: 209715200(1.972152e-31), -330(nan)
; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00)
-; CM-NEXT: EXP_IEEE T0.X, T1.W,
-; CM-NEXT: EXP_IEEE T0.Y (MASKED), T1.W,
-; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W,
-; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W,
-; CM-NEXT: MUL_IEEE T0.Y, PV.X, literal.x,
-; CM-NEXT: ADD_INT T0.Z, T1.Z, literal.y,
-; CM-NEXT: MAX_INT * T1.W, T0.W, literal.z,
-; CM-NEXT: 2130706432(1.701412e+38), -254(nan)
-; CM-NEXT: -330(nan), 0(0.000000e+00)
-; CM-NEXT: ADD_INT T1.X, T0.W, literal.x,
-; CM-NEXT: ADD_INT T1.Y, PV.W, literal.y,
-; CM-NEXT: ADD_INT T1.Z, T0.W, literal.z,
-; CM-NEXT: SETGT_UINT * T1.W, T0.W, literal.w,
-; CM-NEXT: -127(nan), 204(2.858649e-43)
+; CM-NEXT: ADD_INT T1.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T1.Y, PV.Z, literal.y,
+; CM-NEXT: ADD_INT T1.Z, T0.Z, literal.z,
+; CM-NEXT: SETGT_UINT * T1.W, T0.Z, literal.w,
+; CM-NEXT: -254(nan), 204(2.858649e-43)
; CM-NEXT: 102(1.429324e-43), -229(nan)
-; CM-NEXT: SETGT_UINT T2.X, T0.W, literal.x,
-; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Y, PV.Z,
-; CM-NEXT: SETGT_INT T1.Z, T0.W, literal.y,
-; CM-NEXT: MUL_IEEE * T2.W, T0.X, literal.z,
-; CM-NEXT: 254(3.559298e-43), -127(nan)
-; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T3.X, PV.W, literal.x,
-; CM-NEXT: CNDE_INT T1.Y, PV.Z, PV.Y, T0.W,
-; CM-NEXT: CNDE_INT T0.Z, PV.X, T1.X, T0.Z,
-; CM-NEXT: SETGT_INT * T0.W, T0.W, literal.y,
-; CM-NEXT: 209715200(1.972152e-31), 127(1.779649e-43)
+; CM-NEXT: ADD_INT T2.X, T0.Z, literal.x,
+; CM-NEXT: SETGT_UINT T2.Y, T0.Z, literal.y,
+; CM-NEXT: CNDE_INT T1.Z, PV.W, PV.Y, PV.Z,
+; CM-NEXT: SETGT_INT * T2.W, T0.Z, literal.x,
+; CM-NEXT: -127(nan), 254(3.559298e-43)
+; CM-NEXT: MUL_IEEE T3.X, T0.X, literal.x,
+; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Z, T0.Z,
+; CM-NEXT: CNDE_INT T1.Z, PV.Y, PV.X, T1.X,
+; CM-NEXT: SETGT_INT * T3.W, T0.Z, literal.y,
+; CM-NEXT: 2130706432(1.701412e+38), 127(1.779649e-43)
; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Y, PV.Z,
-; CM-NEXT: CNDE_INT T0.Z, T1.W, PV.X, T2.W,
-; CM-NEXT: MUL_IEEE * T1.W, T0.Y, literal.x,
+; CM-NEXT: MUL_IEEE T0.Z, PV.X, literal.x,
+; CM-NEXT: CNDE_INT * T0.W, T1.W, T0.Y, T0.W,
; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T0.Y, T2.X, T0.Y, PV.W,
-; CM-NEXT: CNDE_INT T0.Z, T1.Z, PV.Z, T0.X,
-; CM-NEXT: LSHL * T1.W, PV.Y, literal.x,
+; CM-NEXT: CNDE_INT T0.Y, T2.W, PV.W, T0.X,
+; CM-NEXT: CNDE_INT T0.Z, T2.Y, T3.X, PV.Z,
+; CM-NEXT: LSHL * T0.W, PV.Y, literal.x,
; CM-NEXT: 23(3.222986e-44), 0(0.000000e+00)
; CM-NEXT: ADD_INT T1.Z, PV.W, literal.x,
-; CM-NEXT: CNDE_INT * T0.W, T0.W, PV.Z, PV.Y,
+; CM-NEXT: CNDE_INT * T0.W, T3.W, PV.Y, PV.Z,
; CM-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00)
; CM-NEXT: MUL_IEEE T0.Z, PV.W, PV.Z,
; CM-NEXT: SETGT * T0.W, literal.x, KC0[2].Z,
@@ -612,105 +610,105 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; R600-NEXT: AND_INT * T0.W, KC0[3].X, literal.x,
; R600-NEXT: -4096(nan), 0(0.000000e+00)
; R600-NEXT: ADD * T1.W, KC0[3].X, -PV.W,
-; R600-NEXT: AND_INT T0.Z, KC0[2].W, literal.x,
-; R600-NEXT: MUL_IEEE T2.W, PV.W, literal.y,
-; R600-NEXT: MUL_IEEE * T3.W, T0.W, literal.z,
-; R600-NEXT: -4096(nan), 975668412(6.390323e-04)
-; R600-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
-; R600-NEXT: RNDNE T1.Z, PS,
+; R600-NEXT: MUL_IEEE T2.W, PV.W, literal.x,
+; R600-NEXT: MUL_IEEE * T3.W, T0.W, literal.y,
+; R600-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00)
+; R600-NEXT: RNDNE T0.Z, PS,
; R600-NEXT: MULADD_IEEE T1.W, T1.W, literal.x, PV.W,
-; R600-NEXT: ADD * T2.W, KC0[2].W, -PV.Z,
-; R600-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T0.Y, PS, literal.x,
-; R600-NEXT: MUL_IEEE T2.Z, T0.Z, literal.y,
+; R600-NEXT: AND_INT * T2.W, KC0[2].W, literal.y,
+; R600-NEXT: 1079283712(3.321289e+00), -4096(nan)
+; R600-NEXT: ADD T1.Z, KC0[2].W, -PS,
; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.x, PV.W,
; R600-NEXT: ADD * T1.W, T3.W, -PV.Z,
+; R600-NEXT: 975668412(6.390323e-04), 0(0.000000e+00)
+; R600-NEXT: ADD T2.Z, PS, PV.W,
+; R600-NEXT: MUL_IEEE T0.W, PV.Z, literal.x,
+; R600-NEXT: MUL_IEEE * T1.W, T2.W, literal.y,
; R600-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00)
-; R600-NEXT: ADD T3.Z, PS, PV.W,
-; R600-NEXT: RNDNE T0.W, PV.Z,
-; R600-NEXT: MULADD_IEEE * T1.W, T2.W, literal.x, PV.Y, BS:VEC_021/SCL_122
-; R600-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
-; R600-NEXT: TRUNC T0.Y, T1.Z,
-; R600-NEXT: MULADD_IEEE T0.Z, T0.Z, literal.x, PS, BS:VEC_120/SCL_212
-; R600-NEXT: ADD T1.W, T2.Z, -PV.W, BS:VEC_201
+; R600-NEXT: RNDNE T0.Y, PS,
+; R600-NEXT: MULADD_IEEE T1.Z, T1.Z, literal.x, PV.W,
+; R600-NEXT: TRUNC T0.W, T0.Z, BS:VEC_120/SCL_212
; R600-NEXT: EXP_IEEE * T0.X, PV.Z,
-; R600-NEXT: 975668412(6.390323e-04), 0(0.000000e+00)
-; R600-NEXT: ADD T0.Z, PV.W, PV.Z,
-; R600-NEXT: FLT_TO_INT T1.W, PV.Y,
-; R600-NEXT: MUL_IEEE * T2.W, PS, literal.x,
-; R600-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T1.Z, PS, literal.x,
-; R600-NEXT: SETGT_UINT T3.W, PV.W, literal.y,
-; R600-NEXT: EXP_IEEE * T0.Y, PV.Z,
-; R600-NEXT: 2130706432(1.701412e+38), 254(3.559298e-43)
-; R600-NEXT: CNDE_INT T1.X, PV.W, T2.W, PV.Z,
-; R600-NEXT: MUL_IEEE T1.Y, PS, literal.x,
-; R600-NEXT: MAX_INT T0.Z, T1.W, literal.y,
-; R600-NEXT: MIN_INT T2.W, T1.W, literal.z,
-; R600-NEXT: TRUNC * T0.W, T0.W,
+; R600-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
+; R600-NEXT: FLT_TO_INT T1.Y, PV.W,
+; R600-NEXT: MUL_IEEE T0.Z, PS, literal.x,
+; R600-NEXT: MULADD_IEEE T0.W, T2.W, literal.y, PV.Z,
+; R600-NEXT: ADD * T1.W, T1.W, -PV.Y,
+; R600-NEXT: 209715200(1.972152e-31), 975668412(6.390323e-04)
+; R600-NEXT: ADD T1.Z, PS, PV.W,
+; R600-NEXT: MUL_IEEE T0.W, PV.Z, literal.x,
+; R600-NEXT: SETGT_UINT * T1.W, PV.Y, literal.y,
+; R600-NEXT: 209715200(1.972152e-31), -229(nan)
+; R600-NEXT: CNDE_INT T0.Z, PS, PV.W, T0.Z,
+; R600-NEXT: SETGT_INT T0.W, T1.Y, literal.x,
+; R600-NEXT: EXP_IEEE * T1.X, PV.Z,
+; R600-NEXT: -127(nan), 0(0.000000e+00)
+; R600-NEXT: CNDE_INT T0.Z, PV.W, PV.Z, T0.X,
+; R600-NEXT: MAX_INT T2.W, T1.Y, literal.x,
+; R600-NEXT: MUL_IEEE * T3.W, PS, literal.y,
+; R600-NEXT: -330(nan), 209715200(1.972152e-31)
+; R600-NEXT: MUL_IEEE T2.X, PS, literal.x,
+; R600-NEXT: ADD_INT T2.Y, PV.W, literal.y,
+; R600-NEXT: ADD_INT T1.Z, T1.Y, literal.z,
+; R600-NEXT: MIN_INT T2.W, T1.Y, literal.w,
+; R600-NEXT: TRUNC * T4.W, T0.Y,
+; R600-NEXT: 209715200(1.972152e-31), 204(2.858649e-43)
+; R600-NEXT: 102(1.429324e-43), 381(5.338947e-43)
+; R600-NEXT: FLT_TO_INT T3.X, PS,
+; R600-NEXT: ADD_INT T0.Y, PV.W, literal.x,
+; R600-NEXT: ADD_INT T2.Z, T1.Y, literal.y,
+; R600-NEXT: SETGT_UINT T2.W, T1.Y, literal.z,
+; R600-NEXT: CNDE_INT * T1.W, T1.W, PV.Y, PV.Z,
+; R600-NEXT: -254(nan), -127(nan)
+; R600-NEXT: 254(3.559298e-43), 0(0.000000e+00)
+; R600-NEXT: MUL_IEEE T4.X, T1.X, literal.x,
+; R600-NEXT: MUL_IEEE T2.Y, T0.X, literal.x, BS:VEC_120/SCL_212
+; R600-NEXT: CNDE_INT T1.Z, T0.W, PS, T1.Y,
+; R600-NEXT: CNDE_INT T0.W, PV.W, PV.Z, PV.Y,
+; R600-NEXT: MAX_INT * T1.W, PV.X, literal.y,
; R600-NEXT: 2130706432(1.701412e+38), -330(nan)
-; R600-NEXT: 381(5.338947e-43), 0(0.000000e+00)
-; R600-NEXT: FLT_TO_INT T2.X, PS,
-; R600-NEXT: ADD_INT T2.Y, PV.W, literal.x,
-; R600-NEXT: ADD_INT T0.Z, PV.Z, literal.y,
-; R600-NEXT: ADD_INT T0.W, T1.W, literal.z,
-; R600-NEXT: SETGT_UINT * T2.W, T1.W, literal.w,
-; R600-NEXT: -254(nan), 204(2.858649e-43)
-; R600-NEXT: 102(1.429324e-43), -229(nan)
-; R600-NEXT: ADD_INT T3.X, T1.W, literal.x,
-; R600-NEXT: CNDE_INT T3.Y, PS, PV.Z, PV.W,
-; R600-NEXT: SETGT_INT T0.Z, T1.W, literal.x,
-; R600-NEXT: MUL_IEEE T0.W, T0.X, literal.y,
-; R600-NEXT: MUL_IEEE * T4.W, T0.Y, literal.y,
-; R600-NEXT: -127(nan), 209715200(1.972152e-31)
-; R600-NEXT: MUL_IEEE T4.X, PS, literal.x,
-; R600-NEXT: MUL_IEEE T4.Y, PV.W, literal.x,
-; R600-NEXT: CNDE_INT T1.Z, PV.Z, PV.Y, T1.W,
-; R600-NEXT: CNDE_INT T3.W, T3.W, PV.X, T2.Y,
-; R600-NEXT: MAX_INT * T5.W, T2.X, literal.y,
-; R600-NEXT: 209715200(1.972152e-31), -330(nan)
-; R600-NEXT: SETGT_INT T3.X, T1.W, literal.x,
-; R600-NEXT: ADD_INT T2.Y, PS, literal.y,
-; R600-NEXT: ADD_INT T2.Z, T2.X, literal.z,
-; R600-NEXT: SETGT_UINT * T1.W, T2.X, literal.w,
+; R600-NEXT: SETGT_INT T0.X, T1.Y, literal.x,
+; R600-NEXT: ADD_INT T0.Y, PS, literal.y,
+; R600-NEXT: ADD_INT T2.Z, T3.X, literal.z,
+; R600-NEXT: SETGT_UINT * T1.W, T3.X, literal.w,
; R600-NEXT: 127(1.779649e-43), 204(2.858649e-43)
; R600-NEXT: 102(1.429324e-43), -229(nan)
-; R600-NEXT: MIN_INT * T5.W, T2.X, literal.x,
+; R600-NEXT: MIN_INT * T4.W, T3.X, literal.x,
; R600-NEXT: 381(5.338947e-43), 0(0.000000e+00)
; R600-NEXT: ADD_INT T5.X, PV.W, literal.x,
-; R600-NEXT: ADD_INT T3.Y, T2.X, literal.y,
-; R600-NEXT: SETGT_UINT T3.Z, T2.X, literal.z,
-; R600-NEXT: CNDE_INT T5.W, T1.W, T2.Y, T2.Z,
-; R600-NEXT: SETGT_INT * T6.W, T2.X, literal.y,
+; R600-NEXT: ADD_INT T1.Y, T3.X, literal.y,
+; R600-NEXT: SETGT_UINT T3.Z, T3.X, literal.z,
+; R600-NEXT: CNDE_INT T4.W, T1.W, T0.Y, T2.Z,
+; R600-NEXT: SETGT_INT * T5.W, T3.X, literal.y,
; R600-NEXT: -254(nan), -127(nan)
; R600-NEXT: 254(3.559298e-43), 0(0.000000e+00)
-; R600-NEXT: CNDE_INT T6.X, PS, PV.W, T2.X,
-; R600-NEXT: CNDE_INT T2.Y, PV.Z, PV.Y, PV.X,
-; R600-NEXT: SETGT_INT T2.Z, T2.X, literal.x, BS:VEC_120/SCL_212
-; R600-NEXT: CNDE_INT T3.W, T3.X, T1.Z, T3.W, BS:VEC_021/SCL_122
-; R600-NEXT: CNDE_INT * T0.W, T2.W, T4.Y, T0.W,
-; R600-NEXT: 127(1.779649e-43), 0(0.000000e+00)
-; R600-NEXT: CNDE_INT T0.X, T0.Z, PS, T0.X,
-; R600-NEXT: LSHL T3.Y, PV.W, literal.x,
-; R600-NEXT: CNDE_INT T0.Z, PV.Z, PV.X, PV.Y,
-; R600-NEXT: CNDE_INT T0.W, T1.W, T4.X, T4.W,
-; R600-NEXT: MUL_IEEE * T1.W, T1.Y, literal.y,
+; R600-NEXT: CNDE_INT T6.X, PS, PV.W, T3.X,
+; R600-NEXT: CNDE_INT T0.Y, PV.Z, PV.Y, PV.X,
+; R600-NEXT: SETGT_INT T2.Z, T3.X, literal.x,
+; R600-NEXT: CNDE_INT T0.W, T0.X, T1.Z, T0.W, BS:VEC_120/SCL_212
+; R600-NEXT: MUL_IEEE * T4.W, T2.Y, literal.y,
+; R600-NEXT: 127(1.779649e-43), 2130706432(1.701412e+38)
+; R600-NEXT: CNDE_INT T3.X, T2.W, T2.Y, PS, BS:VEC_120/SCL_212
+; R600-NEXT: LSHL T1.Y, PV.W, literal.x,
+; R600-NEXT: CNDE_INT T1.Z, PV.Z, PV.X, PV.Y,
+; R600-NEXT: MUL_IEEE T0.W, T4.X, literal.y,
+; R600-NEXT: CNDE_INT * T1.W, T1.W, T2.X, T3.W,
; R600-NEXT: 23(3.222986e-44), 2130706432(1.701412e+38)
-; R600-NEXT: CNDE_INT T2.X, T3.Z, T1.Y, PS,
-; R600-NEXT: CNDE_INT T0.Y, T6.W, PV.W, T0.Y,
-; R600-NEXT: LSHL T0.Z, PV.Z, literal.x,
+; R600-NEXT: CNDE_INT T1.X, T5.W, PS, T1.X, BS:VEC_021/SCL_122
+; R600-NEXT: CNDE_INT T0.Y, T3.Z, T4.X, PV.W, BS:VEC_201
+; R600-NEXT: LSHL T1.Z, PV.Z, literal.x,
; R600-NEXT: ADD_INT T0.W, PV.Y, literal.y,
-; R600-NEXT: CNDE_INT * T1.W, T3.X, PV.X, T1.X,
+; R600-NEXT: CNDE_INT * T1.W, T0.X, T0.Z, PV.X,
; R600-NEXT: 23(3.222986e-44), 1065353216(1.000000e+00)
; R600-NEXT: MUL_IEEE T1.Y, PS, PV.W,
-; R600-NEXT: SETGT T1.Z, literal.x, KC0[3].X,
+; R600-NEXT: SETGT T0.Z, literal.x, KC0[3].X,
; R600-NEXT: ADD_INT * T0.W, PV.Z, literal.y,
; R600-NEXT: -1036817932(-4.485347e+01), 1065353216(1.000000e+00)
; R600-NEXT: ALU clause starting at 101:
-; R600-NEXT: CNDE_INT * T1.W, T2.Z, T0.Y, T2.X,
+; R600-NEXT: CNDE_INT * T1.W, T2.Z, T1.X, T0.Y,
; R600-NEXT: MUL_IEEE T0.Y, PV.W, T0.W,
-; R600-NEXT: SETGT T0.Z, literal.x, KC0[2].W,
-; R600-NEXT: CNDE T0.W, T1.Z, T1.Y, 0.0,
+; R600-NEXT: SETGT T1.Z, literal.x, KC0[2].W,
+; R600-NEXT: CNDE T0.W, T0.Z, T1.Y, 0.0,
; R600-NEXT: SETGT * T1.W, KC0[3].X, literal.y,
; R600-NEXT: -1036817932(-4.485347e+01), 1109008539(3.853184e+01)
; R600-NEXT: CNDE T1.Y, PS, PV.W, literal.x,
@@ -723,118 +721,116 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
;
; CM-LABEL: s_exp10_v2f32:
; CM: ; %bb.0:
-; CM-NEXT: ALU 100, @4, KC0[CB0:0-32], KC1[]
-; CM-NEXT: ALU 18, @105, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 98, @4, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 18, @103, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT: CF_END
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: AND_INT * T0.W, KC0[2].W, literal.x,
; CM-NEXT: -4096(nan), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T0.Z, PV.W, literal.x,
; CM-NEXT: ADD * T1.W, KC0[2].W, -PV.W,
+; CM-NEXT: MUL_IEEE T0.Y, PV.W, literal.x,
+; CM-NEXT: MUL_IEEE T0.Z, T0.W, literal.y,
+; CM-NEXT: AND_INT * T2.W, KC0[3].X, literal.z,
+; CM-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00)
+; CM-NEXT: -4096(nan), 0(0.000000e+00)
+; CM-NEXT: ADD T1.Y, KC0[3].X, -PV.W,
+; CM-NEXT: RNDNE T1.Z, PV.Z,
+; CM-NEXT: MULADD_IEEE * T1.W, T1.W, literal.x, PV.Y,
; CM-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T1.Z, PV.W, literal.x,
-; CM-NEXT: RNDNE * T2.W, PV.Z,
-; CM-NEXT: 975668412(6.390323e-04), 0(0.000000e+00)
-; CM-NEXT: TRUNC T0.Y, PV.W,
-; CM-NEXT: AND_INT T2.Z, KC0[3].X, literal.x,
-; CM-NEXT: MULADD_IEEE * T1.W, T1.W, literal.y, PV.Z,
-; CM-NEXT: -4096(nan), 1079283712(3.321289e+00)
; CM-NEXT: MULADD_IEEE T0.X, T0.W, literal.x, PV.W,
-; CM-NEXT: MUL_IEEE T1.Y, PV.Z, literal.y,
-; CM-NEXT: FLT_TO_INT T1.Z, PV.Y,
-; CM-NEXT: ADD * T0.W, KC0[3].X, -PV.Z,
+; CM-NEXT: ADD T0.Y, T0.Z, -PV.Z,
+; CM-NEXT: MUL_IEEE T0.Z, PV.Y, literal.x,
+; CM-NEXT: MUL_IEEE * T0.W, T2.W, literal.y, BS:VEC_120/SCL_212
; CM-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00)
-; CM-NEXT: ADD T1.X, T0.Z, -T2.W,
-; CM-NEXT: MUL_IEEE T0.Y, PV.W, literal.x,
-; CM-NEXT: MAX_INT T0.Z, PV.Z, literal.y,
-; CM-NEXT: RNDNE * T1.W, PV.Y,
-; CM-NEXT: 975668412(6.390323e-04), -330(nan)
-; CM-NEXT: TRUNC T2.X, PV.W,
-; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.x,
-; CM-NEXT: MULADD_IEEE T0.Z, T0.W, literal.y, PV.Y,
-; CM-NEXT: ADD * T0.W, PV.X, T0.X,
-; CM-NEXT: 204(2.858649e-43), 1079283712(3.321289e+00)
-; CM-NEXT: EXP_IEEE T0.X, T0.W,
-; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W,
-; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W,
-; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W,
-; CM-NEXT: ADD_INT T1.X, T1.Z, literal.x,
-; CM-NEXT: MULADD_IEEE T0.Y, T2.Z, literal.y, T0.Z, BS:VEC_102/SCL_221
-; CM-NEXT: ADD T0.Z, T1.Y, -T1.W,
-; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.z,
-; CM-NEXT: 102(1.429324e-43), 975668412(6.390323e-04)
-; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; CM-NEXT: SETGT_UINT T3.X, T1.Z, literal.x,
-; CM-NEXT: MUL_IEEE T1.Y, PV.W, literal.y,
-; CM-NEXT: SETGT_UINT T2.Z, T1.Z, literal.z,
-; CM-NEXT: ADD * T1.W, PV.Z, PV.Y,
-; CM-NEXT: -229(nan), 2130706432(1.701412e+38)
-; CM-NEXT: 254(3.559298e-43), 0(0.000000e+00)
+; CM-NEXT: TRUNC T1.X, T1.Z,
+; CM-NEXT: RNDNE T2.Y, PV.W,
+; CM-NEXT: MULADD_IEEE T0.Z, T1.Y, literal.x, PV.Z,
+; CM-NEXT: ADD * T1.W, PV.Y, PV.X,
+; CM-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
+; CM-NEXT: EXP_IEEE T0.X, T1.W,
+; CM-NEXT: EXP_IEEE T0.Y (MASKED), T1.W,
+; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W,
+; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W,
+; CM-NEXT: MULADD_IEEE T2.X, T2.W, literal.x, T0.Z,
+; CM-NEXT: ADD T0.Y, T0.W, -T2.Y, BS:VEC_120/SCL_212
+; CM-NEXT: FLT_TO_INT T0.Z, T1.X,
+; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.y,
+; CM-NEXT: 975668412(6.390323e-04), 209715200(1.972152e-31)
+; CM-NEXT: MUL_IEEE T1.X, PV.W, literal.x,
+; CM-NEXT: SETGT_UINT T1.Y, PV.Z, literal.y,
+; CM-NEXT: TRUNC T1.Z, T2.Y,
+; CM-NEXT: ADD * T1.W, PV.Y, PV.X,
+; CM-NEXT: 209715200(1.972152e-31), -229(nan)
; CM-NEXT: EXP_IEEE T0.X (MASKED), T1.W,
; CM-NEXT: EXP_IEEE T0.Y, T1.W,
; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W,
; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W,
-; CM-NEXT: CNDE_INT T4.X, T2.Z, T0.W, T1.Y,
-; CM-NEXT: CNDE_INT T1.Y, T3.X, T2.Y, T1.X,
-; CM-NEXT: FLT_TO_INT T0.Z, T2.X, BS:VEC_120/SCL_212
-; CM-NEXT: MUL_IEEE * T0.W, PV.Y, literal.x,
-; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; CM-NEXT: SETGT_INT T1.X, T1.Z, literal.x,
-; CM-NEXT: MUL_IEEE T2.Y, T0.X, literal.y,
-; CM-NEXT: MUL_IEEE T3.Z, PV.W, literal.z,
-; CM-NEXT: SETGT_UINT * T1.W, PV.Z, literal.w,
-; CM-NEXT: -127(nan), 209715200(1.972152e-31)
-; CM-NEXT: 2130706432(1.701412e+38), 254(3.559298e-43)
-; CM-NEXT: CNDE_INT T2.X, PV.W, T0.W, PV.Z,
+; CM-NEXT: FLT_TO_INT T2.X, T1.Z,
+; CM-NEXT: MUL_IEEE T2.Y, PV.Y, literal.x,
+; CM-NEXT: CNDE_INT T1.Z, T1.Y, T1.X, T0.W,
+; CM-NEXT: SETGT_INT * T0.W, T0.Z, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT: 209715200(1.972152e-31), -127(nan)
+; CM-NEXT: CNDE_INT T1.X, PV.W, PV.Z, T0.X,
; CM-NEXT: MUL_IEEE T3.Y, PV.Y, literal.x,
-; CM-NEXT: CNDE_INT T3.Z, PV.X, T1.Y, T1.Z,
-; CM-NEXT: MAX_INT * T0.W, T0.Z, literal.y,
-; CM-NEXT: 209715200(1.972152e-31), -330(nan)
-; CM-NEXT: ADD_INT T5.X, PV.W, literal.x,
-; CM-NEXT: ADD_INT T1.Y, T0.Z, literal.y,
-; CM-NEXT: SETGT_UINT T4.Z, T0.Z, literal.z,
-; CM-NEXT: MUL_IEEE * T0.W, T0.Y, literal.w,
+; CM-NEXT: SETGT_UINT T1.Z, PV.X, literal.y,
+; CM-NEXT: MAX_INT * T1.W, T0.Z, literal.z,
+; CM-NEXT: 209715200(1.972152e-31), -229(nan)
+; CM-NEXT: -330(nan), 0(0.000000e+00)
+; CM-NEXT: ADD_INT T3.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T4.Y, T0.Z, literal.y,
+; CM-NEXT: CNDE_INT T2.Z, PV.Z, PV.Y, T2.Y,
+; CM-NEXT: SETGT_INT * T1.W, T2.X, literal.z,
; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43)
-; CM-NEXT: -229(nan), 209715200(1.972152e-31)
-; CM-NEXT: MUL_IEEE T6.X, PV.W, literal.x,
-; CM-NEXT: MIN_INT T4.Y, T0.Z, literal.y,
-; CM-NEXT: CNDE_INT T5.Z, PV.Z, PV.X, PV.Y,
-; CM-NEXT: SETGT_INT * T2.W, T0.Z, literal.z,
-; CM-NEXT: 209715200(1.972152e-31), 381(5.338947e-43)
-; CM-NEXT: -127(nan), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T5.X, PV.W, PV.Z, T0.Z,
-; CM-NEXT: MIN_INT T1.Y, T1.Z, literal.x,
-; CM-NEXT: ADD_INT T5.Z, PV.Y, literal.y,
-; CM-NEXT: ADD_INT * T3.W, T0.Z, literal.z, BS:VEC_120/SCL_212
-; CM-NEXT: 381(5.338947e-43), -254(nan)
; CM-NEXT: -127(nan), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T7.X, T1.W, PV.W, PV.Z,
-; CM-NEXT: SETGT_INT T4.Y, T0.Z, literal.x,
-; CM-NEXT: ADD_INT T0.Z, PV.Y, literal.y,
-; CM-NEXT: ADD_INT * T1.W, T1.Z, literal.z, BS:VEC_120/SCL_212
+; CM-NEXT: CNDE_INT T4.X, PV.W, PV.Z, T0.Y,
+; CM-NEXT: MUL_IEEE T2.Y, T0.X, literal.x,
+; CM-NEXT: MAX_INT T2.Z, T2.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT: CNDE_INT * T2.W, T1.Y, PV.X, PV.Y,
+; CM-NEXT: 2130706432(1.701412e+38), -330(nan)
+; CM-NEXT: CNDE_INT T0.X, T0.W, PV.W, T0.Z,
+; CM-NEXT: ADD_INT T1.Y, PV.Z, literal.x,
+; CM-NEXT: ADD_INT T2.Z, T2.X, literal.y,
+; CM-NEXT: MIN_INT * T0.W, T2.X, literal.z,
+; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43)
+; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00)
+; CM-NEXT: ADD_INT T3.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T3.Y, T2.X, literal.y,
+; CM-NEXT: SETGT_UINT T3.Z, T2.X, literal.z,
+; CM-NEXT: CNDE_INT * T0.W, T1.Z, PV.Y, PV.Z,
+; CM-NEXT: -254(nan), -127(nan)
+; CM-NEXT: 254(3.559298e-43), 0(0.000000e+00)
+; CM-NEXT: MUL_IEEE T5.X, T0.Y, literal.x,
+; CM-NEXT: CNDE_INT T0.Y, T1.W, PV.W, T2.X,
+; CM-NEXT: CNDE_INT T1.Z, PV.Z, PV.Y, PV.X,
+; CM-NEXT: MIN_INT * T0.W, T0.Z, literal.y,
+; CM-NEXT: 2130706432(1.701412e+38), 381(5.338947e-43)
+; CM-NEXT: SETGT_INT T2.X, T2.X, literal.x,
+; CM-NEXT: ADD_INT T1.Y, PV.W, literal.y,
+; CM-NEXT: ADD_INT T2.Z, T0.Z, literal.z,
+; CM-NEXT: SETGT_UINT * T0.W, T0.Z, literal.w,
; CM-NEXT: 127(1.779649e-43), -254(nan)
-; CM-NEXT: -127(nan), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T8.X, T2.Z, PV.W, PV.Z,
-; CM-NEXT: SETGT_INT T1.Y, T1.Z, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT: CNDE_INT T0.Z, PV.Y, T5.X, PV.X,
-; CM-NEXT: CNDE_INT * T0.W, T4.Z, T6.X, T0.W, BS:VEC_201
-; CM-NEXT: 127(1.779649e-43), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T5.X, T2.W, PV.W, T0.Y,
+; CM-NEXT: -127(nan), 254(3.559298e-43)
+; CM-NEXT: CNDE_INT T3.X, PV.W, PV.Z, PV.Y,
+; CM-NEXT: SETGT_INT T1.Y, T0.Z, literal.x,
+; CM-NEXT: CNDE_INT T0.Z, PV.X, T0.Y, T1.Z,
+; CM-NEXT: MUL_IEEE * T1.W, T5.X, literal.y,
+; CM-NEXT: 127(1.779649e-43), 2130706432(1.701412e+38)
+; CM-NEXT: CNDE_INT T5.X, T3.Z, T5.X, PV.W,
; CM-NEXT: LSHL T0.Y, PV.Z, literal.x,
-; CM-NEXT: CNDE_INT T0.Z, PV.Y, T3.Z, PV.X,
-; CM-NEXT: CNDE_INT * T0.W, T3.X, T3.Y, T2.Y, BS:VEC_201
-; CM-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T0.X, T1.X, PV.W, T0.X,
+; CM-NEXT: CNDE_INT T0.Z, PV.Y, T0.X, PV.X, BS:VEC_021/SCL_122
+; CM-NEXT: MUL_IEEE * T1.W, T2.Y, literal.y,
+; CM-NEXT: 23(3.222986e-44), 2130706432(1.701412e+38)
+; CM-NEXT: CNDE_INT T0.X, T0.W, T2.Y, PV.W,
; CM-NEXT: LSHL T2.Y, PV.Z, literal.x,
; CM-NEXT: ADD_INT * T0.Z, PV.Y, literal.y,
; CM-NEXT: 23(3.222986e-44), 1065353216(1.000000e+00)
-; CM-NEXT: ALU clause starting at 105:
-; CM-NEXT: CNDE_INT * T0.W, T4.Y, T5.X, T2.X,
-; CM-NEXT: MUL_IEEE T1.X, PV.W, T0.Z,
+; CM-NEXT: ALU clause starting at 103:
+; CM-NEXT: CNDE_INT * T0.W, T2.X, T4.X, T5.X,
+; CM-NEXT: MUL_IEEE T2.X, PV.W, T0.Z,
; CM-NEXT: SETGT T0.Y, literal.x, KC0[3].X,
; CM-NEXT: ADD_INT T0.Z, T2.Y, literal.y,
-; CM-NEXT: CNDE_INT * T0.W, T1.Y, T0.X, T4.X, BS:VEC_120/SCL_212
+; CM-NEXT: CNDE_INT * T0.W, T1.Y, T1.X, T0.X, BS:VEC_120/SCL_212
; CM-NEXT: -1036817932(-4.485347e+01), 1065353216(1.000000e+00)
; CM-NEXT: MUL_IEEE T0.X, PV.W, PV.Z,
; CM-NEXT: SETGT T1.Y, literal.x, KC0[2].W,
@@ -1217,8 +1213,8 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
;
; R600-LABEL: s_exp10_v3f32:
; R600: ; %bb.0:
-; R600-NEXT: ALU 100, @6, KC0[CB0:0-32], KC1[]
-; R600-NEXT: ALU 69, @107, KC0[CB0:0-32], KC1[]
+; R600-NEXT: ALU 99, @6, KC0[CB0:0-32], KC1[]
+; R600-NEXT: ALU 69, @106, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT: CF_END
@@ -1226,69 +1222,68 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; R600-NEXT: ALU clause starting at 6:
; R600-NEXT: AND_INT * T0.W, KC0[3].Y, literal.x,
; R600-NEXT: -4096(nan), 0(0.000000e+00)
-; R600-NEXT: ADD T1.W, KC0[3].Y, -PV.W,
-; R600-NEXT: MUL_IEEE * T2.W, PV.W, literal.x,
+; R600-NEXT: MUL_IEEE T1.W, PV.W, literal.x,
+; R600-NEXT: ADD * T2.W, KC0[3].Y, -PV.W,
; R600-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
-; R600-NEXT: RNDNE T3.W, PS,
-; R600-NEXT: MUL_IEEE * T4.W, PV.W, literal.x,
+; R600-NEXT: RNDNE * T3.W, PV.W,
+; R600-NEXT: TRUNC T4.W, PV.W,
+; R600-NEXT: MUL_IEEE * T5.W, T2.W, literal.x,
; R600-NEXT: 975668412(6.390323e-04), 0(0.000000e+00)
-; R600-NEXT: MULADD_IEEE T1.W, T1.W, literal.x, PS,
-; R600-NEXT: TRUNC * T4.W, PV.W,
+; R600-NEXT: MULADD_IEEE T2.W, T2.W, literal.x, PS,
+; R600-NEXT: FLT_TO_INT * T4.W, PV.W,
; R600-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
-; R600-NEXT: FLT_TO_INT T0.Z, PS,
-; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.x, PV.W,
-; R600-NEXT: ADD * T1.W, T2.W, -T3.W,
-; R600-NEXT: 975668412(6.390323e-04), 0(0.000000e+00)
-; R600-NEXT: ADD T0.W, PS, PV.W,
-; R600-NEXT: MAX_INT * T1.W, PV.Z, literal.x,
-; R600-NEXT: -330(nan), 0(0.000000e+00)
-; R600-NEXT: ADD_INT T0.Y, PS, literal.x,
-; R600-NEXT: ADD_INT T1.Z, T0.Z, literal.y,
-; R600-NEXT: SETGT_UINT T1.W, T0.Z, literal.z,
-; R600-NEXT: EXP_IEEE * T0.X, PV.W,
+; R600-NEXT: MAX_INT T0.Z, PS, literal.x,
+; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.y, PV.W,
+; R600-NEXT: ADD * T1.W, T1.W, -T3.W,
+; R600-NEXT: -330(nan), 975668412(6.390323e-04)
+; R600-NEXT: ADD T0.Y, PS, PV.W,
+; R600-NEXT: ADD_INT T0.Z, PV.Z, literal.x,
+; R600-NEXT: ADD_INT T0.W, T4.W, literal.y,
+; R600-NEXT: SETGT_UINT * T1.W, T4.W, literal.z,
; R600-NEXT: 204(2.858649e-43), 102(1.429324e-43)
; R600-NEXT: -229(nan), 0(0.000000e+00)
-; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Y, PV.Z,
-; R600-NEXT: SETGT_INT T0.W, T0.Z, literal.x,
-; R600-NEXT: MUL_IEEE * T2.W, PS, literal.y,
-; R600-NEXT: -127(nan), 209715200(1.972152e-31)
-; R600-NEXT: MUL_IEEE T0.Y, PS, literal.x,
-; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Z, T0.Z,
-; R600-NEXT: MIN_INT T3.W, T0.Z, literal.y,
-; R600-NEXT: AND_INT * T4.W, KC0[3].W, literal.z,
-; R600-NEXT: 209715200(1.972152e-31), 381(5.338947e-43)
-; R600-NEXT: -4096(nan), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T1.X, T0.X, literal.x,
-; R600-NEXT: ADD T1.Y, KC0[3].W, -PS,
-; R600-NEXT: ADD_INT T2.Z, PV.W, literal.y,
-; R600-NEXT: ADD_INT T3.W, T0.Z, literal.z,
-; R600-NEXT: SETGT_UINT * T5.W, T0.Z, literal.w,
-; R600-NEXT: 2130706432(1.701412e+38), -254(nan)
+; R600-NEXT: CNDE_INT T0.Z, PS, PV.Z, PV.W,
+; R600-NEXT: SETGT_INT T0.W, T4.W, literal.x,
+; R600-NEXT: EXP_IEEE * T0.X, PV.Y,
+; R600-NEXT: -127(nan), 0(0.000000e+00)
+; R600-NEXT: MUL_IEEE T1.X, PS, literal.x,
+; R600-NEXT: CNDE_INT T0.Y, PV.W, PV.Z, T4.W,
+; R600-NEXT: MIN_INT T0.Z, T4.W, literal.y,
+; R600-NEXT: AND_INT T2.W, KC0[3].W, literal.z,
+; R600-NEXT: MUL_IEEE * T3.W, PS, literal.w,
+; R600-NEXT: 2130706432(1.701412e+38), 381(5.338947e-43)
+; R600-NEXT: -4096(nan), 209715200(1.972152e-31)
+; R600-NEXT: MUL_IEEE T2.X, PS, literal.x,
+; R600-NEXT: ADD T1.Y, KC0[3].W, -PV.W,
+; R600-NEXT: ADD_INT T0.Z, PV.Z, literal.y,
+; R600-NEXT: ADD_INT T5.W, T4.W, literal.z,
+; R600-NEXT: SETGT_UINT * T6.W, T4.W, literal.w,
+; R600-NEXT: 209715200(1.972152e-31), -254(nan)
; R600-NEXT: -127(nan), 254(3.559298e-43)
-; R600-NEXT: CNDE_INT T2.X, PS, PV.W, PV.Z,
-; R600-NEXT: SETGT_INT T2.Y, T0.Z, literal.x,
+; R600-NEXT: CNDE_INT T3.X, PS, PV.W, PV.Z,
+; R600-NEXT: SETGT_INT T2.Y, T4.W, literal.x,
; R600-NEXT: MUL_IEEE T0.Z, PV.Y, literal.y,
-; R600-NEXT: MUL_IEEE T3.W, T4.W, literal.z,
-; R600-NEXT: MUL_IEEE * T6.W, PV.X, literal.w,
+; R600-NEXT: MUL_IEEE * T4.W, T2.W, literal.z, BS:VEC_120/SCL_212
; R600-NEXT: 127(1.779649e-43), 975668412(6.390323e-04)
-; R600-NEXT: 1079283712(3.321289e+00), 2130706432(1.701412e+38)
-; R600-NEXT: CNDE_INT T1.X, T5.W, T1.X, PS, BS:VEC_120/SCL_212
-; R600-NEXT: RNDNE T3.Y, PV.W,
-; R600-NEXT: MULADD_IEEE T0.Z, T1.Y, literal.x, PV.Z,
-; R600-NEXT: CNDE_INT T5.W, PV.Y, T1.Z, PV.X,
-; R600-NEXT: CNDE_INT * T1.W, T1.W, T0.Y, T2.W,
; R600-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
-; R600-NEXT: CNDE_INT T0.X, T0.W, PS, T0.X,
+; R600-NEXT: CNDE_INT * T1.W, T1.W, T2.X, T3.W,
+; R600-NEXT: CNDE_INT T0.X, T0.W, PV.W, T0.X, BS:VEC_021/SCL_122
+; R600-NEXT: RNDNE T3.Y, T4.W, BS:VEC_120/SCL_212
+; R600-NEXT: MULADD_IEEE T0.Z, T1.Y, literal.x, T0.Z,
+; R600-NEXT: CNDE_INT T0.W, T2.Y, T0.Y, T3.X, BS:VEC_120/SCL_212
+; R600-NEXT: MUL_IEEE * T1.W, T1.X, literal.y,
+; R600-NEXT: 1079283712(3.321289e+00), 2130706432(1.701412e+38)
+; R600-NEXT: CNDE_INT T1.X, T6.W, T1.X, PS,
; R600-NEXT: LSHL T0.Y, PV.W, literal.x,
; R600-NEXT: AND_INT T1.Z, KC0[3].Z, literal.y,
-; R600-NEXT: MULADD_IEEE T0.W, T4.W, literal.z, PV.Z, BS:VEC_120/SCL_212
-; R600-NEXT: ADD * T1.W, T3.W, -PV.Y,
+; R600-NEXT: MULADD_IEEE T0.W, T2.W, literal.z, PV.Z, BS:VEC_120/SCL_212
+; R600-NEXT: ADD * T1.W, T4.W, -PV.Y,
; R600-NEXT: 23(3.222986e-44), -4096(nan)
; R600-NEXT: 975668412(6.390323e-04), 0(0.000000e+00)
; R600-NEXT: ADD T1.Y, PS, PV.W,
; R600-NEXT: MUL_IEEE T0.Z, PV.Z, literal.x,
; R600-NEXT: ADD_INT T0.W, PV.Y, literal.y,
-; R600-NEXT: CNDE_INT * T1.W, T2.Y, PV.X, T1.X,
+; R600-NEXT: CNDE_INT * T1.W, T2.Y, T0.X, PV.X,
; R600-NEXT: 1079283712(3.321289e+00), 1065353216(1.000000e+00)
; R600-NEXT: MUL_IEEE T0.X, PS, PV.W,
; R600-NEXT: ADD T0.Y, KC0[3].Z, -T1.Z,
@@ -1302,12 +1297,12 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; R600-NEXT: MUL_IEEE * T1.W, PS, literal.z,
; R600-NEXT: -1036817932(-4.485347e+01), 975668412(6.390323e-04)
; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T3.X, T1.X, literal.x,
-; R600-NEXT: MUL_IEEE T2.Y, PS, literal.y,
+; R600-NEXT: MUL_IEEE T3.X, PS, literal.x,
+; R600-NEXT: MUL_IEEE T2.Y, T1.X, literal.y,
; R600-NEXT: MULADD_IEEE T4.Z, T0.Y, literal.z, PV.W,
; R600-NEXT: FLT_TO_INT T0.W, PV.Z,
; R600-NEXT: MIN_INT * T2.W, PV.Y, literal.w,
-; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31)
+; R600-NEXT: 209715200(1.972152e-31), 2130706432(1.701412e+38)
; R600-NEXT: 1079283712(3.321289e+00), 381(5.338947e-43)
; R600-NEXT: ADD_INT T4.X, PS, literal.x,
; R600-NEXT: MAX_INT T0.Y, PV.W, literal.y,
@@ -1325,7 +1320,7 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; R600-NEXT: 102(1.429324e-43), -229(nan)
; R600-NEXT: ADD_INT * T6.X, T0.W, literal.x,
; R600-NEXT: -127(nan), 0(0.000000e+00)
-; R600-NEXT: ALU clause starting at 107:
+; R600-NEXT: ALU clause starting at 106:
; R600-NEXT: SETGT_UINT T0.Y, T0.W, literal.x,
; R600-NEXT: CNDE_INT T0.Z, T3.W, T0.Z, T2.W, BS:VEC_102/SCL_221
; R600-NEXT: SETGT_INT T2.W, T0.W, literal.y,
@@ -1341,25 +1336,25 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; R600-NEXT: SETGT_UINT T5.X, T1.Y, literal.x,
; R600-NEXT: CNDE_INT T4.Y, PS, PV.Z, PV.W,
; R600-NEXT: MAX_INT T0.Z, T1.Y, literal.y,
-; R600-NEXT: MUL_IEEE T4.W, T1.Z, literal.z,
-; R600-NEXT: MUL_IEEE * T5.W, PV.Y, literal.w,
+; R600-NEXT: MUL_IEEE T4.W, PV.Y, literal.z,
+; R600-NEXT: MUL_IEEE * T5.W, T1.Z, literal.w,
; R600-NEXT: 254(3.559298e-43), -330(nan)
-; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31)
-; R600-NEXT: CNDE_INT T6.X, T3.W, PS, T3.Y, BS:VEC_021/SCL_122
-; R600-NEXT: MUL_IEEE T3.Y, PV.W, literal.x,
+; R600-NEXT: 209715200(1.972152e-31), 2130706432(1.701412e+38)
+; R600-NEXT: MUL_IEEE T6.X, PS, literal.x,
+; R600-NEXT: CNDE_INT T3.Y, T3.W, PV.W, T3.Y, BS:VEC_021/SCL_122
; R600-NEXT: ADD_INT T0.Z, PV.Z, literal.y,
; R600-NEXT: ADD_INT T3.W, T1.Y, literal.z,
-; R600-NEXT: SETGT_UINT * T5.W, T1.Y, literal.w,
+; R600-NEXT: SETGT_UINT * T4.W, T1.Y, literal.w,
; R600-NEXT: 2130706432(1.701412e+38), 204(2.858649e-43)
; R600-NEXT: 102(1.429324e-43), -229(nan)
; R600-NEXT: CNDE_INT T8.X, PS, PV.Z, PV.W,
; R600-NEXT: SETGT_INT T5.Y, T1.Y, literal.x,
-; R600-NEXT: CNDE_INT T0.Z, T0.Y, T4.W, PV.Y, BS:VEC_120/SCL_212
-; R600-NEXT: CNDE_INT T2.W, T2.W, PV.X, T1.Z,
+; R600-NEXT: CNDE_INT T0.Z, T2.W, PV.Y, T1.Z,
+; R600-NEXT: CNDE_INT T2.W, T0.Y, T5.W, PV.X, BS:VEC_120/SCL_212
; R600-NEXT: LSHL * T3.W, T4.Y, literal.y,
; R600-NEXT: -127(nan), 23(3.222986e-44)
; R600-NEXT: ADD_INT T6.X, PS, literal.x,
-; R600-NEXT: CNDE_INT T0.Y, T0.W, PV.W, PV.Z,
+; R600-NEXT: CNDE_INT T0.Y, T0.W, PV.Z, PV.W,
; R600-NEXT: CNDE_INT T0.Z, PV.Y, PV.X, T1.Y,
; R600-NEXT: CNDE_INT T0.W, T5.X, T7.X, T4.X,
; R600-NEXT: SETGT_INT * T2.W, T1.Y, literal.y,
@@ -1367,18 +1362,18 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; R600-NEXT: CNDE_INT T4.X, PS, PV.Z, PV.W,
; R600-NEXT: MUL_IEEE T0.Y, PV.Y, PV.X,
; R600-NEXT: SETGT T0.Z, literal.x, KC0[3].Z,
-; R600-NEXT: CNDE_INT T0.W, T5.W, T2.Y, T1.W,
-; R600-NEXT: MUL_IEEE * T1.W, T3.X, literal.y,
+; R600-NEXT: MUL_IEEE T0.W, T2.Y, literal.y,
+; R600-NEXT: CNDE_INT * T1.W, T4.W, T3.X, T1.W,
; R600-NEXT: -1036817932(-4.485347e+01), 2130706432(1.701412e+38)
-; R600-NEXT: CNDE_INT T3.X, T5.X, T3.X, PS,
-; R600-NEXT: CNDE_INT T1.Y, T5.Y, PV.W, T1.X,
+; R600-NEXT: CNDE_INT T1.X, T5.Y, PS, T1.X,
+; R600-NEXT: CNDE_INT T1.Y, T5.X, T2.Y, PV.W,
; R600-NEXT: CNDE T0.Z, PV.Z, PV.Y, 0.0,
; R600-NEXT: SETGT T0.W, KC0[3].Z, literal.x,
; R600-NEXT: LSHL * T1.W, PV.X, literal.y,
; R600-NEXT: 1109008539(3.853184e+01), 23(3.222986e-44)
-; R600-NEXT: ADD_INT T1.X, PS, literal.x,
+; R600-NEXT: ADD_INT T3.X, PS, literal.x,
; R600-NEXT: CNDE T0.Y, PV.W, PV.Z, literal.y,
-; R600-NEXT: CNDE_INT T0.Z, T2.W, PV.Y, PV.X,
+; R600-NEXT: CNDE_INT T0.Z, T2.W, PV.X, PV.Y,
; R600-NEXT: CNDE T0.W, T2.X, T0.X, 0.0,
; R600-NEXT: SETGT * T1.W, KC0[3].Y, literal.z,
; R600-NEXT: 1065353216(1.000000e+00), 2139095040(INF)
@@ -1399,197 +1394,193 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
;
; CM-LABEL: s_exp10_v3f32:
; CM: ; %bb.0:
-; CM-NEXT: ALU 102, @6, KC0[CB0:0-32], KC1[]
-; CM-NEXT: ALU 80, @109, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T0.X
+; CM-NEXT: ALU 101, @6, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 77, @108, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T3.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 6:
; CM-NEXT: AND_INT * T0.W, KC0[3].Y, literal.x,
; CM-NEXT: -4096(nan), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T0.Z, PV.W, literal.x,
; CM-NEXT: ADD * T1.W, KC0[3].Y, -PV.W,
-; CM-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T1.Z, PV.W, literal.x,
-; CM-NEXT: RNDNE * T2.W, PV.Z,
-; CM-NEXT: 975668412(6.390323e-04), 0(0.000000e+00)
-; CM-NEXT: TRUNC T2.Z, PV.W,
+; CM-NEXT: MUL_IEEE T0.Z, PV.W, literal.x,
+; CM-NEXT: MUL_IEEE * T2.W, T0.W, literal.y,
+; CM-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00)
+; CM-NEXT: RNDNE T1.Z, PV.W,
; CM-NEXT: MULADD_IEEE * T1.W, T1.W, literal.x, PV.Z,
; CM-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
-; CM-NEXT: MULADD_IEEE T0.Y, T0.W, literal.x, PV.W,
-; CM-NEXT: ADD T0.Z, T0.Z, -T2.W,
-; CM-NEXT: FLT_TO_INT * T0.W, PV.Z,
+; CM-NEXT: MULADD_IEEE T0.Z, T0.W, literal.x, PV.W,
+; CM-NEXT: ADD * T0.W, T2.W, -PV.Z, BS:VEC_120/SCL_212
; CM-NEXT: 975668412(6.390323e-04), 0(0.000000e+00)
-; CM-NEXT: MIN_INT T1.Z, PV.W, literal.x,
-; CM-NEXT: ADD * T1.W, PV.Z, PV.Y,
+; CM-NEXT: TRUNC T1.Z, T1.Z,
+; CM-NEXT: ADD * T0.W, PV.W, PV.Z,
+; CM-NEXT: EXP_IEEE T0.X, T0.W,
+; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W,
+; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W,
+; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W,
+; CM-NEXT: FLT_TO_INT T0.Z, T1.Z,
+; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.x,
+; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
+; CM-NEXT: MUL_IEEE T0.Y, PV.W, literal.x,
+; CM-NEXT: MAX_INT T1.Z, PV.Z, literal.y,
+; CM-NEXT: MIN_INT * T1.W, PV.Z, literal.z,
+; CM-NEXT: 209715200(1.972152e-31), -330(nan)
; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00)
-; CM-NEXT: EXP_IEEE T0.X, T1.W,
-; CM-NEXT: EXP_IEEE T0.Y (MASKED), T1.W,
-; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W,
-; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W,
-; CM-NEXT: MUL_IEEE T0.Y, PV.X, literal.x,
-; CM-NEXT: ADD_INT T0.Z, T1.Z, literal.y,
-; CM-NEXT: MAX_INT * T1.W, T0.W, literal.z,
-; CM-NEXT: 2130706432(1.701412e+38), -254(nan)
-; CM-NEXT: -330(nan), 0(0.000000e+00)
-; CM-NEXT: ADD_INT T1.X, T0.W, literal.x,
-; CM-NEXT: ADD_INT T1.Y, PV.W, literal.y,
-; CM-NEXT: ADD_INT T1.Z, T0.W, literal.z,
-; CM-NEXT: SETGT_UINT * T1.W, T0.W, literal.w,
-; CM-NEXT: -127(nan), 204(2.858649e-43)
+; CM-NEXT: ADD_INT T1.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T1.Y, PV.Z, literal.y,
+; CM-NEXT: ADD_INT T1.Z, T0.Z, literal.z,
+; CM-NEXT: SETGT_UINT * T1.W, T0.Z, literal.w,
+; CM-NEXT: -254(nan), 204(2.858649e-43)
; CM-NEXT: 102(1.429324e-43), -229(nan)
-; CM-NEXT: SETGT_UINT T2.X, T0.W, literal.x,
-; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Y, PV.Z,
-; CM-NEXT: SETGT_INT T1.Z, T0.W, literal.y,
-; CM-NEXT: MUL_IEEE * T2.W, T0.X, literal.z,
-; CM-NEXT: 254(3.559298e-43), -127(nan)
-; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T3.X, PV.W, literal.x,
-; CM-NEXT: CNDE_INT T1.Y, PV.Z, PV.Y, T0.W,
-; CM-NEXT: CNDE_INT T0.Z, PV.X, T1.X, T0.Z,
-; CM-NEXT: SETGT_INT * T0.W, T0.W, literal.y,
-; CM-NEXT: 209715200(1.972152e-31), 127(1.779649e-43)
+; CM-NEXT: ADD_INT T2.X, T0.Z, literal.x,
+; CM-NEXT: SETGT_UINT T2.Y, T0.Z, literal.y,
+; CM-NEXT: CNDE_INT T1.Z, PV.W, PV.Y, PV.Z,
+; CM-NEXT: SETGT_INT * T2.W, T0.Z, literal.x,
+; CM-NEXT: -127(nan), 254(3.559298e-43)
+; CM-NEXT: MUL_IEEE T3.X, T0.X, literal.x,
+; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Z, T0.Z,
+; CM-NEXT: CNDE_INT T1.Z, PV.Y, PV.X, T1.X,
+; CM-NEXT: SETGT_INT * T3.W, T0.Z, literal.y,
+; CM-NEXT: 2130706432(1.701412e+38), 127(1.779649e-43)
; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Y, PV.Z,
-; CM-NEXT: CNDE_INT T0.Z, T1.W, PV.X, T2.W,
-; CM-NEXT: MUL_IEEE * T1.W, T0.Y, literal.x,
+; CM-NEXT: MUL_IEEE T0.Z, PV.X, literal.x,
+; CM-NEXT: CNDE_INT * T0.W, T1.W, T0.Y, T0.W,
; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T1.X, T2.X, T0.Y, PV.W,
-; CM-NEXT: CNDE_INT T0.Y, T1.Z, PV.Z, T0.X,
+; CM-NEXT: CNDE_INT T0.X, T2.W, PV.W, T0.X,
+; CM-NEXT: CNDE_INT T0.Y, T2.Y, T3.X, PV.Z,
; CM-NEXT: LSHL T0.Z, PV.Y, literal.x,
-; CM-NEXT: AND_INT * T1.W, KC0[3].Z, literal.y,
+; CM-NEXT: AND_INT * T0.W, KC0[3].Z, literal.y,
; CM-NEXT: 23(3.222986e-44), -4096(nan)
-; CM-NEXT: MUL_IEEE T0.X, PV.W, literal.x,
; CM-NEXT: ADD T1.Y, KC0[3].Z, -PV.W,
-; CM-NEXT: ADD_INT T0.Z, PV.Z, literal.y,
-; CM-NEXT: CNDE_INT * T0.W, T0.W, PV.Y, PV.X,
-; CM-NEXT: 1079283712(3.321289e+00), 1065353216(1.000000e+00)
-; CM-NEXT: MUL_IEEE T0.Y, PV.W, PV.Z,
-; CM-NEXT: MUL_IEEE T0.Z, PV.Y, literal.x,
-; CM-NEXT: RNDNE * T0.W, PV.X,
-; CM-NEXT: 975668412(6.390323e-04), 0(0.000000e+00)
+; CM-NEXT: ADD_INT T0.Z, PV.Z, literal.x,
+; CM-NEXT: CNDE_INT * T1.W, T3.W, PV.X, PV.Y,
+; CM-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00)
+; CM-NEXT: MUL_IEEE T0.X, PV.W, PV.Z,
+; CM-NEXT: MUL_IEEE T0.Y, PV.Y, literal.x,
+; CM-NEXT: MUL_IEEE T0.Z, T0.W, literal.y,
+; CM-NEXT: AND_INT * T1.W, KC0[3].W, literal.z,
+; CM-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00)
+; CM-NEXT: -4096(nan), 0(0.000000e+00)
; CM-NEXT: SETGT T1.X, literal.x, KC0[3].Y,
-; CM-NEXT: TRUNC T2.Y, PV.W,
-; CM-NEXT: AND_INT T1.Z, KC0[3].W, literal.y,
-; CM-NEXT: MULADD_IEEE * T2.W, T1.Y, literal.z, PV.Z,
-; CM-NEXT: -1036817932(-4.485347e+01), -4096(nan)
-; CM-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
-; CM-NEXT: MULADD_IEEE T2.X, T1.W, literal.x, PV.W,
-; CM-NEXT: MUL_IEEE T1.Y, PV.Z, literal.y,
-; CM-NEXT: FLT_TO_INT T0.Z, PV.Y,
-; CM-NEXT: ADD * T1.W, KC0[3].W, -PV.Z,
+; CM-NEXT: ADD T2.Y, KC0[3].W, -PV.W,
+; CM-NEXT: RNDNE T1.Z, PV.Z,
+; CM-NEXT: MULADD_IEEE * T2.W, T1.Y, literal.y, PV.Y,
+; CM-NEXT: -1036817932(-4.485347e+01), 1079283712(3.321289e+00)
+; CM-NEXT: MULADD_IEEE T2.X, T0.W, literal.x, PV.W,
+; CM-NEXT: ADD T0.Y, T0.Z, -PV.Z,
+; CM-NEXT: MUL_IEEE T0.Z, PV.Y, literal.x,
+; CM-NEXT: MUL_IEEE * T0.W, T1.W, literal.y, BS:VEC_120/SCL_212
; CM-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00)
-; CM-NEXT: ADD T0.X, T0.X, -T0.W,
-; CM-NEXT: MUL_IEEE T2.Y, PV.W, literal.x,
-; CM-NEXT: MAX_INT T2.Z, PV.Z, literal.y,
-; CM-NEXT: RNDNE * T0.W, PV.Y,
-; CM-NEXT: 975668412(6.390323e-04), -330(nan)
-; CM-NEXT: TRUNC T3.X, PV.W,
-; CM-NEXT: ADD_INT T3.Y, PV.Z, literal.x,
-; CM-NEXT: MULADD_IEEE T2.Z, T1.W, literal.y, PV.Y,
-; CM-NEXT: ADD * T1.W, PV.X, T2.X,
-; CM-NEXT: 204(2.858649e-43), 1079283712(3.321289e+00)
-; CM-NEXT: EXP_IEEE T0.X, T1.W,
-; CM-NEXT: EXP_IEEE T0.Y (MASKED), T1.W,
-; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W,
-; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W,
-; CM-NEXT: ADD_INT T2.X, T0.Z, literal.x,
-; CM-NEXT: MULADD_IEEE T2.Y, T1.Z, literal.y, T2.Z, BS:VEC_102/SCL_221
-; CM-NEXT: ADD T1.Z, T1.Y, -T0.W,
-; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.z,
-; CM-NEXT: 102(1.429324e-43), 975668412(6.390323e-04)
-; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; CM-NEXT: SETGT_UINT T4.X, T0.Z, literal.x,
-; CM-NEXT: MUL_IEEE T1.Y, PV.W, literal.y,
-; CM-NEXT: SETGT_UINT T2.Z, T0.Z, literal.z,
-; CM-NEXT: ADD * T1.W, PV.Z, PV.Y,
-; CM-NEXT: -229(nan), 2130706432(1.701412e+38)
-; CM-NEXT: 254(3.559298e-43), 0(0.000000e+00)
+; CM-NEXT: TRUNC T3.X, T1.Z,
+; CM-NEXT: RNDNE T1.Y, PV.W,
+; CM-NEXT: MULADD_IEEE T0.Z, T2.Y, literal.x, PV.Z,
+; CM-NEXT: ADD * T2.W, PV.Y, PV.X,
+; CM-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
+; CM-NEXT: EXP_IEEE T0.X (MASKED), T2.W,
+; CM-NEXT: EXP_IEEE T0.Y, T2.W,
+; CM-NEXT: EXP_IEEE T0.Z (MASKED), T2.W,
+; CM-NEXT: EXP_IEEE * T0.W (MASKED), T2.W,
+; CM-NEXT: MULADD_IEEE T2.X, T1.W, literal.x, T0.Z,
+; CM-NEXT: ADD T2.Y, T0.W, -T1.Y, BS:VEC_120/SCL_212
+; CM-NEXT: FLT_TO_INT T0.Z, T3.X,
+; CM-NEXT: MUL_IEEE * T0.W, PV.Y, literal.y,
+; CM-NEXT: 975668412(6.390323e-04), 209715200(1.972152e-31)
+; CM-NEXT: MUL_IEEE T3.X, PV.W, literal.x,
+; CM-NEXT: SETGT_UINT T3.Y, PV.Z, literal.y,
+; CM-NEXT: TRUNC T1.Z, T1.Y,
+; CM-NEXT: ADD * T1.W, PV.Y, PV.X,
+; CM-NEXT: 209715200(1.972152e-31), -229(nan)
; CM-NEXT: EXP_IEEE T1.X (MASKED), T1.W,
-; CM-NEXT: EXP_IEEE T1.Y (MASKED), T1.W,
-; CM-NEXT: EXP_IEEE T1.Z, T1.W,
+; CM-NEXT: EXP_IEEE T1.Y, T1.W,
+; CM-NEXT: EXP_IEEE T1.Z (MASKED), T1.W,
; CM-NEXT: EXP_IEEE * T1.W (MASKED), T1.W,
-; CM-NEXT: ALU clause starting at 109:
-; CM-NEXT: CNDE_INT T5.X, T2.Z, T0.W, T1.Y,
-; CM-NEXT: CNDE_INT T1.Y, T4.X, T3.Y, T2.X,
-; CM-NEXT: FLT_TO_INT T3.Z, T3.X, BS:VEC_120/SCL_212
-; CM-NEXT: MUL_IEEE * T0.W, T1.Z, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; CM-NEXT: SETGT_INT T2.X, T0.Z, literal.x,
-; CM-NEXT: MUL_IEEE T2.Y, T0.X, literal.y,
-; CM-NEXT: MUL_IEEE T4.Z, PV.W, literal.z,
-; CM-NEXT: SETGT_UINT * T1.W, PV.Z, literal.w,
-; CM-NEXT: -127(nan), 209715200(1.972152e-31)
-; CM-NEXT: 2130706432(1.701412e+38), 254(3.559298e-43)
-; CM-NEXT: CNDE_INT T3.X, PV.W, T0.W, PV.Z,
-; CM-NEXT: MUL_IEEE T3.Y, PV.Y, literal.x,
-; CM-NEXT: CNDE_INT T4.Z, PV.X, T1.Y, T0.Z,
-; CM-NEXT: MAX_INT * T0.W, T3.Z, literal.y,
-; CM-NEXT: 209715200(1.972152e-31), -330(nan)
-; CM-NEXT: ADD_INT T6.X, PV.W, literal.x,
-; CM-NEXT: ADD_INT T1.Y, T3.Z, literal.y,
-; CM-NEXT: SETGT_UINT T5.Z, T3.Z, literal.z,
-; CM-NEXT: MUL_IEEE * T0.W, T1.Z, literal.w, BS:VEC_120/SCL_212
+; CM-NEXT: FLT_TO_INT T2.X, T1.Z,
+; CM-NEXT: MUL_IEEE T2.Y, PV.Y, literal.x,
+; CM-NEXT: CNDE_INT T1.Z, T3.Y, T3.X, T0.W,
+; CM-NEXT: SETGT_INT * T0.W, T0.Z, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT: 209715200(1.972152e-31), -127(nan)
+; CM-NEXT: CNDE_INT T3.X, PV.W, PV.Z, T0.Y,
+; CM-NEXT: MUL_IEEE * T4.Y, PV.Y, literal.x,
+; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
+; CM-NEXT: ALU clause starting at 108:
+; CM-NEXT: SETGT_UINT T1.Z, T2.X, literal.x,
+; CM-NEXT: MAX_INT * T1.W, T0.Z, literal.y,
+; CM-NEXT: -229(nan), -330(nan)
+; CM-NEXT: ADD_INT T4.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T5.Y, T0.Z, literal.y,
+; CM-NEXT: CNDE_INT T2.Z, PV.Z, T4.Y, T2.Y,
+; CM-NEXT: SETGT_INT * T1.W, T2.X, literal.z,
; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43)
-; CM-NEXT: -229(nan), 209715200(1.972152e-31)
-; CM-NEXT: MUL_IEEE T7.X, PV.W, literal.x,
-; CM-NEXT: MIN_INT T4.Y, T3.Z, literal.y,
-; CM-NEXT: CNDE_INT T6.Z, PV.Z, PV.X, PV.Y,
-; CM-NEXT: SETGT_INT * T2.W, T3.Z, literal.z,
-; CM-NEXT: 209715200(1.972152e-31), 381(5.338947e-43)
; CM-NEXT: -127(nan), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T6.X, PV.W, PV.Z, T3.Z,
-; CM-NEXT: MIN_INT T1.Y, T0.Z, literal.x,
-; CM-NEXT: ADD_INT T6.Z, PV.Y, literal.y,
-; CM-NEXT: ADD_INT * T3.W, T3.Z, literal.z, BS:VEC_120/SCL_212
-; CM-NEXT: 381(5.338947e-43), -254(nan)
-; CM-NEXT: -127(nan), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T8.X, T1.W, PV.W, PV.Z,
-; CM-NEXT: SETGT_INT T4.Y, T3.Z, literal.x,
-; CM-NEXT: ADD_INT T3.Z, PV.Y, literal.y,
-; CM-NEXT: ADD_INT * T1.W, T0.Z, literal.z, BS:VEC_120/SCL_212
+; CM-NEXT: CNDE_INT T5.X, PV.W, PV.Z, T1.Y,
+; CM-NEXT: MUL_IEEE T0.Y, T0.Y, literal.x,
+; CM-NEXT: MAX_INT T2.Z, T2.X, literal.y,
+; CM-NEXT: CNDE_INT * T2.W, T3.Y, PV.X, PV.Y, BS:VEC_120/SCL_212
+; CM-NEXT: 2130706432(1.701412e+38), -330(nan)
+; CM-NEXT: CNDE_INT T4.X, T0.W, PV.W, T0.Z,
+; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.x,
+; CM-NEXT: ADD_INT T2.Z, T2.X, literal.y,
+; CM-NEXT: MIN_INT * T0.W, T2.X, literal.z,
+; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43)
+; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00)
+; CM-NEXT: ADD_INT T6.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T3.Y, T2.X, literal.y,
+; CM-NEXT: SETGT_UINT T3.Z, T2.X, literal.z,
+; CM-NEXT: CNDE_INT * T0.W, T1.Z, PV.Y, PV.Z,
+; CM-NEXT: -254(nan), -127(nan)
+; CM-NEXT: 254(3.559298e-43), 0(0.000000e+00)
+; CM-NEXT: MUL_IEEE T7.X, T1.Y, literal.x,
+; CM-NEXT: CNDE_INT T1.Y, T1.W, PV.W, T2.X,
+; CM-NEXT: CNDE_INT T1.Z, PV.Z, PV.Y, PV.X,
+; CM-NEXT: MIN_INT * T0.W, T0.Z, literal.y,
+; CM-NEXT: 2130706432(1.701412e+38), 381(5.338947e-43)
+; CM-NEXT: SETGT_INT T2.X, T2.X, literal.x,
+; CM-NEXT: ADD_INT T2.Y, PV.W, literal.y,
+; CM-NEXT: ADD_INT T2.Z, T0.Z, literal.z,
+; CM-NEXT: SETGT_UINT * T0.W, T0.Z, literal.w,
; CM-NEXT: 127(1.779649e-43), -254(nan)
-; CM-NEXT: -127(nan), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T9.X, T2.Z, PV.W, PV.Z,
-; CM-NEXT: SETGT_INT T1.Y, T0.Z, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT: CNDE_INT T0.Z, PV.Y, T6.X, PV.X,
-; CM-NEXT: CNDE_INT * T0.W, T5.Z, T7.X, T0.W, BS:VEC_201
-; CM-NEXT: 127(1.779649e-43), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T6.X, T2.W, PV.W, T1.Z,
-; CM-NEXT: LSHL T5.Y, PV.Z, literal.x,
-; CM-NEXT: CNDE_INT T0.Z, PV.Y, T4.Z, PV.X,
-; CM-NEXT: CNDE_INT * T0.W, T4.X, T3.Y, T2.Y,
-; CM-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T0.X, T2.X, PV.W, T0.X,
-; CM-NEXT: LSHL T2.Y, PV.Z, literal.x,
+; CM-NEXT: -127(nan), 254(3.559298e-43)
+; CM-NEXT: CNDE_INT T6.X, PV.W, PV.Z, PV.Y,
+; CM-NEXT: SETGT_INT T2.Y, T0.Z, literal.x,
+; CM-NEXT: CNDE_INT T0.Z, PV.X, T1.Y, T1.Z,
+; CM-NEXT: MUL_IEEE * T1.W, T7.X, literal.y,
+; CM-NEXT: 127(1.779649e-43), 2130706432(1.701412e+38)
+; CM-NEXT: CNDE_INT T7.X, T3.Z, T7.X, PV.W,
+; CM-NEXT: LSHL T1.Y, PV.Z, literal.x,
+; CM-NEXT: CNDE_INT T0.Z, PV.Y, T4.X, PV.X, BS:VEC_021/SCL_122
+; CM-NEXT: MUL_IEEE * T1.W, T0.Y, literal.y,
+; CM-NEXT: 23(3.222986e-44), 2130706432(1.701412e+38)
+; CM-NEXT: CNDE_INT T4.X, T0.W, T0.Y, PV.W,
+; CM-NEXT: LSHL T0.Y, PV.Z, literal.x,
; CM-NEXT: ADD_INT T0.Z, PV.Y, literal.y,
-; CM-NEXT: CNDE_INT * T0.W, T4.Y, PV.X, T3.X, BS:VEC_021/SCL_122
+; CM-NEXT: CNDE_INT * T0.W, T2.X, T5.X, PV.X,
; CM-NEXT: 23(3.222986e-44), 1065353216(1.000000e+00)
; CM-NEXT: MUL_IEEE T2.X, PV.W, PV.Z,
-; CM-NEXT: SETGT T3.Y, literal.x, KC0[3].W,
+; CM-NEXT: SETGT T1.Y, literal.x, KC0[3].W,
; CM-NEXT: ADD_INT T0.Z, PV.Y, literal.y,
-; CM-NEXT: CNDE_INT * T0.W, T1.Y, PV.X, T5.X,
+; CM-NEXT: CNDE_INT * T0.W, T2.Y, T3.X, PV.X,
; CM-NEXT: -1036817932(-4.485347e+01), 1065353216(1.000000e+00)
-; CM-NEXT: MUL_IEEE T0.X, PV.W, PV.Z,
-; CM-NEXT: SETGT T1.Y, literal.x, KC0[3].Z,
+; CM-NEXT: MUL_IEEE T3.X, PV.W, PV.Z,
+; CM-NEXT: SETGT T0.Y, literal.x, KC0[3].Z,
; CM-NEXT: CNDE T0.Z, PV.Y, PV.X, 0.0,
; CM-NEXT: SETGT * T0.W, KC0[3].W, literal.y,
; CM-NEXT: -1036817932(-4.485347e+01), 1109008539(3.853184e+01)
; CM-NEXT: CNDE T2.X, PV.W, PV.Z, literal.x,
-; CM-NEXT: CNDE T1.Y, PV.Y, PV.X, 0.0,
+; CM-NEXT: CNDE T0.Y, PV.Y, PV.X, 0.0,
; CM-NEXT: SETGT T0.Z, KC0[3].Z, literal.y,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
; CM-NEXT: 2139095040(INF), 1109008539(3.853184e+01)
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT: LSHR T0.X, PV.W, literal.x,
-; CM-NEXT: CNDE T1.Y, PV.Z, PV.Y, literal.y,
-; CM-NEXT: CNDE T0.Z, T1.X, T0.Y, 0.0,
+; CM-NEXT: LSHR T3.X, PV.W, literal.x,
+; CM-NEXT: CNDE T0.Y, PV.Z, PV.Y, literal.y,
+; CM-NEXT: CNDE T0.Z, T1.X, T0.X, 0.0,
; CM-NEXT: SETGT * T0.W, KC0[3].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 2139095040(INF)
; CM-NEXT: 1109008539(3.853184e+01), 0(0.000000e+00)
-; CM-NEXT: CNDE * T1.X, PV.W, PV.Z, literal.x,
+; CM-NEXT: CNDE * T0.X, PV.W, PV.Z, literal.x,
; CM-NEXT: 2139095040(INF), 0(0.000000e+00)
-; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
+; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%result = call <3 x float> @llvm.exp10.v3f32(<3 x float> %in)
store <3 x float> %result, ptr addrspace(1) %out
@@ -2052,227 +2043,224 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; R600-LABEL: s_exp10_v4f32:
; R600: ; %bb.0:
; R600-NEXT: ALU 98, @6, KC0[CB0:0-32], KC1[]
-; R600-NEXT: ALU 98, @105, KC0[CB0:0-32], KC1[]
-; R600-NEXT: ALU 24, @204, KC0[CB0:0-32], KC1[]
+; R600-NEXT: ALU 95, @105, KC0[CB0:0-32], KC1[]
+; R600-NEXT: ALU 24, @201, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 6:
; R600-NEXT: AND_INT * T0.W, KC0[3].Z, literal.x,
; R600-NEXT: -4096(nan), 0(0.000000e+00)
-; R600-NEXT: ADD T1.W, KC0[3].Z, -PV.W,
-; R600-NEXT: MUL_IEEE * T2.W, PV.W, literal.x,
+; R600-NEXT: ADD * T1.W, KC0[3].Z, -PV.W,
+; R600-NEXT: MUL_IEEE T2.W, PV.W, literal.x,
+; R600-NEXT: MUL_IEEE * T3.W, T0.W, literal.y,
+; R600-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00)
+; R600-NEXT: RNDNE T4.W, PS,
+; R600-NEXT: MULADD_IEEE * T1.W, T1.W, literal.x, PV.W, BS:VEC_021/SCL_122
; R600-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
-; R600-NEXT: RNDNE T3.W, PS,
-; R600-NEXT: MUL_IEEE * T4.W, PV.W, literal.x,
+; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.x, PS,
+; R600-NEXT: ADD * T1.W, T3.W, -PV.W,
; R600-NEXT: 975668412(6.390323e-04), 0(0.000000e+00)
-; R600-NEXT: MULADD_IEEE T1.W, T1.W, literal.x, PS,
-; R600-NEXT: TRUNC * T4.W, PV.W,
-; R600-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
-; R600-NEXT: FLT_TO_INT T0.Z, PS,
-; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.x, PV.W,
-; R600-NEXT: ADD * T1.W, T2.W, -T3.W,
-; R600-NEXT: 975668412(6.390323e-04), 0(0.000000e+00)
-; R600-NEXT: ADD T1.Z, PS, PV.W,
-; R600-NEXT: MAX_INT T0.W, PV.Z, literal.x,
-; R600-NEXT: MIN_INT * T1.W, PV.Z, literal.y,
-; R600-NEXT: -330(nan), 381(5.338947e-43)
-; R600-NEXT: ADD_INT T0.X, PS, literal.x,
-; R600-NEXT: ADD_INT T0.Y, PV.W, literal.y,
-; R600-NEXT: ADD_INT T2.Z, T0.Z, literal.z,
-; R600-NEXT: SETGT_UINT T0.W, T0.Z, literal.w,
-; R600-NEXT: EXP_IEEE * T1.X, PV.Z,
-; R600-NEXT: -254(nan), 204(2.858649e-43)
-; R600-NEXT: 102(1.429324e-43), -229(nan)
-; R600-NEXT: ADD_INT T2.X, T0.Z, literal.x,
-; R600-NEXT: SETGT_UINT T1.Y, T0.Z, literal.y,
-; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Y, PV.Z,
-; R600-NEXT: SETGT_INT T1.W, T0.Z, literal.x,
-; R600-NEXT: MUL_IEEE * T2.W, PS, literal.z,
-; R600-NEXT: -127(nan), 254(3.559298e-43)
-; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T3.X, T1.X, literal.x,
-; R600-NEXT: MUL_IEEE T0.Y, PS, literal.y,
-; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Z, T0.Z,
-; R600-NEXT: CNDE_INT T3.W, PV.Y, PV.X, T0.X,
-; R600-NEXT: SETGT_INT * T4.W, T0.Z, literal.z,
-; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31)
-; R600-NEXT: 127(1.779649e-43), 0(0.000000e+00)
-; R600-NEXT: AND_INT T2.Y, KC0[4].X, literal.x,
-; R600-NEXT: CNDE_INT T0.Z, PS, PV.Z, PV.W,
-; R600-NEXT: CNDE_INT T0.W, T0.W, PV.Y, T2.W,
-; R600-NEXT: MUL_IEEE * T2.W, PV.X, literal.y,
-; R600-NEXT: -4096(nan), 2130706432(1.701412e+38)
-; R600-NEXT: CNDE_INT T0.X, T1.Y, T3.X, PS,
-; R600-NEXT: CNDE_INT T0.Y, T1.W, PV.W, T1.X,
-; R600-NEXT: LSHL T0.Z, PV.Z, literal.x,
-; R600-NEXT: ADD T0.W, KC0[4].X, -PV.Y,
-; R600-NEXT: MUL_IEEE * T1.W, PV.Y, literal.y,
-; R600-NEXT: 23(3.222986e-44), 1079283712(3.321289e+00)
-; R600-NEXT: RNDNE T1.Y, PS,
-; R600-NEXT: MUL_IEEE T1.Z, PV.W, literal.x,
-; R600-NEXT: ADD_INT T2.W, PV.Z, literal.y,
-; R600-NEXT: CNDE_INT * T3.W, T4.W, PV.Y, PV.X,
-; R600-NEXT: 975668412(6.390323e-04), 1065353216(1.000000e+00)
-; R600-NEXT: MUL_IEEE T0.Y, PS, PV.W,
-; R600-NEXT: AND_INT T0.Z, KC0[3].W, literal.x,
-; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.y, PV.Z,
-; R600-NEXT: TRUNC * T2.W, PV.Y,
-; R600-NEXT: -4096(nan), 1079283712(3.321289e+00)
-; R600-NEXT: SETGT T0.X, literal.x, KC0[3].Z,
-; R600-NEXT: FLT_TO_INT T3.Y, PS,
-; R600-NEXT: MULADD_IEEE T1.Z, T2.Y, literal.y, PV.W,
-; R600-NEXT: ADD T0.W, T1.W, -T1.Y,
-; R600-NEXT: MUL_IEEE * T1.W, PV.Z, literal.z,
-; R600-NEXT: -1036817932(-4.485347e+01), 975668412(6.390323e-04)
-; R600-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
-; R600-NEXT: RNDNE T1.X, PS,
-; R600-NEXT: AND_INT T1.Y, KC0[3].Y, literal.x,
-; R600-NEXT: ADD T1.Z, PV.W, PV.Z,
-; R600-NEXT: MAX_INT T0.W, PV.Y, literal.y,
-; R600-NEXT: MIN_INT * T2.W, PV.Y, literal.z,
-; R600-NEXT: -4096(nan), -330(nan)
+; R600-NEXT: ADD T0.W, PS, PV.W,
+; R600-NEXT: TRUNC * T1.W, T4.W,
+; R600-NEXT: FLT_TO_INT T1.W, PS,
+; R600-NEXT: EXP_IEEE * T0.X, PV.W,
+; R600-NEXT: MUL_IEEE T0.Z, PS, literal.x,
+; R600-NEXT: MAX_INT T0.W, PV.W, literal.y,
+; R600-NEXT: MIN_INT * T2.W, PV.W, literal.z,
+; R600-NEXT: 209715200(1.972152e-31), -330(nan)
; R600-NEXT: 381(5.338947e-43), 0(0.000000e+00)
-; R600-NEXT: ADD_INT T2.X, PS, literal.x,
-; R600-NEXT: ADD_INT T2.Y, PV.W, literal.y,
-; R600-NEXT: ADD_INT T2.Z, T3.Y, literal.z,
-; R600-NEXT: SETGT_UINT T0.W, T3.Y, literal.w,
-; R600-NEXT: EXP_IEEE * T1.Z, PV.Z,
-; R600-NEXT: -254(nan), 204(2.858649e-43)
-; R600-NEXT: 102(1.429324e-43), -229(nan)
-; R600-NEXT: ADD_INT T3.X, T3.Y, literal.x,
-; R600-NEXT: SETGT_UINT T4.Y, T3.Y, literal.y,
-; R600-NEXT: CNDE_INT T2.Z, PV.W, PV.Y, PV.Z,
-; R600-NEXT: SETGT_INT T2.W, T3.Y, literal.x,
-; R600-NEXT: MUL_IEEE * T3.W, PS, literal.z,
+; R600-NEXT: ADD_INT T1.X, PS, literal.x,
+; R600-NEXT: AND_INT T0.Y, KC0[4].X, literal.y,
+; R600-NEXT: ADD_INT T1.Z, PV.W, literal.z,
+; R600-NEXT: ADD_INT * T0.W, T1.W, literal.w,
+; R600-NEXT: -254(nan), -4096(nan)
+; R600-NEXT: 204(2.858649e-43), 102(1.429324e-43)
+; R600-NEXT: SETGT_UINT * T2.W, T1.W, literal.x,
+; R600-NEXT: -229(nan), 0(0.000000e+00)
+; R600-NEXT: ADD_INT T2.X, T1.W, literal.x,
+; R600-NEXT: SETGT_UINT T1.Y, T1.W, literal.y,
+; R600-NEXT: CNDE_INT T1.Z, PV.W, T1.Z, T0.W,
+; R600-NEXT: SETGT_INT T0.W, T1.W, literal.x,
+; R600-NEXT: ADD * T3.W, KC0[4].X, -T0.Y,
; R600-NEXT: -127(nan), 254(3.559298e-43)
-; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T4.X, T1.Z, literal.x,
-; R600-NEXT: MUL_IEEE T2.Y, PS, literal.y,
-; R600-NEXT: CNDE_INT T2.Z, PV.W, PV.Z, T3.Y,
-; R600-NEXT: CNDE_INT T4.W, PV.Y, PV.X, T2.X,
-; R600-NEXT: SETGT_INT * T5.W, T3.Y, literal.z,
-; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31)
+; R600-NEXT: MUL_IEEE T3.X, PS, literal.x,
+; R600-NEXT: MUL_IEEE T2.Y, T0.Y, literal.y,
+; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Z, T1.W,
+; R600-NEXT: CNDE_INT T4.W, PV.Y, PV.X, T1.X,
+; R600-NEXT: SETGT_INT * T1.W, T1.W, literal.z,
+; R600-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00)
; R600-NEXT: 127(1.779649e-43), 0(0.000000e+00)
-; R600-NEXT: ADD T2.X, KC0[3].W, -T0.Z,
-; R600-NEXT: CNDE_INT T3.Y, PS, PV.Z, PV.W,
-; R600-NEXT: CNDE_INT * T2.Z, T0.W, PV.Y, T3.W,
-; R600-NEXT: ALU clause starting at 105:
-; R600-NEXT: MUL_IEEE T0.W, T4.X, literal.x,
-; R600-NEXT: ADD * T3.W, KC0[3].Y, -T1.Y,
+; R600-NEXT: CNDE_INT T1.X, PS, PV.Z, PV.W,
+; R600-NEXT: RNDNE T3.Y, PV.Y,
+; R600-NEXT: MULADD_IEEE T1.Z, T3.W, literal.x, PV.X,
+; R600-NEXT: MUL_IEEE T3.W, T0.Z, literal.y,
+; R600-NEXT: MUL_IEEE * T4.W, T0.X, literal.z,
+; R600-NEXT: 1079283712(3.321289e+00), 209715200(1.972152e-31)
; R600-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
+; R600-NEXT: MUL_IEEE T2.X, PS, literal.x,
+; R600-NEXT: CNDE_INT T4.Y, T2.W, PV.W, T0.Z,
+; R600-NEXT: MULADD_IEEE T0.Z, T0.Y, literal.y, PV.Z,
+; R600-NEXT: ADD T2.W, T2.Y, -PV.Y, BS:VEC_120/SCL_212
+; R600-NEXT: AND_INT * T3.W, KC0[3].Y, literal.z,
+; R600-NEXT: 2130706432(1.701412e+38), 975668412(6.390323e-04)
+; R600-NEXT: -4096(nan), 0(0.000000e+00)
; R600-NEXT: MUL_IEEE T3.X, PS, literal.x,
-; R600-NEXT: MUL_IEEE T2.Y, T1.Y, literal.y,
-; R600-NEXT: CNDE_INT T3.Z, T4.Y, T4.X, PV.W, BS:VEC_120/SCL_212
-; R600-NEXT: CNDE_INT T0.W, T2.W, T2.Z, T1.Z,
-; R600-NEXT: LSHL * T2.W, T3.Y, literal.z,
-; R600-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00)
-; R600-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; R600-NEXT: ADD_INT T4.X, PS, literal.x,
-; R600-NEXT: CNDE_INT T3.Y, T5.W, PV.W, PV.Z,
-; R600-NEXT: RNDNE T1.Z, PV.Y,
-; R600-NEXT: MULADD_IEEE T0.W, T3.W, literal.y, PV.X, BS:VEC_120/SCL_212
-; R600-NEXT: MUL_IEEE * T2.W, T2.X, literal.z,
+; R600-NEXT: ADD T0.Y, PV.W, PV.Z,
+; R600-NEXT: CNDE_INT T0.Z, T0.W, PV.Y, T0.X, BS:VEC_021/SCL_122
+; R600-NEXT: CNDE_INT T0.W, T1.Y, T4.W, PV.X,
+; R600-NEXT: LSHL * T2.W, T1.X, literal.y,
+; R600-NEXT: 1079283712(3.321289e+00), 23(3.222986e-44)
+; R600-NEXT: AND_INT T0.X, KC0[3].W, literal.x,
+; R600-NEXT: TRUNC T1.Y, T3.Y,
+; R600-NEXT: ADD_INT T1.Z, PS, literal.y,
+; R600-NEXT: CNDE_INT T0.W, T1.W, PV.Z, PV.W,
+; R600-NEXT: EXP_IEEE * T0.Y, PV.Y,
+; R600-NEXT: -4096(nan), 1065353216(1.000000e+00)
+; R600-NEXT: MUL_IEEE T1.X, PV.W, PV.Z,
+; R600-NEXT: FLT_TO_INT T1.Y, PV.Y,
+; R600-NEXT: MUL_IEEE T0.Z, PS, literal.x,
+; R600-NEXT: ADD T0.W, KC0[3].W, -PV.X,
+; R600-NEXT: RNDNE * T1.W, T3.X,
+; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
+; R600-NEXT: SETGT T2.X, literal.x, KC0[3].Z,
+; R600-NEXT: TRUNC T2.Y, PS,
+; R600-NEXT: MUL_IEEE T1.Z, PV.W, literal.y,
+; R600-NEXT: MUL_IEEE T2.W, PV.Z, literal.z,
+; R600-NEXT: MAX_INT * T4.W, PV.Y, literal.w,
+; R600-NEXT: -1036817932(-4.485347e+01), 975668412(6.390323e-04)
+; R600-NEXT: 209715200(1.972152e-31), -330(nan)
+; R600-NEXT: ADD T4.X, KC0[3].Y, -T3.W,
+; R600-NEXT: ADD_INT T3.Y, PS, literal.x,
+; R600-NEXT: ADD_INT T2.Z, T1.Y, literal.y,
+; R600-NEXT: SETGT_UINT T4.W, T1.Y, literal.z,
+; R600-NEXT: MIN_INT * T5.W, T1.Y, literal.w,
+; R600-NEXT: 204(2.858649e-43), 102(1.429324e-43)
+; R600-NEXT: -229(nan), 381(5.338947e-43)
+; R600-NEXT: ADD_INT T5.X, PS, literal.x,
+; R600-NEXT: ADD_INT T4.Y, T1.Y, literal.y,
+; R600-NEXT: SETGT_UINT T3.Z, T1.Y, literal.z,
+; R600-NEXT: CNDE_INT T5.W, PV.W, PV.Y, PV.Z,
+; R600-NEXT: SETGT_INT * T6.W, T1.Y, literal.y,
+; R600-NEXT: -254(nan), -127(nan)
+; R600-NEXT: 254(3.559298e-43), 0(0.000000e+00)
+; R600-NEXT: MUL_IEEE T6.X, T0.Y, literal.x,
+; R600-NEXT: CNDE_INT T3.Y, PS, PV.W, T1.Y,
+; R600-NEXT: CNDE_INT * T2.Z, PV.Z, PV.Y, PV.X,
+; R600-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
+; R600-NEXT: ALU clause starting at 105:
+; R600-NEXT: SETGT_INT T5.W, T1.Y, literal.x,
+; R600-NEXT: MUL_IEEE * T7.W, T4.X, literal.y,
+; R600-NEXT: 127(1.779649e-43), 975668412(6.390323e-04)
+; R600-NEXT: MUL_IEEE T5.X, T0.X, literal.x,
+; R600-NEXT: MULADD_IEEE T1.Y, T4.X, literal.x, PS, BS:VEC_120/SCL_212
+; R600-NEXT: CNDE_INT T2.Z, PV.W, T3.Y, T2.Z,
+; R600-NEXT: MUL_IEEE T7.W, T6.X, literal.y, BS:VEC_201
+; R600-NEXT: CNDE_INT * T2.W, T4.W, T2.W, T0.Z,
+; R600-NEXT: 1079283712(3.321289e+00), 2130706432(1.701412e+38)
+; R600-NEXT: CNDE_INT T4.X, T6.W, PS, T0.Y,
+; R600-NEXT: CNDE_INT T0.Y, T3.Z, T6.X, PV.W,
+; R600-NEXT: LSHL T0.Z, PV.Z, literal.x,
+; R600-NEXT: MULADD_IEEE T2.W, T3.W, literal.y, PV.Y, BS:VEC_201
+; R600-NEXT: ADD * T1.W, T3.X, -T1.W,
+; R600-NEXT: 23(3.222986e-44), 975668412(6.390323e-04)
+; R600-NEXT: ADD T3.X, PS, PV.W,
+; R600-NEXT: ADD_INT T1.Y, PV.Z, literal.x,
+; R600-NEXT: CNDE_INT T0.Z, T5.W, PV.X, PV.Y,
+; R600-NEXT: RNDNE T1.W, T5.X,
+; R600-NEXT: MULADD_IEEE * T0.W, T0.W, literal.y, T1.Z, BS:VEC_021/SCL_122
; R600-NEXT: 1065353216(1.000000e+00), 1079283712(3.321289e+00)
-; R600-NEXT: 975668412(6.390323e-04), 0(0.000000e+00)
-; R600-NEXT: MULADD_IEEE T2.X, T2.X, literal.x, PS,
-; R600-NEXT: MULADD_IEEE T1.Y, T1.Y, literal.y, PV.W,
-; R600-NEXT: ADD T2.Z, T2.Y, -PV.Z, BS:VEC_120/SCL_212
-; R600-NEXT: MUL_IEEE T0.W, PV.Y, PV.X,
-; R600-NEXT: SETGT * T2.W, literal.z, KC0[4].X,
-; R600-NEXT: 1079283712(3.321289e+00), 975668412(6.390323e-04)
-; R600-NEXT: -1036817932(-4.485347e+01), 0(0.000000e+00)
-; R600-NEXT: CNDE T3.X, PS, PV.W, 0.0,
-; R600-NEXT: ADD T1.Y, PV.Z, PV.Y,
-; R600-NEXT: TRUNC T1.Z, T1.Z,
-; R600-NEXT: MULADD_IEEE T0.W, T0.Z, literal.x, PV.X, BS:VEC_120/SCL_212
-; R600-NEXT: ADD * T1.W, T1.W, -T1.X,
-; R600-NEXT: 975668412(6.390323e-04), 0(0.000000e+00)
-; R600-NEXT: SETGT T2.X, KC0[4].X, literal.x,
-; R600-NEXT: ADD T2.Y, PS, PV.W,
-; R600-NEXT: FLT_TO_INT T0.Z, PV.Z,
-; R600-NEXT: TRUNC T0.W, T1.X,
-; R600-NEXT: EXP_IEEE * T1.X, PV.Y,
-; R600-NEXT: 1109008539(3.853184e+01), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T4.X, PS, literal.x,
-; R600-NEXT: FLT_TO_INT T1.Y, PV.W,
-; R600-NEXT: MAX_INT T1.Z, PV.Z, literal.y,
-; R600-NEXT: MUL_IEEE T0.W, PS, literal.z,
-; R600-NEXT: EXP_IEEE * T1.W, PV.Y,
-; R600-NEXT: 2130706432(1.701412e+38), -330(nan)
+; R600-NEXT: MULADD_IEEE T0.X, T0.X, literal.x, PS,
+; R600-NEXT: ADD T0.Y, T5.X, -PV.W, BS:VEC_120/SCL_212
+; R600-NEXT: MUL_IEEE T0.Z, PV.Z, PV.Y,
+; R600-NEXT: SETGT T0.W, literal.y, KC0[4].X,
+; R600-NEXT: EXP_IEEE * T1.Y, PV.X,
+; R600-NEXT: 975668412(6.390323e-04), -1036817932(-4.485347e+01)
+; R600-NEXT: CNDE T3.X, PV.W, PV.Z, 0.0,
+; R600-NEXT: ADD T0.Y, PV.Y, PV.X,
+; R600-NEXT: FLT_TO_INT T0.Z, T2.Y,
+; R600-NEXT: TRUNC T0.W, T1.W,
+; R600-NEXT: MUL_IEEE * T1.W, PS, literal.x,
; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T5.X, PV.W, literal.x,
-; R600-NEXT: MUL_IEEE T2.Y, PS, literal.x,
-; R600-NEXT: ADD_INT T1.Z, PV.Z, literal.y,
-; R600-NEXT: ADD_INT T2.W, T0.Z, literal.z,
-; R600-NEXT: MAX_INT * T3.W, PV.Y, literal.w,
-; R600-NEXT: 209715200(1.972152e-31), 204(2.858649e-43)
-; R600-NEXT: 102(1.429324e-43), -330(nan)
-; R600-NEXT: SETGT_UINT T6.X, T0.Z, literal.x,
-; R600-NEXT: ADD_INT T3.Y, PS, literal.y,
-; R600-NEXT: ADD_INT T2.Z, T1.Y, literal.z,
-; R600-NEXT: SETGT_UINT T3.W, T1.Y, literal.x,
-; R600-NEXT: MIN_INT * T4.W, T1.Y, literal.w,
+; R600-NEXT: SETGT T0.X, KC0[4].X, literal.x,
+; R600-NEXT: MUL_IEEE T2.Y, PS, literal.y,
+; R600-NEXT: FLT_TO_INT T1.Z, PV.W,
+; R600-NEXT: MAX_INT T0.W, PV.Z, literal.z,
+; R600-NEXT: EXP_IEEE * T0.Y, PV.Y,
+; R600-NEXT: 1109008539(3.853184e+01), 209715200(1.972152e-31)
+; R600-NEXT: -330(nan), 0(0.000000e+00)
+; R600-NEXT: MUL_IEEE T4.X, T1.Y, literal.x,
+; R600-NEXT: MUL_IEEE T3.Y, PS, literal.y,
+; R600-NEXT: ADD_INT T2.Z, PV.W, literal.z,
+; R600-NEXT: ADD_INT * T0.W, T0.Z, literal.w,
+; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31)
+; R600-NEXT: 204(2.858649e-43), 102(1.429324e-43)
+; R600-NEXT: MAX_INT * T2.W, T1.Z, literal.x,
+; R600-NEXT: -330(nan), 0(0.000000e+00)
+; R600-NEXT: SETGT_UINT T5.X, T0.Z, literal.x,
+; R600-NEXT: ADD_INT T4.Y, PV.W, literal.y,
+; R600-NEXT: ADD_INT T3.Z, T1.Z, literal.z, BS:VEC_120/SCL_212
+; R600-NEXT: SETGT_UINT T2.W, T1.Z, literal.x, BS:VEC_120/SCL_212
+; R600-NEXT: MIN_INT * T3.W, T1.Z, literal.w,
; R600-NEXT: -229(nan), 204(2.858649e-43)
; R600-NEXT: 102(1.429324e-43), 381(5.338947e-43)
-; R600-NEXT: ADD_INT T7.X, PS, literal.x,
-; R600-NEXT: ADD_INT T4.Y, T1.Y, literal.y,
-; R600-NEXT: SETGT_UINT T3.Z, T1.Y, literal.z,
-; R600-NEXT: CNDE_INT T4.W, PV.W, PV.Y, PV.Z,
-; R600-NEXT: SETGT_INT * T5.W, T1.Y, literal.y,
+; R600-NEXT: ADD_INT T6.X, PS, literal.x,
+; R600-NEXT: ADD_INT T5.Y, T1.Z, literal.y,
+; R600-NEXT: SETGT_UINT T4.Z, T1.Z, literal.z,
+; R600-NEXT: CNDE_INT T3.W, PV.W, PV.Y, PV.Z,
+; R600-NEXT: SETGT_INT * T4.W, T1.Z, literal.y,
; R600-NEXT: -254(nan), -127(nan)
; R600-NEXT: 254(3.559298e-43), 0(0.000000e+00)
-; R600-NEXT: CNDE_INT T8.X, PS, PV.W, T1.Y,
-; R600-NEXT: CNDE_INT T3.Y, PV.Z, PV.Y, PV.X,
-; R600-NEXT: SETGT_INT T2.Z, T1.Y, literal.x,
-; R600-NEXT: CNDE_INT T2.W, T6.X, T1.Z, T2.W,
-; R600-NEXT: SETGT_INT * T4.W, T0.Z, literal.y,
+; R600-NEXT: CNDE_INT T7.X, PS, PV.W, T1.Z, BS:VEC_021/SCL_122
+; R600-NEXT: CNDE_INT T4.Y, PV.Z, PV.Y, PV.X,
+; R600-NEXT: SETGT_INT T1.Z, T1.Z, literal.x, BS:VEC_120/SCL_212
+; R600-NEXT: CNDE_INT T0.W, T5.X, T2.Z, T0.W, BS:VEC_102/SCL_221
+; R600-NEXT: SETGT_INT * T3.W, T0.Z, literal.y,
; R600-NEXT: 127(1.779649e-43), -127(nan)
-; R600-NEXT: CNDE_INT T7.X, PS, PV.W, T0.Z,
-; R600-NEXT: CNDE_INT T1.Y, PV.Z, PV.X, PV.Y,
-; R600-NEXT: MIN_INT T1.Z, T0.Z, literal.x,
-; R600-NEXT: MUL_IEEE T2.W, T1.W, literal.y,
-; R600-NEXT: MUL_IEEE * T6.W, T2.Y, literal.z,
-; R600-NEXT: 381(5.338947e-43), 2130706432(1.701412e+38)
-; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; R600-NEXT: CNDE_INT T8.X, T3.W, PS, T2.Y,
-; R600-NEXT: MUL_IEEE T2.Y, PV.W, literal.x,
-; R600-NEXT: ADD_INT T1.Z, PV.Z, literal.y,
-; R600-NEXT: ADD_INT T3.W, T0.Z, literal.z,
-; R600-NEXT: SETGT_UINT * T6.W, T0.Z, literal.w,
+; R600-NEXT: CNDE_INT T6.X, PS, PV.W, T0.Z,
+; R600-NEXT: CNDE_INT T4.Y, PV.Z, PV.X, PV.Y,
+; R600-NEXT: MIN_INT T2.Z, T0.Z, literal.x,
+; R600-NEXT: MUL_IEEE T0.W, T3.Y, literal.y,
+; R600-NEXT: MUL_IEEE * T5.W, T0.Y, literal.z,
+; R600-NEXT: 381(5.338947e-43), 209715200(1.972152e-31)
+; R600-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
+; R600-NEXT: MUL_IEEE T7.X, PS, literal.x,
+; R600-NEXT: CNDE_INT T3.Y, T2.W, PV.W, T3.Y,
+; R600-NEXT: ADD_INT T2.Z, PV.Z, literal.y,
+; R600-NEXT: ADD_INT T0.W, T0.Z, literal.z,
+; R600-NEXT: SETGT_UINT * T2.W, T0.Z, literal.w,
; R600-NEXT: 2130706432(1.701412e+38), -254(nan)
; R600-NEXT: -127(nan), 254(3.559298e-43)
-; R600-NEXT: CNDE_INT T9.X, PS, PV.W, PV.Z,
-; R600-NEXT: SETGT_INT T3.Y, T0.Z, literal.x,
-; R600-NEXT: CNDE_INT T0.Z, T3.Z, T2.W, PV.Y, BS:VEC_120/SCL_212
-; R600-NEXT: CNDE_INT T1.W, T5.W, PV.X, T1.W, BS:VEC_021/SCL_122
-; R600-NEXT: LSHL * T2.W, T1.Y, literal.y,
+; R600-NEXT: CNDE_INT T8.X, PS, PV.W, PV.Z,
+; R600-NEXT: SETGT_INT T5.Y, T0.Z, literal.x,
+; R600-NEXT: CNDE_INT T0.Z, T4.W, PV.Y, T0.Y, BS:VEC_021/SCL_122
+; R600-NEXT: CNDE_INT T0.W, T4.Z, T5.W, PV.X, BS:VEC_120/SCL_212
+; R600-NEXT: LSHL * T4.W, T4.Y, literal.y,
; R600-NEXT: 127(1.779649e-43), 23(3.222986e-44)
-; R600-NEXT: ADD_INT T8.X, PS, literal.x,
-; R600-NEXT: CNDE_INT T1.Y, T2.Z, PV.W, PV.Z,
-; R600-NEXT: CNDE_INT T0.Z, PV.Y, T7.X, PV.X,
-; R600-NEXT: CNDE_INT * T0.W, T6.X, T5.X, T0.W, BS:VEC_021/SCL_122
-; R600-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE * T1.W, T4.X, literal.x,
-; R600-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; R600-NEXT: CNDE_INT T4.X, T6.W, T4.X, PV.W,
-; R600-NEXT: CNDE_INT * T2.Y, T4.W, T0.W, T1.X, BS:VEC_120/SCL_212
-; R600-NEXT: ALU clause starting at 204:
+; R600-NEXT: ADD_INT T7.X, PS, literal.x,
+; R600-NEXT: CNDE_INT T0.Y, T1.Z, PV.Z, PV.W,
+; R600-NEXT: CNDE_INT T0.Z, PV.Y, T6.X, PV.X,
+; R600-NEXT: MUL_IEEE T0.W, T4.X, literal.y,
+; R600-NEXT: CNDE_INT * T1.W, T5.X, T2.Y, T1.W,
+; R600-NEXT: 1065353216(1.000000e+00), 2130706432(1.701412e+38)
+; R600-NEXT: CNDE_INT T5.X, T3.W, PS, T1.Y,
+; R600-NEXT: CNDE_INT * T1.Y, T2.W, T4.X, PV.W, BS:VEC_120/SCL_212
+; R600-NEXT: ALU clause starting at 201:
; R600-NEXT: LSHL T0.Z, T0.Z, literal.x,
-; R600-NEXT: MUL_IEEE T0.W, T1.Y, T8.X,
+; R600-NEXT: MUL_IEEE T0.W, T0.Y, T7.X,
; R600-NEXT: SETGT * T1.W, literal.y, KC0[3].W,
; R600-NEXT: 23(3.222986e-44), -1036817932(-4.485347e+01)
-; R600-NEXT: CNDE T1.X, PS, PV.W, 0.0,
-; R600-NEXT: SETGT T1.Y, KC0[3].W, literal.x,
+; R600-NEXT: CNDE T4.X, PS, PV.W, 0.0,
+; R600-NEXT: SETGT T0.Y, KC0[3].W, literal.x,
; R600-NEXT: ADD_INT T0.Z, PV.Z, literal.y,
-; R600-NEXT: CNDE_INT T0.W, T3.Y, T2.Y, T4.X, BS:VEC_120/SCL_212
-; R600-NEXT: CNDE * T1.W, T2.X, T3.X, literal.z,
+; R600-NEXT: CNDE_INT T0.W, T5.Y, T5.X, T1.Y, BS:VEC_102/SCL_221
+; R600-NEXT: CNDE * T1.W, T0.X, T3.X, literal.z,
; R600-NEXT: 1109008539(3.853184e+01), 1065353216(1.000000e+00)
; R600-NEXT: 2139095040(INF), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T2.X, PV.W, PV.Z,
+; R600-NEXT: MUL_IEEE T0.X, PV.W, PV.Z,
; R600-NEXT: SETGT T2.Y, literal.x, KC0[3].Y,
; R600-NEXT: CNDE T1.Z, PV.Y, PV.X, literal.y,
-; R600-NEXT: CNDE T0.W, T0.X, T0.Y, 0.0,
+; R600-NEXT: CNDE T0.W, T2.X, T1.X, 0.0,
; R600-NEXT: SETGT * T2.W, KC0[3].Z, literal.z,
; R600-NEXT: -1036817932(-4.485347e+01), 2139095040(INF)
; R600-NEXT: 1109008539(3.853184e+01), 0(0.000000e+00)
@@ -2287,8 +2275,8 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; CM-LABEL: s_exp10_v4f32:
; CM: ; %bb.0:
; CM-NEXT: ALU 97, @6, KC0[CB0:0-32], KC1[]
-; CM-NEXT: ALU 100, @104, KC0[CB0:0-32], KC1[]
-; CM-NEXT: ALU 36, @205, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 97, @104, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 35, @202, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
@@ -2307,224 +2295,220 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; CM-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
; CM-NEXT: MULADD_IEEE T0.X, T0.W, literal.x, PV.W,
; CM-NEXT: ADD T0.Y, T0.Z, -PV.Z,
-; CM-NEXT: MUL_IEEE T0.Z, PV.Y, literal.x,
-; CM-NEXT: MUL_IEEE * T0.W, T2.W, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT: MUL_IEEE T0.Z, T2.W, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT: MUL_IEEE * T0.W, PV.Y, literal.x,
; CM-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00)
; CM-NEXT: TRUNC T1.X, T1.Z,
-; CM-NEXT: RNDNE T2.Y, PV.W,
-; CM-NEXT: MULADD_IEEE T0.Z, T1.Y, literal.x, PV.Z,
-; CM-NEXT: ADD * T1.W, PV.Y, PV.X,
+; CM-NEXT: MULADD_IEEE T1.Y, T1.Y, literal.x, PV.W,
+; CM-NEXT: RNDNE T1.Z, PV.Z,
+; CM-NEXT: ADD * T0.W, PV.Y, PV.X,
; CM-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
+; CM-NEXT: EXP_IEEE T0.X, T0.W,
+; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W,
+; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W,
+; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W,
+; CM-NEXT: TRUNC T2.X, T1.Z,
+; CM-NEXT: MULADD_IEEE T0.Y, T2.W, literal.x, T1.Y,
+; CM-NEXT: FLT_TO_INT T2.Z, T1.X,
+; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.y,
+; CM-NEXT: 975668412(6.390323e-04), 209715200(1.972152e-31)
+; CM-NEXT: ADD T1.X, T0.Z, -T1.Z,
+; CM-NEXT: MUL_IEEE T1.Y, PV.W, literal.x,
+; CM-NEXT: MAX_INT T0.Z, PV.Z, literal.y,
+; CM-NEXT: MIN_INT * T1.W, PV.Z, literal.z,
+; CM-NEXT: 209715200(1.972152e-31), -330(nan)
+; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00)
+; CM-NEXT: ADD_INT T3.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.y,
+; CM-NEXT: ADD_INT T0.Z, T2.Z, literal.z,
+; CM-NEXT: SETGT_UINT * T1.W, T2.Z, literal.w,
+; CM-NEXT: -254(nan), 204(2.858649e-43)
+; CM-NEXT: 102(1.429324e-43), -229(nan)
+; CM-NEXT: ADD_INT T4.X, T2.Z, literal.x,
+; CM-NEXT: SETGT_UINT T3.Y, T2.Z, literal.y,
+; CM-NEXT: CNDE_INT T0.Z, PV.W, PV.Y, PV.Z,
+; CM-NEXT: SETGT_INT * T2.W, T2.Z, literal.x,
+; CM-NEXT: -127(nan), 254(3.559298e-43)
+; CM-NEXT: MUL_IEEE T5.X, T0.X, literal.x,
+; CM-NEXT: CNDE_INT T2.Y, PV.W, PV.Z, T2.Z,
+; CM-NEXT: CNDE_INT T0.Z, PV.Y, PV.X, T3.X,
+; CM-NEXT: SETGT_INT * T3.W, T2.Z, literal.y,
+; CM-NEXT: 2130706432(1.701412e+38), 127(1.779649e-43)
+; CM-NEXT: AND_INT T3.X, KC0[3].Z, literal.x,
+; CM-NEXT: CNDE_INT T2.Y, PV.W, PV.Y, PV.Z,
+; CM-NEXT: MUL_IEEE T0.Z, PV.X, literal.y,
+; CM-NEXT: CNDE_INT * T0.W, T1.W, T1.Y, T0.W,
+; CM-NEXT: -4096(nan), 2130706432(1.701412e+38)
+; CM-NEXT: CNDE_INT T0.X, T2.W, PV.W, T0.X,
+; CM-NEXT: CNDE_INT T1.Y, T3.Y, T5.X, PV.Z,
+; CM-NEXT: LSHL T0.Z, PV.Y, literal.x,
+; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.y,
+; CM-NEXT: 23(3.222986e-44), 1079283712(3.321289e+00)
+; CM-NEXT: RNDNE T4.X, PV.W,
+; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.x,
+; CM-NEXT: CNDE_INT T0.Z, T3.W, PV.X, PV.Y,
+; CM-NEXT: ADD * T1.W, T1.X, T0.Y,
+; CM-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00)
; CM-NEXT: EXP_IEEE T0.X, T1.W,
; CM-NEXT: EXP_IEEE T0.Y (MASKED), T1.W,
; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W,
; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W,
-; CM-NEXT: MULADD_IEEE T2.X, T2.W, literal.x, T0.Z,
-; CM-NEXT: ADD T0.Y, T0.W, -T2.Y, BS:VEC_120/SCL_212
-; CM-NEXT: FLT_TO_INT T0.Z, T1.X,
-; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.y,
-; CM-NEXT: 975668412(6.390323e-04), 209715200(1.972152e-31)
-; CM-NEXT: MUL_IEEE T1.X, PV.W, literal.x,
+; CM-NEXT: MUL_IEEE T1.X, T0.Z, T2.Y,
+; CM-NEXT: TRUNC T0.Y, T4.X,
+; CM-NEXT: FLT_TO_INT T0.Z, T2.X, BS:VEC_120/SCL_212
+; CM-NEXT: MUL_IEEE * T1.W, PV.X, literal.x,
+; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
+; CM-NEXT: MUL_IEEE T2.X, PV.W, literal.x,
; CM-NEXT: MUL_IEEE T1.Y, T0.X, literal.y,
; CM-NEXT: MAX_INT T1.Z, PV.Z, literal.z,
-; CM-NEXT: MIN_INT * T1.W, PV.Z, literal.w,
+; CM-NEXT: MIN_INT * T2.W, PV.Z, literal.w,
; CM-NEXT: 209715200(1.972152e-31), 2130706432(1.701412e+38)
; CM-NEXT: -330(nan), 381(5.338947e-43)
-; CM-NEXT: ADD_INT T3.X, PV.W, literal.x,
-; CM-NEXT: ADD_INT T3.Y, PV.Z, literal.y,
+; CM-NEXT: ADD_INT T5.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.y,
; CM-NEXT: ADD_INT T1.Z, T0.Z, literal.z,
-; CM-NEXT: SETGT_UINT * T1.W, T0.Z, literal.w,
+; CM-NEXT: SETGT_UINT * T2.W, T0.Z, literal.w,
; CM-NEXT: -254(nan), 204(2.858649e-43)
; CM-NEXT: 102(1.429324e-43), -229(nan)
-; CM-NEXT: ADD_INT T4.X, T0.Z, literal.x,
-; CM-NEXT: SETGT_UINT T4.Y, T0.Z, literal.y,
+; CM-NEXT: ADD_INT T6.X, T0.Z, literal.x,
+; CM-NEXT: SETGT_UINT T3.Y, T0.Z, literal.y,
; CM-NEXT: CNDE_INT T1.Z, PV.W, PV.Y, PV.Z,
-; CM-NEXT: SETGT_INT * T2.W, T0.Z, literal.x,
+; CM-NEXT: SETGT_INT * T3.W, T0.Z, literal.x,
; CM-NEXT: -127(nan), 254(3.559298e-43)
-; CM-NEXT: CNDE_INT T5.X, PV.W, PV.Z, T0.Z,
-; CM-NEXT: CNDE_INT T3.Y, PV.Y, PV.X, T3.X,
-; CM-NEXT: SETGT_INT T0.Z, T0.Z, literal.x,
-; CM-NEXT: MUL_IEEE * T3.W, T1.Y, literal.y,
-; CM-NEXT: 127(1.779649e-43), 2130706432(1.701412e+38)
-; CM-NEXT: CNDE_INT T3.X, T4.Y, T1.Y, PV.W,
-; CM-NEXT: AND_INT T1.Y, KC0[3].Z, literal.x,
-; CM-NEXT: CNDE_INT T1.Z, PV.Z, PV.X, PV.Y,
-; CM-NEXT: CNDE_INT * T0.W, T1.W, T1.X, T0.W,
-; CM-NEXT: -4096(nan), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T0.X, T2.W, PV.W, T0.X,
-; CM-NEXT: LSHL T3.Y, PV.Z, literal.x,
-; CM-NEXT: TRUNC T1.Z, T2.Y,
-; CM-NEXT: ADD * T0.W, KC0[3].Z, -PV.Y,
-; CM-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T1.X, PV.W, literal.x,
-; CM-NEXT: FLT_TO_INT T2.Y, PV.Z,
-; CM-NEXT: ADD_INT T1.Z, PV.Y, literal.y,
-; CM-NEXT: CNDE_INT * T1.W, T0.Z, PV.X, T3.X,
-; CM-NEXT: 975668412(6.390323e-04), 1065353216(1.000000e+00)
-; CM-NEXT: MUL_IEEE T0.X, PV.W, PV.Z,
-; CM-NEXT: MIN_INT T3.Y, PV.Y, literal.x,
-; CM-NEXT: MULADD_IEEE T0.Z, T0.W, literal.y, PV.X,
-; CM-NEXT: ADD * T0.W, T0.Y, T2.X,
-; CM-NEXT: 381(5.338947e-43), 1079283712(3.321289e+00)
-; CM-NEXT: EXP_IEEE T0.X (MASKED), T0.W,
-; CM-NEXT: EXP_IEEE T0.Y, T0.W,
-; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W,
-; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W,
-; CM-NEXT: MULADD_IEEE T1.X, T1.Y, literal.x, T0.Z,
-; CM-NEXT: MUL_IEEE T4.Y, PV.Y, literal.y,
-; CM-NEXT: ADD_INT T0.Z, T3.Y, literal.z, BS:VEC_120/SCL_212
-; CM-NEXT: MAX_INT * T0.W, T2.Y, literal.w, BS:VEC_201
-; CM-NEXT: 975668412(6.390323e-04), 2130706432(1.701412e+38)
-; CM-NEXT: -254(nan), -330(nan)
-; CM-NEXT: ADD_INT T2.X, T2.Y, literal.x,
-; CM-NEXT: ADD_INT T3.Y, PV.W, literal.y,
-; CM-NEXT: ADD_INT T1.Z, T2.Y, literal.z,
-; CM-NEXT: SETGT_UINT * T0.W, T2.Y, literal.w,
-; CM-NEXT: -127(nan), 204(2.858649e-43)
-; CM-NEXT: 102(1.429324e-43), -229(nan)
-; CM-NEXT: SETGT_UINT T3.X, T2.Y, literal.x,
-; CM-NEXT: CNDE_INT T3.Y, PV.W, PV.Y, PV.Z,
-; CM-NEXT: SETGT_INT T1.Z, T2.Y, literal.y,
-; CM-NEXT: MUL_IEEE * T1.W, T0.Y, literal.z, BS:VEC_120/SCL_212
-; CM-NEXT: 254(3.559298e-43), -127(nan)
-; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T4.X, PV.W, literal.x,
-; CM-NEXT: CNDE_INT * T3.Y, PV.Z, PV.Y, T2.Y,
-; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; CM-NEXT: ALU clause starting at 104:
-; CM-NEXT: CNDE_INT T0.Z, T3.X, T2.X, T0.Z,
-; CM-NEXT: SETGT_INT * T2.W, T2.Y, literal.x,
+; CM-NEXT: CNDE_INT T7.X, PV.W, PV.Z, T0.Z,
+; CM-NEXT: CNDE_INT T2.Y, PV.Y, PV.X, T5.X,
+; CM-NEXT: SETGT_INT * T0.Z, T0.Z, literal.x,
; CM-NEXT: 127(1.779649e-43), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T2.X, T1.Y, literal.x,
-; CM-NEXT: CNDE_INT T1.Y, PV.W, T3.Y, PV.Z,
-; CM-NEXT: CNDE_INT T0.Z, T0.W, T4.X, T1.W,
-; CM-NEXT: MUL_IEEE * T0.W, T4.Y, literal.y, BS:VEC_201
-; CM-NEXT: 1079283712(3.321289e+00), 2130706432(1.701412e+38)
-; CM-NEXT: AND_INT T4.X, KC0[4].X, literal.x,
-; CM-NEXT: CNDE_INT T2.Y, T3.X, T4.Y, PV.W,
-; CM-NEXT: CNDE_INT T0.Z, T1.Z, PV.Z, T0.Y,
-; CM-NEXT: LSHL * T0.W, PV.Y, literal.y,
-; CM-NEXT: -4096(nan), 23(3.222986e-44)
-; CM-NEXT: ADD_INT T3.X, PV.W, literal.x,
-; CM-NEXT: CNDE_INT T0.Y, T2.W, PV.Z, PV.Y,
-; CM-NEXT: MUL_IEEE T0.Z, PV.X, literal.y,
-; CM-NEXT: RNDNE * T0.W, T2.X,
-; CM-NEXT: 1065353216(1.000000e+00), 1079283712(3.321289e+00)
-; CM-NEXT: ADD T2.X, T2.X, -PV.W,
-; CM-NEXT: RNDNE T1.Y, PV.Z,
-; CM-NEXT: MUL_IEEE T1.Z, PV.Y, PV.X,
-; CM-NEXT: SETGT * T1.W, literal.x, KC0[3].W,
-; CM-NEXT: -1036817932(-4.485347e+01), 0(0.000000e+00)
-; CM-NEXT: CNDE T3.X, PV.W, PV.Z, 0.0,
-; CM-NEXT: TRUNC T0.Y, T0.W,
-; CM-NEXT: TRUNC T1.Z, PV.Y,
-; CM-NEXT: ADD * T0.W, PV.X, T1.X,
+; CM-NEXT: ALU clause starting at 104:
+; CM-NEXT: ADD * T4.W, KC0[3].Z, -T3.X,
+; CM-NEXT: MUL_IEEE T5.X, PV.W, literal.x,
+; CM-NEXT: CNDE_INT T2.Y, T0.Z, T7.X, T2.Y,
+; CM-NEXT: MUL_IEEE T1.Z, T1.Y, literal.y,
+; CM-NEXT: CNDE_INT * T1.W, T2.W, T2.X, T1.W, BS:VEC_021/SCL_122
+; CM-NEXT: 975668412(6.390323e-04), 2130706432(1.701412e+38)
+; CM-NEXT: CNDE_INT T0.X, T3.W, PV.W, T0.X,
+; CM-NEXT: CNDE_INT T1.Y, T3.Y, T1.Y, PV.Z,
+; CM-NEXT: LSHL T1.Z, PV.Y, literal.x,
+; CM-NEXT: MULADD_IEEE * T1.W, T4.W, literal.y, PV.X, BS:VEC_120/SCL_212
+; CM-NEXT: 23(3.222986e-44), 1079283712(3.321289e+00)
+; CM-NEXT: MULADD_IEEE T2.X, T3.X, literal.x, PV.W,
+; CM-NEXT: ADD T2.Y, T0.W, -T4.X,
+; CM-NEXT: ADD_INT T1.Z, PV.Z, literal.y,
+; CM-NEXT: CNDE_INT * T0.W, T0.Z, PV.X, PV.Y,
+; CM-NEXT: 975668412(6.390323e-04), 1065353216(1.000000e+00)
+; CM-NEXT: AND_INT T0.X, KC0[4].X, literal.x,
+; CM-NEXT: MUL_IEEE T1.Y, PV.W, PV.Z,
+; CM-NEXT: SETGT T0.Z, literal.y, KC0[3].W,
+; CM-NEXT: ADD * T0.W, PV.Y, PV.X,
+; CM-NEXT: -4096(nan), -1036817932(-4.485347e+01)
; CM-NEXT: EXP_IEEE T0.X (MASKED), T0.W,
; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W,
; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W,
; CM-NEXT: EXP_IEEE * T0.W, T0.W,
-; CM-NEXT: FLT_TO_INT T1.X, T1.Z,
-; CM-NEXT: FLT_TO_INT T0.Y, T0.Y,
-; CM-NEXT: MUL_IEEE T1.Z, PV.W, literal.x,
-; CM-NEXT: ADD * T1.W, KC0[4].X, -T4.X,
-; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T2.X, PV.W, literal.x,
-; CM-NEXT: MUL_IEEE T2.Y, T0.W, literal.y,
-; CM-NEXT: MUL_IEEE T2.Z, PV.Z, literal.z,
-; CM-NEXT: SETGT_UINT * T2.W, PV.Y, literal.w,
-; CM-NEXT: 975668412(6.390323e-04), 209715200(1.972152e-31)
-; CM-NEXT: 2130706432(1.701412e+38), 254(3.559298e-43)
-; CM-NEXT: CNDE_INT T5.X, PV.W, T1.Z, PV.Z,
-; CM-NEXT: MUL_IEEE T3.Y, PV.Y, literal.x,
-; CM-NEXT: MULADD_IEEE T1.Z, T1.W, literal.y, PV.X,
-; CM-NEXT: MAX_INT * T1.W, T1.X, literal.z,
-; CM-NEXT: 209715200(1.972152e-31), 1079283712(3.321289e+00)
-; CM-NEXT: -330(nan), 0(0.000000e+00)
-; CM-NEXT: ADD_INT T2.X, PV.W, literal.x,
-; CM-NEXT: ADD_INT T4.Y, T1.X, literal.y,
-; CM-NEXT: MULADD_IEEE T1.Z, T4.X, literal.z, PV.Z, BS:VEC_120/SCL_212
-; CM-NEXT: MAX_INT * T1.W, T0.Y, literal.w,
-; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43)
+; CM-NEXT: CNDE T2.X, T0.Z, T1.Y, 0.0,
+; CM-NEXT: ADD T1.Y, KC0[4].X, -T0.X,
+; CM-NEXT: FLT_TO_INT T0.Z, T0.Y,
+; CM-NEXT: MUL_IEEE * T1.W, PV.W, literal.x,
+; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
+; CM-NEXT: MUL_IEEE T3.X, PV.W, literal.x,
+; CM-NEXT: SETGT_UINT T0.Y, PV.Z, literal.y,
+; CM-NEXT: MUL_IEEE T1.Z, PV.Y, literal.z,
+; CM-NEXT: MUL_IEEE * T2.W, T0.X, literal.w,
+; CM-NEXT: 209715200(1.972152e-31), -229(nan)
+; CM-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00)
+; CM-NEXT: RNDNE T4.X, PV.W,
+; CM-NEXT: MULADD_IEEE T1.Y, T1.Y, literal.x, PV.Z,
+; CM-NEXT: CNDE_INT T1.Z, PV.Y, PV.X, T1.W,
+; CM-NEXT: SETGT_INT * T1.W, T0.Z, literal.y,
+; CM-NEXT: 1079283712(3.321289e+00), -127(nan)
+; CM-NEXT: CNDE_INT T3.X, PV.W, PV.Z, T0.W,
+; CM-NEXT: MULADD_IEEE T1.Y, T0.X, literal.x, PV.Y,
+; CM-NEXT: ADD T1.Z, T2.W, -PV.X,
+; CM-NEXT: MAX_INT * T2.W, T0.Z, literal.y,
; CM-NEXT: 975668412(6.390323e-04), -330(nan)
-; CM-NEXT: ADD T4.X, T0.Z, -T1.Y,
-; CM-NEXT: ADD_INT T1.Y, PV.W, literal.x,
-; CM-NEXT: ADD_INT T0.Z, T0.Y, literal.y,
-; CM-NEXT: SETGT_UINT * T1.W, T0.Y, literal.z,
+; CM-NEXT: ADD_INT T0.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T2.Y, T0.Z, literal.y,
+; CM-NEXT: TRUNC T2.Z, T4.X,
+; CM-NEXT: ADD * T2.W, PV.Z, PV.Y,
; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43)
-; CM-NEXT: -229(nan), 0(0.000000e+00)
-; CM-NEXT: SETGT_UINT T6.X, T1.X, literal.x,
-; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Y, PV.Z,
-; CM-NEXT: SETGT_INT T0.Z, T0.Y, literal.y,
-; CM-NEXT: ADD * T3.W, PV.X, T1.Z,
-; CM-NEXT: -229(nan), -127(nan)
-; CM-NEXT: EXP_IEEE T1.X (MASKED), T3.W,
-; CM-NEXT: EXP_IEEE T1.Y (MASKED), T3.W,
-; CM-NEXT: EXP_IEEE T1.Z, T3.W,
-; CM-NEXT: EXP_IEEE * T1.W (MASKED), T3.W,
-; CM-NEXT: CNDE_INT T4.X, T0.Z, T1.Y, T0.Y,
-; CM-NEXT: CNDE_INT T1.Y, T6.X, T2.X, T4.Y, BS:VEC_120/SCL_212
-; CM-NEXT: SETGT_INT T2.Z, T1.X, literal.x,
-; CM-NEXT: MUL_IEEE * T3.W, PV.Z, literal.y,
-; CM-NEXT: -127(nan), 209715200(1.972152e-31)
-; CM-NEXT: MUL_IEEE T2.X, T1.Z, literal.x,
-; CM-NEXT: MUL_IEEE T4.Y, PV.W, literal.y,
-; CM-NEXT: CNDE_INT T3.Z, PV.Z, PV.Y, T1.X,
-; CM-NEXT: MIN_INT * T4.W, T1.X, literal.z,
+; CM-NEXT: EXP_IEEE T1.X (MASKED), T2.W,
+; CM-NEXT: EXP_IEEE T1.Y, T2.W,
+; CM-NEXT: EXP_IEEE T1.Z (MASKED), T2.W,
+; CM-NEXT: EXP_IEEE * T1.W (MASKED), T2.W,
+; CM-NEXT: MUL_IEEE T4.X, T0.W, literal.x,
+; CM-NEXT: FLT_TO_INT T3.Y, T2.Z,
+; CM-NEXT: MUL_IEEE T1.Z, PV.Y, literal.y,
+; CM-NEXT: CNDE_INT * T0.W, T0.Y, T0.X, T2.Y,
; CM-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31)
+; CM-NEXT: CNDE_INT T0.X, T1.W, PV.W, T0.Z,
+; CM-NEXT: MUL_IEEE T0.Y, PV.Z, literal.x,
+; CM-NEXT: MAX_INT T2.Z, PV.Y, literal.y,
+; CM-NEXT: MIN_INT * T0.W, PV.Y, literal.z,
+; CM-NEXT: 209715200(1.972152e-31), -330(nan)
; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00)
-; CM-NEXT: MIN_INT T7.X, T0.Y, literal.x,
-; CM-NEXT: ADD_INT T1.Y, PV.W, literal.y,
-; CM-NEXT: ADD_INT T4.Z, T1.X, literal.z,
-; CM-NEXT: SETGT_UINT * T4.W, T1.X, literal.w,
-; CM-NEXT: 381(5.338947e-43), -254(nan)
+; CM-NEXT: ADD_INT T5.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.y,
+; CM-NEXT: ADD_INT T2.Z, T3.Y, literal.z,
+; CM-NEXT: SETGT_UINT * T0.W, T3.Y, literal.w,
+; CM-NEXT: -254(nan), 204(2.858649e-43)
+; CM-NEXT: 102(1.429324e-43), -229(nan)
+; CM-NEXT: ADD_INT T6.X, T3.Y, literal.x,
+; CM-NEXT: SETGT_UINT T4.Y, T3.Y, literal.y,
+; CM-NEXT: CNDE_INT T2.Z, PV.W, PV.Y, PV.Z,
+; CM-NEXT: SETGT_INT * T1.W, T3.Y, literal.x,
; CM-NEXT: -127(nan), 254(3.559298e-43)
-; CM-NEXT: CNDE_INT T8.X, PV.W, PV.Z, PV.Y,
-; CM-NEXT: SETGT_INT T1.Y, T1.X, literal.x,
-; CM-NEXT: ADD_INT T4.Z, PV.X, literal.y,
-; CM-NEXT: ADD_INT * T5.W, T0.Y, literal.z,
+; CM-NEXT: MUL_IEEE T7.X, T1.Y, literal.x,
+; CM-NEXT: CNDE_INT T2.Y, PV.W, PV.Z, T3.Y,
+; CM-NEXT: CNDE_INT T2.Z, PV.Y, PV.X, T5.X,
+; CM-NEXT: MIN_INT * T2.W, T0.Z, literal.y,
+; CM-NEXT: 2130706432(1.701412e+38), 381(5.338947e-43)
+; CM-NEXT: SETGT_INT T5.X, T3.Y, literal.x,
+; CM-NEXT: ADD_INT T3.Y, PV.W, literal.y,
+; CM-NEXT: ADD_INT T3.Z, T0.Z, literal.z,
+; CM-NEXT: SETGT_UINT * T2.W, T0.Z, literal.w,
; CM-NEXT: 127(1.779649e-43), -254(nan)
-; CM-NEXT: -127(nan), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T1.X, T2.W, PV.W, PV.Z,
-; CM-NEXT: CNDE_INT T5.Y, PV.Y, T3.Z, PV.X,
-; CM-NEXT: CNDE_INT T3.Z, T6.X, T4.Y, T3.W,
-; CM-NEXT: MUL_IEEE * T2.W, T2.X, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: -127(nan), 254(3.559298e-43)
+; CM-NEXT: CNDE_INT T6.X, PV.W, PV.Z, PV.Y,
+; CM-NEXT: CNDE_INT T2.Y, PV.X, T2.Y, T2.Z,
+; CM-NEXT: MUL_IEEE T2.Z, T7.X, literal.x,
+; CM-NEXT: CNDE_INT * T0.W, T0.W, T0.Y, T1.Z, BS:VEC_021/SCL_122
; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; CM-NEXT: SETGT_INT T6.X, T0.Y, literal.x,
-; CM-NEXT: CNDE_INT T0.Y, T4.W, T2.X, PV.W,
-; CM-NEXT: CNDE_INT * T1.Z, T2.Z, PV.Z, T1.Z,
-; CM-NEXT: 127(1.779649e-43), 0(0.000000e+00)
-; CM-NEXT: ALU clause starting at 205:
-; CM-NEXT: LSHL * T2.W, T5.Y, literal.x,
-; CM-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; CM-NEXT: ADD_INT T2.X, PV.W, literal.x,
-; CM-NEXT: CNDE_INT T0.Y, T1.Y, T1.Z, T0.Y,
-; CM-NEXT: CNDE_INT * T1.Z, T6.X, T4.X, T1.X,
+; CM-NEXT: SETGT_INT T8.X, T0.Z, literal.x,
+; CM-NEXT: CNDE_INT T0.Y, T1.W, PV.W, T1.Y,
+; CM-NEXT: CNDE_INT T0.Z, T4.Y, T7.X, PV.Z,
+; CM-NEXT: LSHL * T0.W, PV.Y, literal.y,
+; CM-NEXT: 127(1.779649e-43), 23(3.222986e-44)
+; CM-NEXT: ALU clause starting at 202:
+; CM-NEXT: ADD_INT T7.X, T0.W, literal.x,
+; CM-NEXT: CNDE_INT * T0.Y, T5.X, T0.Y, T0.Z,
; CM-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT * T1.W, T1.W, T3.Y, T2.Y,
-; CM-NEXT: CNDE_INT T1.X, T0.Z, PV.W, T0.W,
-; CM-NEXT: LSHL T1.Y, T1.Z, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT: MUL_IEEE T0.Z, T0.Y, T2.X,
+; CM-NEXT: CNDE_INT * T0.Z, T8.X, T0.X, T6.X,
+; CM-NEXT: MUL_IEEE * T0.W, T4.X, literal.x,
+; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
+; CM-NEXT: CNDE_INT T0.X, T2.W, T4.X, PV.W,
+; CM-NEXT: LSHL T1.Y, T0.Z, literal.x,
+; CM-NEXT: MUL_IEEE T0.Z, T0.Y, T7.X, BS:VEC_021/SCL_122
; CM-NEXT: SETGT * T0.W, literal.y, KC0[4].X,
; CM-NEXT: 23(3.222986e-44), -1036817932(-4.485347e+01)
-; CM-NEXT: CNDE T2.X, PV.W, PV.Z, 0.0,
+; CM-NEXT: CNDE T4.X, PV.W, PV.Z, 0.0,
; CM-NEXT: SETGT T0.Y, KC0[4].X, literal.x,
; CM-NEXT: ADD_INT T0.Z, PV.Y, literal.y,
-; CM-NEXT: CNDE_INT * T0.W, T6.X, PV.X, T5.X,
+; CM-NEXT: CNDE_INT * T0.W, T8.X, T3.X, PV.X,
; CM-NEXT: 1109008539(3.853184e+01), 1065353216(1.000000e+00)
-; CM-NEXT: SETGT T1.X, KC0[3].W, literal.x,
+; CM-NEXT: SETGT T0.X, KC0[3].W, literal.x,
; CM-NEXT: MUL_IEEE T1.Y, PV.W, PV.Z,
; CM-NEXT: SETGT T0.Z, literal.y, KC0[3].Z,
; CM-NEXT: CNDE * T0.W, PV.Y, PV.X, literal.z,
; CM-NEXT: 1109008539(3.853184e+01), -1036817932(-4.485347e+01)
; CM-NEXT: 2139095040(INF), 0(0.000000e+00)
-; CM-NEXT: SETGT T2.X, literal.x, KC0[3].Y,
+; CM-NEXT: SETGT T3.X, literal.x, KC0[3].Y,
; CM-NEXT: CNDE T0.Y, PV.Z, PV.Y, 0.0,
-; CM-NEXT: CNDE T0.Z, PV.X, T3.X, literal.y,
+; CM-NEXT: CNDE T0.Z, PV.X, T2.X, literal.y,
; CM-NEXT: SETGT * T1.W, KC0[3].Z, literal.z,
; CM-NEXT: -1036817932(-4.485347e+01), 2139095040(INF)
; CM-NEXT: 1109008539(3.853184e+01), 0(0.000000e+00)
; CM-NEXT: CNDE T0.Y, PV.W, PV.Y, literal.x,
-; CM-NEXT: CNDE T1.Z, PV.X, T0.X, 0.0,
+; CM-NEXT: CNDE T1.Z, PV.X, T1.X, 0.0,
; CM-NEXT: SETGT * T1.W, KC0[3].Y, literal.y,
; CM-NEXT: 2139095040(INF), 1109008539(3.853184e+01)
; CM-NEXT: CNDE * T0.X, PV.W, PV.Z, literal.x,
diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll
index 1b03065..0492c56 100644
--- a/llvm/test/CodeGen/AMDGPU/sad.ll
+++ b/llvm/test/CodeGen/AMDGPU/sad.ll
@@ -1,8 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -earlycse-debug-hash -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; GCN-LABEL: {{^}}v_sad_u32_pat1:
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: v_sad_u32_pat1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: v_sad_u32 v2, s0, v0, v1
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: flat_store_dword v[0:1], v2
+; GCN-NEXT: s_endpgm
%icmp0 = icmp ugt i32 %a, %b
%t0 = select i1 %icmp0, i32 %a, i32 %b
@@ -16,9 +27,18 @@ define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b,
ret void
}
-; GCN-LABEL: {{^}}v_sad_u32_constant_pat1:
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 20
define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a) {
+; GCN-LABEL: v_sad_u32_constant_pat1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s2, s[4:5], 0x2
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, 0x5a
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_sad_u32 v2, s2, v0, 20
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: flat_store_dword v[0:1], v2
+; GCN-NEXT: s_endpgm
%icmp0 = icmp ugt i32 %a, 90
%t0 = select i1 %icmp0, i32 %a, i32 90
@@ -32,9 +52,19 @@ define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a
ret void
}
-; GCN-LABEL: {{^}}v_sad_u32_pat2:
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: v_sad_u32_pat2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: v_sad_u32 v2, s0, v0, v1
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: flat_store_dword v[0:1], v2
+; GCN-NEXT: s_endpgm
%icmp0 = icmp ugt i32 %a, %b
%sub0 = sub i32 %a, %b
%sub1 = sub i32 %b, %a
@@ -46,12 +76,28 @@ define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b,
ret void
}
-; GCN-LABEL: {{^}}v_sad_u32_multi_use_sub_pat1:
-; GCN: s_max_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN: s_min_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: v_sad_u32_multi_use_sub_pat1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b64 s[10:11], s[2:3]
+; GCN-NEXT: s_mov_b64 s[8:9], s[0:1]
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_add_u32 s8, s8, s7
+; GCN-NEXT: s_addc_u32 s9, s9, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_max_u32 s3, s0, s1
+; GCN-NEXT: s_min_u32 s0, s0, s1
+; GCN-NEXT: s_sub_i32 s0, s3, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: s_add_i32 s0, s0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: flat_store_dword v[0:1], v2
+; GCN-NEXT: s_endpgm
%icmp0 = icmp ugt i32 %a, %b
%t0 = select i1 %icmp0, i32 %a, i32 %b
@@ -66,9 +112,25 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i
ret void
}
-; GCN-LABEL: {{^}}v_sad_u32_multi_use_add_pat1:
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: v_sad_u32_multi_use_add_pat1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b64 s[10:11], s[2:3]
+; GCN-NEXT: s_mov_b64 s[8:9], s[0:1]
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_add_u32 s8, s8, s7
+; GCN-NEXT: s_addc_u32 s9, s9, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NEXT: v_mov_b32_e32 v3, s2
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: v_sad_u32 v2, s0, v2, v3
+; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: flat_store_dword v[0:1], v2
+; GCN-NEXT: s_endpgm
%icmp0 = icmp ugt i32 %a, %b
%t0 = select i1 %icmp0, i32 %a, i32 %b
@@ -82,9 +144,27 @@ define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i
ret void
}
-; GCN-LABEL: {{^}}v_sad_u32_multi_use_max_pat1:
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: v_sad_u32_multi_use_max_pat1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b64 s[10:11], s[2:3]
+; GCN-NEXT: s_mov_b64 s[8:9], s[0:1]
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_add_u32 s8, s8, s7
+; GCN-NEXT: s_addc_u32 s9, s9, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_max_u32 s3, s0, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NEXT: v_sad_u32 v3, s0, v0, v1
+; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: flat_store_dword v[0:1], v3
+; GCN-NEXT: s_endpgm
%icmp0 = icmp ugt i32 %a, %b
%t0 = select i1 %icmp0, i32 %a, i32 %b
store volatile i32 %t0, ptr addrspace(5) undef
@@ -99,9 +179,27 @@ define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i
ret void
}
-; GCN-LABEL: {{^}}v_sad_u32_multi_use_min_pat1:
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: v_sad_u32_multi_use_min_pat1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b64 s[10:11], s[2:3]
+; GCN-NEXT: s_mov_b64 s[8:9], s[0:1]
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_add_u32 s8, s8, s7
+; GCN-NEXT: s_addc_u32 s9, s9, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_min_u32 s3, s0, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NEXT: v_sad_u32 v3, s0, v0, v1
+; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: flat_store_dword v[0:1], v3
+; GCN-NEXT: s_endpgm
%icmp0 = icmp ugt i32 %a, %b
%t0 = select i1 %icmp0, i32 %a, i32 %b
@@ -117,9 +215,27 @@ define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i
ret void
}
-; GCN-LABEL: {{^}}v_sad_u32_multi_use_sub_pat2:
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: v_sad_u32_multi_use_sub_pat2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b64 s[10:11], s[2:3]
+; GCN-NEXT: s_mov_b64 s[8:9], s[0:1]
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_add_u32 s8, s8, s7
+; GCN-NEXT: s_addc_u32 s9, s9, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_sub_i32 s3, s0, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NEXT: v_sad_u32 v3, s0, v0, v1
+; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: flat_store_dword v[0:1], v3
+; GCN-NEXT: s_endpgm
%icmp0 = icmp ugt i32 %a, %b
%sub0 = sub i32 %a, %b
store volatile i32 %sub0, ptr addrspace(5) undef
@@ -132,11 +248,29 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i
ret void
}
-; GCN-LABEL: {{^}}v_sad_u32_multi_use_select_pat2:
-; GCN-DAG: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN-DAG: s_cmp_gt_u32 s{{[0-9]+}}, s{{[0-9]+}}
-; GCN-DAG: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: v_sad_u32_multi_use_select_pat2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b64 s[10:11], s[2:3]
+; GCN-NEXT: s_mov_b64 s[8:9], s[0:1]
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_add_u32 s8, s8, s7
+; GCN-NEXT: s_addc_u32 s9, s9, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_sub_i32 s3, s0, s1
+; GCN-NEXT: s_sub_i32 s6, s1, s0
+; GCN-NEXT: s_cmp_gt_u32 s0, s1
+; GCN-NEXT: s_cselect_b32 s0, s3, s6
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: s_add_i32 s0, s0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: flat_store_dword v[0:1], v2
+; GCN-NEXT: s_endpgm
%icmp0 = icmp ugt i32 %a, %b
%sub0 = sub i32 %a, %b
%sub1 = sub i32 %b, %a
@@ -149,12 +283,29 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}v_sad_u32_vector_pat1:
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; GCN-LABEL: v_sad_u32_vector_pat1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x4
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xc
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s15
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v2, s14
+; GCN-NEXT: v_sad_u32 v3, s11, v0, v1
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_sad_u32 v2, s10, v2, v0
+; GCN-NEXT: v_mov_b32_e32 v0, s13
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_sad_u32 v1, s9, v0, v1
+; GCN-NEXT: v_mov_b32_e32 v0, s12
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_sad_u32 v0, s8, v0, v4
+; GCN-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NEXT: s_endpgm
%icmp0 = icmp ugt <4 x i32> %a, %b
%t0 = select <4 x i1> %icmp0, <4 x i32> %a, <4 x i32> %b
@@ -168,12 +319,29 @@ define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32
ret void
}
-; GCN-LABEL: {{^}}v_sad_u32_vector_pat2:
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_vector_pat2(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; GCN-LABEL: v_sad_u32_vector_pat2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x4
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xc
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s15
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v2, s14
+; GCN-NEXT: v_sad_u32 v3, s11, v0, v1
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_sad_u32 v2, s10, v2, v0
+; GCN-NEXT: v_mov_b32_e32 v0, s13
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_sad_u32 v1, s9, v0, v1
+; GCN-NEXT: v_mov_b32_e32 v0, s12
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_sad_u32 v0, s8, v0, v4
+; GCN-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NEXT: s_endpgm
%icmp0 = icmp ugt <4 x i32> %a, %b
%sub0 = sub <4 x i32> %a, %b
%sub1 = sub <4 x i32> %b, %a
@@ -185,10 +353,22 @@ define amdgpu_kernel void @v_sad_u32_vector_pat2(ptr addrspace(1) %out, <4 x i32
ret void
}
-; GCN-LABEL: {{^}}v_sad_u32_i16_pat1:
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16 %b, i16 %c) {
-
+; GCN-LABEL: v_sad_u32_i16_pat1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s6, s[4:5], 0x2
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s4, s6, 0xffff
+; GCN-NEXT: s_lshr_b32 s0, s0, 16
+; GCN-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: v_sad_u32 v2, s4, v1, v0
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: flat_store_short v[0:1], v2
+; GCN-NEXT: s_endpgm
%icmp0 = icmp ugt i16 %a, %b
%t0 = select i1 %icmp0, i16 %a, i16 %b
@@ -202,9 +382,22 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16
ret void
}
-; GCN-LABEL: {{^}}v_sad_u32_i16_pat2:
-; GCN: v_sad_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_i16_pat2(ptr addrspace(1) %out) {
+; GCN-LABEL: v_sad_u32_i16_pat2:
+; GCN: ; %bb.0:
+; GCN-NEXT: flat_load_ushort v0, v[0:1] glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT: flat_load_ushort v1, v[0:1] glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: flat_load_ushort v2, v[0:1] glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_sad_u32 v2, v0, v1, v2
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: flat_store_short v[0:1], v2
+; GCN-NEXT: s_endpgm
%a = load volatile i16, ptr addrspace(1) undef
%b = load volatile i16, ptr addrspace(1) undef
%c = load volatile i16, ptr addrspace(1) undef
@@ -219,9 +412,22 @@ define amdgpu_kernel void @v_sad_u32_i16_pat2(ptr addrspace(1) %out) {
ret void
}
-; GCN-LABEL: {{^}}v_sad_u32_i8_pat1:
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b, i8 %c) {
+; GCN-LABEL: v_sad_u32_i8_pat1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s2, s[4:5], 0x2
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s3, s2, 0xff
+; GCN-NEXT: s_bfe_u32 s4, s2, 0x80008
+; GCN-NEXT: s_lshr_b32 s2, s2, 16
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: v_sad_u32 v2, s3, v0, v1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: flat_store_byte v[0:1], v2
+; GCN-NEXT: s_endpgm
%icmp0 = icmp ugt i8 %a, %b
%t0 = select i1 %icmp0, i8 %a, i8 %b
@@ -235,9 +441,22 @@ define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b
ret void
}
-; GCN-LABEL: {{^}}v_sad_u32_i8_pat2:
-; GCN: v_sad_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_i8_pat2(ptr addrspace(1) %out) {
+; GCN-LABEL: v_sad_u32_i8_pat2:
+; GCN: ; %bb.0:
+; GCN-NEXT: flat_load_ubyte v0, v[0:1] glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT: flat_load_ubyte v1, v[0:1] glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: flat_load_ubyte v2, v[0:1] glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_sad_u32 v2, v0, v1, v2
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: flat_store_byte v[0:1], v2
+; GCN-NEXT: s_endpgm
%a = load volatile i8, ptr addrspace(1) undef
%b = load volatile i8, ptr addrspace(1) undef
%c = load volatile i8, ptr addrspace(1) undef
@@ -252,15 +471,26 @@ define amdgpu_kernel void @v_sad_u32_i8_pat2(ptr addrspace(1) %out) {
ret void
}
-; GCN-LABEL: {{^}}s_sad_u32_i8_pat2:
-; GCN: s_load_dword
-; GCN-DAG: s_bfe_u32
-; GCN-DAG: s_sub_i32
-; GCN-DAG: s_and_b32
-; GCN-DAG: s_sub_i32
-; GCN-DAG: s_lshr_b32
-; GCN: s_add_i32
define amdgpu_kernel void @s_sad_u32_i8_pat2(ptr addrspace(1) %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) {
+; GCN-LABEL: s_sad_u32_i8_pat2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s2, s[4:5], 0x2
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_lshr_b32 s4, s2, 8
+; GCN-NEXT: s_and_b32 s3, s2, 0xff
+; GCN-NEXT: s_bfe_u32 s5, s2, 0x80008
+; GCN-NEXT: s_lshr_b32 s6, s2, 16
+; GCN-NEXT: s_sub_i32 s7, s2, s4
+; GCN-NEXT: s_sub_i32 s2, s4, s2
+; GCN-NEXT: s_cmp_gt_u32 s3, s5
+; GCN-NEXT: s_cselect_b32 s2, s7, s2
+; GCN-NEXT: s_add_i32 s2, s2, s6
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: flat_store_byte v[0:1], v2
+; GCN-NEXT: s_endpgm
%icmp0 = icmp ugt i8 %a, %b
%sub0 = sub i8 %a, %b
%sub1 = sub i8 %b, %a
@@ -272,12 +502,22 @@ define amdgpu_kernel void @s_sad_u32_i8_pat2(ptr addrspace(1) %out, i8 zeroext %
ret void
}
-; GCN-LABEL: {{^}}v_sad_u32_mismatched_operands_pat1:
-; GCN-DAG: s_cmp_le_u32 s{{[0-9]+}}, s{{[0-9]+}}
-; GCN-DAG: s_max_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+; GCN-LABEL: v_sad_u32_mismatched_operands_pat1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_max_u32 s6, s0, s1
+; GCN-NEXT: s_cmp_le_u32 s0, s1
+; GCN-NEXT: s_cselect_b32 s0, s0, s3
+; GCN-NEXT: s_sub_i32 s0, s6, s0
+; GCN-NEXT: s_add_i32 s0, s0, s2
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: flat_store_dword v[0:1], v2
+; GCN-NEXT: s_endpgm
%icmp0 = icmp ugt i32 %a, %b
%t0 = select i1 %icmp0, i32 %a, i32 %b
@@ -291,11 +531,22 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(ptr addrspace(1) %
ret void
}
-; GCN-LABEL: {{^}}v_sad_u32_mismatched_operands_pat2:
-; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+; GCN-LABEL: v_sad_u32_mismatched_operands_pat2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_sub_i32 s3, s0, s3
+; GCN-NEXT: s_sub_i32 s6, s1, s0
+; GCN-NEXT: s_cmp_lt_u32 s1, s0
+; GCN-NEXT: s_cselect_b32 s0, s3, s6
+; GCN-NEXT: s_add_i32 s0, s0, s2
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: flat_store_dword v[0:1], v2
+; GCN-NEXT: s_endpgm
%icmp0 = icmp ugt i32 %a, %b
%sub0 = sub i32 %a, %d
%sub1 = sub i32 %b, %a
diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index b1a82da..b3f4790 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -795,17 +795,17 @@ define amdgpu_kernel void @shl_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: AND_INT T1.Y, T0.Z, literal.x,
-; EG-NEXT: LSHR T1.Z, T0.Y, 1,
+; EG-NEXT: LSHR T1.Y, T0.Y, 1,
+; EG-NEXT: NOT_INT T1.Z, T0.Z,
; EG-NEXT: BIT_ALIGN_INT T0.W, T0.Y, T0.X, 1,
-; EG-NEXT: NOT_INT * T1.W, T0.Z,
+; EG-NEXT: AND_INT * T1.W, T0.Z, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BIT_ALIGN_INT T1.Z, PV.Z, PV.W, PS,
-; EG-NEXT: LSHL T0.W, T0.X, PV.Y,
+; EG-NEXT: LSHL T2.Z, T0.X, PS,
+; EG-NEXT: BIT_ALIGN_INT T0.W, PV.Y, PV.W, PV.Z,
; EG-NEXT: AND_INT * T1.W, T0.Z, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT * T0.Y, PS, PV.Z, PV.W,
-; EG-NEXT: CNDE_INT T0.X, T1.W, T0.W, 0.0,
+; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, PV.Z,
+; EG-NEXT: CNDE_INT T0.X, T1.W, T2.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%b_ptr = getelementptr i64, ptr addrspace(1) %in, i64 1
@@ -858,8 +858,8 @@ define amdgpu_kernel void @shl_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @6
-; EG-NEXT: ALU 22, @11, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 1
+; EG-NEXT: ALU 23, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
@@ -868,27 +868,28 @@ define amdgpu_kernel void @shl_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 11:
-; EG-NEXT: AND_INT T1.Y, T1.Z, literal.x,
+; EG-NEXT: AND_INT * T1.W, T1.Z, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL T2.X, T0.Z, PV.W,
+; EG-NEXT: AND_INT T1.Y, T1.Z, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: LSHR T2.Z, T0.W, 1,
-; EG-NEXT: BIT_ALIGN_INT T0.W, T0.W, T0.Z, 1,
+; EG-NEXT: BIT_ALIGN_INT T0.W, T0.W, T0.Z, 1, BS:VEC_102/SCL_221
; EG-NEXT: NOT_INT * T1.W, T1.Z,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: BIT_ALIGN_INT T3.X, PV.Z, PV.W, PS,
+; EG-NEXT: LSHR T2.Y, T0.Y, 1,
+; EG-NEXT: NOT_INT T0.Z, T1.X,
+; EG-NEXT: BIT_ALIGN_INT T0.W, T0.Y, T0.X, 1,
+; EG-NEXT: AND_INT * T1.W, T1.X, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BIT_ALIGN_INT T0.W, PV.Z, PV.W, PS,
-; EG-NEXT: LSHL * T1.W, T0.Z, PV.Y,
-; EG-NEXT: AND_INT T2.X, T1.Z, literal.x,
-; EG-NEXT: AND_INT T1.Y, T1.X, literal.y,
-; EG-NEXT: LSHR T0.Z, T0.Y, 1,
-; EG-NEXT: BIT_ALIGN_INT T2.W, T0.Y, T0.X, 1,
-; EG-NEXT: NOT_INT * T3.W, T1.X,
-; EG-NEXT: 32(4.484155e-44), 31(4.344025e-44)
-; EG-NEXT: BIT_ALIGN_INT T0.Y, PV.Z, PV.W, PS,
-; EG-NEXT: LSHL T0.Z, T0.X, PV.Y,
-; EG-NEXT: AND_INT T2.W, T1.X, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: CNDE_INT * T3.W, PV.X, T0.W, T1.W,
+; EG-NEXT: LSHL T0.Y, T0.X, PS, BS:VEC_120/SCL_212
+; EG-NEXT: AND_INT T1.Z, T1.X, literal.x, BS:VEC_201
+; EG-NEXT: BIT_ALIGN_INT T0.W, PV.Y, PV.W, PV.Z,
+; EG-NEXT: CNDE_INT * T2.W, T1.Y, PV.X, T2.X,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT T3.Y, PV.W, PV.Y, PV.Z,
-; EG-NEXT: CNDE_INT * T3.Z, T2.X, T1.W, 0.0,
-; EG-NEXT: CNDE_INT T3.X, T2.W, T0.Z, 0.0,
+; EG-NEXT: CNDE_INT T2.Y, PV.Z, PV.W, PV.Y,
+; EG-NEXT: CNDE_INT * T2.Z, T1.Y, T2.X, 0.0,
+; EG-NEXT: CNDE_INT T2.X, T1.Z, T0.Y, 0.0,
; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%b_ptr = getelementptr <2 x i64>, ptr addrspace(1) %in, i64 1
@@ -955,65 +956,66 @@ define amdgpu_kernel void @shl_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 3 @6
-; EG-NEXT: ALU 47, @15, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 1
+; EG-NEXT: ALU 48, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 48, #1
-; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 0, #1
-; EG-NEXT: VTX_READ_128 T3.XYZW, T0.X, 32, #1
-; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 16, #1
+; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 32, #1
+; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 48, #1
+; EG-NEXT: VTX_READ_128 T3.XYZW, T0.X, 16, #1
+; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 15:
-; EG-NEXT: AND_INT T4.Z, T1.Z, literal.x,
-; EG-NEXT: LSHR T1.W, T0.W, 1,
-; EG-NEXT: NOT_INT * T3.W, T1.Z,
+; EG-NEXT: AND_INT * T1.W, T1.Z, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BIT_ALIGN_INT T4.X, T0.W, T0.Z, 1,
-; EG-NEXT: AND_INT T1.Y, T3.Z, literal.x, BS:VEC_201
-; EG-NEXT: LSHR T5.Z, T2.W, 1, BS:VEC_120/SCL_212
-; EG-NEXT: BIT_ALIGN_INT T0.W, T2.W, T2.Z, 1, BS:VEC_102/SCL_221
-; EG-NEXT: NOT_INT * T2.W, T3.Z,
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BIT_ALIGN_INT T3.Y, PV.Z, PV.W, PS,
-; EG-NEXT: LSHL T2.Z, T2.Z, PV.Y,
-; EG-NEXT: BIT_ALIGN_INT T0.W, T1.W, PV.X, T3.W,
-; EG-NEXT: LSHL * T1.W, T0.Z, T4.Z,
+; EG-NEXT: LSHL * T1.W, T0.Z, PV.W,
; EG-NEXT: AND_INT T4.X, T1.Z, literal.x,
-; EG-NEXT: AND_INT T1.Y, T1.X, literal.y,
-; EG-NEXT: LSHR T0.Z, T0.Y, 1,
-; EG-NEXT: BIT_ALIGN_INT T2.W, T0.Y, T0.X, 1,
-; EG-NEXT: NOT_INT * T3.W, T1.X,
+; EG-NEXT: LSHR T1.Y, T3.W, 1,
+; EG-NEXT: NOT_INT T4.Z, T2.Z, BS:VEC_201
+; EG-NEXT: BIT_ALIGN_INT T2.W, T3.W, T3.Z, 1,
+; EG-NEXT: AND_INT * T3.W, T2.Z, literal.y,
; EG-NEXT: 32(4.484155e-44), 31(4.344025e-44)
-; EG-NEXT: AND_INT T5.X, T3.Z, literal.x,
-; EG-NEXT: BIT_ALIGN_INT T0.Y, PV.Z, PV.W, PS,
-; EG-NEXT: LSHL T0.Z, T0.X, PV.Y,
-; EG-NEXT: AND_INT T2.W, T1.X, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: CNDE_INT * T4.W, PV.X, T0.W, T1.W,
+; EG-NEXT: LSHL T5.X, T3.Z, PS,
+; EG-NEXT: AND_INT T2.Y, T2.Z, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BIT_ALIGN_INT T2.Z, PV.Y, PV.W, PV.Z,
+; EG-NEXT: LSHR T2.W, T3.Y, 1,
+; EG-NEXT: NOT_INT * T3.W, T2.X,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: BIT_ALIGN_INT T6.X, T3.Y, T3.X, 1,
+; EG-NEXT: AND_INT T1.Y, T2.X, literal.x,
+; EG-NEXT: LSHR T3.Z, T0.W, 1,
+; EG-NEXT: BIT_ALIGN_INT T0.W, T0.W, T0.Z, 1,
+; EG-NEXT: NOT_INT * T4.W, T1.Z,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: BIT_ALIGN_INT T7.X, PV.Z, PV.W, PS,
+; EG-NEXT: LSHL T1.Y, T3.X, PV.Y, BS:VEC_120/SCL_212
+; EG-NEXT: AND_INT T0.Z, T2.X, literal.x, BS:VEC_201
+; EG-NEXT: BIT_ALIGN_INT T0.W, T2.W, PV.X, T3.W,
+; EG-NEXT: CNDE_INT * T3.W, T2.Y, T2.Z, T5.X,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T0.X, T3.X, literal.x,
-; EG-NEXT: CNDE_INT T4.Y, PV.W, PV.Y, PV.Z,
-; EG-NEXT: LSHR T1.Z, T2.Y, 1,
-; EG-NEXT: BIT_ALIGN_INT T0.W, T2.Y, T2.X, 1,
-; EG-NEXT: NOT_INT * T3.W, T3.X,
+; EG-NEXT: LSHR T2.X, T0.Y, 1,
+; EG-NEXT: CNDE_INT T3.Y, PV.Z, PV.W, PV.Y,
+; EG-NEXT: NOT_INT T1.Z, T1.X,
+; EG-NEXT: BIT_ALIGN_INT T0.W, T0.Y, T0.X, 1,
+; EG-NEXT: AND_INT * T2.W, T1.X, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BIT_ALIGN_INT T1.X, PV.Z, PV.W, PS,
-; EG-NEXT: LSHL T0.Y, T2.X, PV.X,
-; EG-NEXT: CNDE_INT T4.Z, T4.X, T1.W, 0.0, BS:VEC_120/SCL_212
-; EG-NEXT: AND_INT * T0.W, T3.X, literal.x, BS:VEC_201
+; EG-NEXT: LSHL T0.X, T0.X, PS,
+; EG-NEXT: AND_INT T0.Y, T1.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: CNDE_INT T3.Z, T2.Y, T5.X, 0.0, BS:VEC_021/SCL_122
+; EG-NEXT: BIT_ALIGN_INT * T0.W, PV.X, PV.W, PV.Z,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT * T1.W, T5.X, T3.Y, T2.Z,
-; EG-NEXT: CNDE_INT T4.X, T2.W, T0.Z, 0.0,
-; EG-NEXT: CNDE_INT T1.Y, T0.W, T1.X, T0.Y, BS:VEC_120/SCL_212
-; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
+; EG-NEXT: CNDE_INT * T2.W, T4.X, T7.X, T1.W,
+; EG-NEXT: CNDE_INT T3.X, T0.Z, T1.Y, 0.0,
+; EG-NEXT: CNDE_INT T2.Y, T0.Y, T0.W, T0.X,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T0.X, PV.W, literal.x,
-; EG-NEXT: CNDE_INT T1.Z, T5.X, T2.Z, 0.0,
-; EG-NEXT: CNDE_INT * T1.X, T0.W, T0.Y, 0.0,
+; EG-NEXT: LSHR T1.X, PV.W, literal.x,
+; EG-NEXT: CNDE_INT T2.Z, T4.X, T1.W, 0.0,
+; EG-NEXT: CNDE_INT * T2.X, T0.Y, T0.X, 0.0,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
+; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%b_ptr = getelementptr <4 x i64>, ptr addrspace(1) %in, i64 1
%a = load <4 x i64>, ptr addrspace(1) %in
@@ -1172,17 +1174,17 @@ define amdgpu_kernel void @s_shl_constant_i64(ptr addrspace(1) %out, i64 %a) {
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: AND_INT T0.Z, KC0[2].W, literal.x,
-; EG-NEXT: MOV T0.W, literal.y,
-; EG-NEXT: NOT_INT * T1.W, KC0[2].W,
-; EG-NEXT: 31(4.344025e-44), -1(nan)
-; EG-NEXT: BIT_ALIGN_INT T1.Z, literal.x, PV.W, PS,
-; EG-NEXT: LSHL T0.W, literal.y, PV.Z,
+; EG-NEXT: MOV T0.Z, literal.x,
+; EG-NEXT: NOT_INT T0.W, KC0[2].W,
+; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y,
+; EG-NEXT: -1(nan), 31(4.344025e-44)
+; EG-NEXT: LSHL T1.Z, literal.x, PS,
+; EG-NEXT: BIT_ALIGN_INT T0.W, literal.y, PV.Z, PV.W,
; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.z,
-; EG-NEXT: 32767(4.591635e-41), -1(nan)
+; EG-NEXT: -1(nan), 32767(4.591635e-41)
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT * T0.Y, PS, PV.Z, PV.W,
-; EG-NEXT: CNDE_INT T0.X, T1.W, T0.W, 0.0,
+; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, PV.Z,
+; EG-NEXT: CNDE_INT T0.X, T1.W, T1.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%shl = shl i64 281474976710655, %a
@@ -1423,15 +1425,15 @@ define amdgpu_kernel void @s_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: NOT_INT T0.W, KC0[2].W,
-; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.x,
+; EG-NEXT: AND_INT T0.W, KC0[2].W, literal.x,
+; EG-NEXT: NOT_INT * T1.W, KC0[2].W,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: LSHL T0.Z, literal.x, PS,
-; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, literal.y, PV.W,
-; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y,
-; EG-NEXT: 64(8.968310e-44), 32(4.484155e-44)
-; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, PV.Z,
-; EG-NEXT: CNDE_INT T0.X, T1.W, T0.Z, 0.0,
+; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, literal.x, PS,
+; EG-NEXT: AND_INT T1.W, KC0[2].W, literal.x,
+; EG-NEXT: LSHL * T0.W, literal.y, PV.W,
+; EG-NEXT: 32(4.484155e-44), 64(8.968310e-44)
+; EG-NEXT: CNDE_INT * T0.Y, PV.W, PV.Z, PS,
+; EG-NEXT: CNDE_INT T0.X, T1.W, T0.W, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%shl = shl i64 64, %a
@@ -1903,16 +1905,16 @@ define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(ptr addrspace(1) %out, p
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: NOT_INT T0.W, KC0[2].W,
-; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.x,
+; EG-NEXT: AND_INT T0.W, KC0[2].W, literal.x,
+; EG-NEXT: NOT_INT * T1.W, KC0[2].W,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: LSHL T0.Z, literal.x, PS,
-; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, literal.y, PV.W,
-; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.z,
-; EG-NEXT: 1082130432(4.000000e+00), 541065216(1.626303e-19)
-; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, PV.Z,
-; EG-NEXT: CNDE_INT T0.X, T1.W, T0.Z, 0.0,
+; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, literal.x, PS,
+; EG-NEXT: AND_INT T1.W, KC0[2].W, literal.y,
+; EG-NEXT: LSHL * T0.W, literal.z, PV.W,
+; EG-NEXT: 541065216(1.626303e-19), 32(4.484155e-44)
+; EG-NEXT: 1082130432(4.000000e+00), 0(0.000000e+00)
+; EG-NEXT: CNDE_INT * T0.Y, PV.W, PV.Z, PS,
+; EG-NEXT: CNDE_INT T0.X, T1.W, T0.W, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%shl = shl i64 1082130432, %a
@@ -1959,17 +1961,17 @@ define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4_0_i64(ptr addrspace(1) %ou
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: AND_INT T0.Z, KC0[2].W, literal.x,
-; EG-NEXT: MOV T0.W, literal.y,
-; EG-NEXT: NOT_INT * T1.W, KC0[2].W,
-; EG-NEXT: 31(4.344025e-44), -532676608(-5.534023e+19)
-; EG-NEXT: BIT_ALIGN_INT T1.Z, literal.x, PV.W, PS,
-; EG-NEXT: LSHL T0.W, literal.y, PV.Z,
+; EG-NEXT: MOV T0.Z, literal.x,
+; EG-NEXT: NOT_INT T0.W, KC0[2].W,
+; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y,
+; EG-NEXT: -532676608(-5.534023e+19), 31(4.344025e-44)
+; EG-NEXT: LSHL T1.Z, literal.x, PS,
+; EG-NEXT: BIT_ALIGN_INT T0.W, literal.y, PV.Z, PV.W,
; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.z,
-; EG-NEXT: 2147483647(nan), -1065353216(-4.000000e+00)
+; EG-NEXT: -1065353216(-4.000000e+00), 2147483647(nan)
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT * T0.Y, PS, PV.Z, PV.W,
-; EG-NEXT: CNDE_INT T0.X, T1.W, T0.W, 0.0,
+; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, PV.Z,
+; EG-NEXT: CNDE_INT T0.X, T1.W, T1.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%shl = shl i64 -1065353216, %a
diff --git a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
index 152eba5..5a241f8 100644
--- a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
+++ b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
@@ -506,8 +506,8 @@ entry:
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_m2
-; GCN-POSTLINK: call fast float @_Z5rootnfi(float %tmp, i32 -2)
-; GCN-PRELINK: %__rootn2rsqrt = tail call fast float @_Z5rsqrtf(float %tmp)
+; GCN: [[SQRT:%.+]] = tail call fast float @llvm.sqrt.f32(float %tmp)
+; GCN-NEXT: fdiv fast float 1.000000e+00, [[SQRT]]
define amdgpu_kernel void @test_rootn_m2(ptr addrspace(1) nocapture %a) {
entry:
%tmp = load float, ptr addrspace(1) %a, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
index dcc5fbd..7dce633 100644
--- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
@@ -264,6 +264,142 @@ ret:
ret void
}
+define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; NOHSA-TRAP-GFX900-LABEL: trap_with_use_after:
+; NOHSA-TRAP-GFX900: ; %bb.0:
+; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0
+; NOHSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; NOHSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc
+; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
+; NOHSA-TRAP-GFX900-NEXT: s_cbranch_execnz .LBB2_2
+; NOHSA-TRAP-GFX900-NEXT: ; %bb.1:
+; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[2:3]
+; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
+; NOHSA-TRAP-GFX900-NEXT: .LBB2_2:
+; NOHSA-TRAP-GFX900-NEXT: s_endpgm
+;
+; HSA-TRAP-GFX803-LABEL: trap_with_use_after:
+; HSA-TRAP-GFX803: ; %bb.0:
+; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[4:5]
+; HSA-TRAP-GFX803-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0
+; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s4
+; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s5
+; HSA-TRAP-GFX803-NEXT: flat_load_dword v2, v[0:1] glc
+; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s6
+; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s7
+; HSA-TRAP-GFX803-NEXT: s_trap 2
+; HSA-TRAP-GFX803-NEXT: flat_store_dword v[0:1], v2
+; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX803-NEXT: s_endpgm
+;
+; HSA-TRAP-GFX900-LABEL: trap_with_use_after:
+; HSA-TRAP-GFX900: ; %bb.0:
+; HSA-TRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0
+; HSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc
+; HSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX900-NEXT: s_trap 2
+; HSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[2:3]
+; HSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX900-NEXT: s_endpgm
+;
+; HSA-NOTRAP-GFX900-LABEL: trap_with_use_after:
+; HSA-NOTRAP-GFX900: ; %bb.0:
+; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0
+; HSA-NOTRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-NOTRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc
+; HSA-NOTRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
+; HSA-NOTRAP-GFX900-NEXT: s_cbranch_execnz .LBB2_2
+; HSA-NOTRAP-GFX900-NEXT: ; %bb.1:
+; HSA-NOTRAP-GFX900-NEXT: global_store_dword v0, v1, s[2:3]
+; HSA-NOTRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
+; HSA-NOTRAP-GFX900-NEXT: .LBB2_2:
+; HSA-NOTRAP-GFX900-NEXT: s_endpgm
+;
+; HSA-TRAP-GFX1100-LABEL: trap_with_use_after:
+; HSA-TRAP-GFX1100: ; %bb.0:
+; HSA-TRAP-GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; HSA-TRAP-GFX1100-NEXT: v_mov_b32_e32 v0, 0
+; HSA-TRAP-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX1100-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; HSA-TRAP-GFX1100-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX1100-NEXT: s_cbranch_execnz .LBB2_2
+; HSA-TRAP-GFX1100-NEXT: ; %bb.1:
+; HSA-TRAP-GFX1100-NEXT: global_store_b32 v0, v1, s[2:3] dlc
+; HSA-TRAP-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; HSA-TRAP-GFX1100-NEXT: s_nop 0
+; HSA-TRAP-GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; HSA-TRAP-GFX1100-NEXT: s_endpgm
+; HSA-TRAP-GFX1100-NEXT: .LBB2_2:
+; HSA-TRAP-GFX1100-NEXT: s_trap 2
+; HSA-TRAP-GFX1100-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL)
+; HSA-TRAP-GFX1100-NEXT: s_mov_b32 ttmp2, m0
+; HSA-TRAP-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX1100-NEXT: s_and_b32 s0, s0, 0x3ff
+; HSA-TRAP-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; HSA-TRAP-GFX1100-NEXT: s_bitset1_b32 s0, 10
+; HSA-TRAP-GFX1100-NEXT: s_mov_b32 m0, s0
+; HSA-TRAP-GFX1100-NEXT: s_sendmsg sendmsg(MSG_INTERRUPT)
+; HSA-TRAP-GFX1100-NEXT: s_mov_b32 m0, ttmp2
+; HSA-TRAP-GFX1100-NEXT: .LBB2_3: ; =>This Inner Loop Header: Depth=1
+; HSA-TRAP-GFX1100-NEXT: s_sethalt 5
+; HSA-TRAP-GFX1100-NEXT: s_branch .LBB2_3
+;
+; HSA-TRAP-GFX1100-O0-LABEL: trap_with_use_after:
+; HSA-TRAP-GFX1100-O0: ; %bb.0:
+; HSA-TRAP-GFX1100-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
+; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v0, 0
+; HSA-TRAP-GFX1100-O0-NEXT: scratch_store_b32 off, v0, off offset:8 ; 4-byte Folded Spill
+; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
+; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX1100-O0-NEXT: v_writelane_b32 v1, s2, 0
+; HSA-TRAP-GFX1100-O0-NEXT: v_writelane_b32 v1, s3, 1
+; HSA-TRAP-GFX1100-O0-NEXT: s_or_saveexec_b32 s6, -1
+; HSA-TRAP-GFX1100-O0-NEXT: scratch_store_b32 off, v1, off offset:4 ; 4-byte Folded Spill
+; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 exec_lo, s6
+; HSA-TRAP-GFX1100-O0-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
+; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX1100-O0-NEXT: scratch_store_b32 off, v0, off ; 4-byte Folded Spill
+; HSA-TRAP-GFX1100-O0-NEXT: s_cbranch_execnz .LBB2_2
+; HSA-TRAP-GFX1100-O0-NEXT: ; %bb.1:
+; HSA-TRAP-GFX1100-O0-NEXT: s_or_saveexec_b32 s6, -1
+; HSA-TRAP-GFX1100-O0-NEXT: scratch_load_b32 v0, off, off offset:4 ; 4-byte Folded Reload
+; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 exec_lo, s6
+; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX1100-O0-NEXT: v_readlane_b32 s0, v0, 0
+; HSA-TRAP-GFX1100-O0-NEXT: v_readlane_b32 s1, v0, 1
+; HSA-TRAP-GFX1100-O0-NEXT: scratch_load_b32 v1, off, off offset:8 ; 4-byte Folded Reload
+; HSA-TRAP-GFX1100-O0-NEXT: scratch_load_b32 v2, off, off ; 4-byte Folded Reload
+; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX1100-O0-NEXT: global_store_b32 v1, v2, s[0:1] dlc
+; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt_vscnt null, 0x0
+; HSA-TRAP-GFX1100-O0-NEXT: ; kill: killed $vgpr0
+; HSA-TRAP-GFX1100-O0-NEXT: s_endpgm
+; HSA-TRAP-GFX1100-O0-NEXT: .LBB2_2:
+; HSA-TRAP-GFX1100-O0-NEXT: s_trap 2
+; HSA-TRAP-GFX1100-O0-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL)
+; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 ttmp2, m0
+; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX1100-O0-NEXT: s_and_b32 s0, s0, 0x3ff
+; HSA-TRAP-GFX1100-O0-NEXT: s_or_b32 s0, s0, 0x400
+; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 m0, s0
+; HSA-TRAP-GFX1100-O0-NEXT: s_sendmsg sendmsg(MSG_INTERRUPT)
+; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 m0, ttmp2
+; HSA-TRAP-GFX1100-O0-NEXT: .LBB2_3: ; =>This Inner Loop Header: Depth=1
+; HSA-TRAP-GFX1100-O0-NEXT: s_sethalt 5
+; HSA-TRAP-GFX1100-O0-NEXT: s_branch .LBB2_3
+ %tmp = load volatile i32, ptr addrspace(1) %arg0
+ call void @llvm.trap()
+ store volatile i32 %tmp, ptr addrspace(1) %arg1
+ ret void
+}
+
define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) {
; NOHSA-TRAP-GFX900-LABEL: debugtrap:
; NOHSA-TRAP-GFX900: ; %bb.0:
@@ -334,6 +470,20 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0)
; HSA-TRAP-GFX1100-NEXT: s_nop 0
; HSA-TRAP-GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; HSA-TRAP-GFX1100-NEXT: s_endpgm
+;
+; HSA-TRAP-GFX1100-O0-LABEL: debugtrap:
+; HSA-TRAP-GFX1100-O0: ; %bb.0:
+; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v0, 0
+; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v1, 1
+; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX1100-O0-NEXT: global_store_b32 v0, v1, s[0:1] dlc
+; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt_vscnt null, 0x0
+; HSA-TRAP-GFX1100-O0-NEXT: s_trap 3
+; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v1, 2
+; HSA-TRAP-GFX1100-O0-NEXT: global_store_b32 v0, v1, s[0:1] dlc
+; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt_vscnt null, 0x0
+; HSA-TRAP-GFX1100-O0-NEXT: s_endpgm
store volatile i32 1, ptr addrspace(1) %arg0
call void @llvm.debugtrap()
store volatile i32 2, ptr addrspace(1) %arg0
diff --git a/llvm/test/CodeGen/Mips/mipsr6-minmaxnum.ll b/llvm/test/CodeGen/Mips/mipsr6-minmaxnum.ll
index e14e899..2a0ad07 100644
--- a/llvm/test/CodeGen/Mips/mipsr6-minmaxnum.ll
+++ b/llvm/test/CodeGen/Mips/mipsr6-minmaxnum.ll
@@ -6,13 +6,17 @@
define float @mins(float %x, float %y) {
; MIPS32R6EL-LABEL: mins
; MIPS32R6EL: # %bb.0:
+; MIPS32R6EL-NEXT: min.s $f0, $f14, $f14
+; MIPS32R6EL-NEXT: min.s $f1, $f12, $f12
; MIPS32R6EL-NEXT: jr $ra
-; MIPS32R6EL-NEXT: min.s $f0, $f12, $f14
+; MIPS32R6EL-NEXT: min.s $f0, $f1, $f0
;
; MIPS64R6EL-LABEL: mins
; MIPS64R6EL: # %bb.0:
+; MIPS64R6EL-NEXT: min.s $f0, $f13, $f13
+; MIPS64R6EL-NEXT: min.s $f1, $f12, $f12
; MIPS64R6EL-NEXT: jr $ra
-; MIPS64R6EL-NEXT: min.s $f0, $f12, $f13
+; MIPS64R6EL-NEXT: min.s $f0, $f1, $f0
%r = tail call float @llvm.minnum.f32(float %x, float %y)
ret float %r
@@ -21,13 +25,17 @@ define float @mins(float %x, float %y) {
define float @maxs(float %x, float %y) {
; MIPS32R6EL-LABEL: maxs
; MIPS32R6EL: # %bb.0:
+; MIPS32R6EL-NEXT: min.s $f0, $f14, $f14
+; MIPS32R6EL-NEXT: min.s $f1, $f12, $f12
; MIPS32R6EL-NEXT: jr $ra
-; MIPS32R6EL-NEXT: max.s $f0, $f12, $f14
+; MIPS32R6EL-NEXT: max.s $f0, $f1, $f0
;
; MIPS64R6EL-LABEL: maxs
; MIPS64R6EL: # %bb.0:
+; MIPS64R6EL-NEXT: min.s $f0, $f13, $f13
+; MIPS64R6EL-NEXT: min.s $f1, $f12, $f12
; MIPS64R6EL-NEXT: jr $ra
-; MIPS64R6EL-NEXT: max.s $f0, $f12, $f13
+; MIPS64R6EL-NEXT: max.s $f0, $f1, $f0
%r = tail call float @llvm.maxnum.f32(float %x, float %y)
ret float %r
@@ -36,13 +44,17 @@ define float @maxs(float %x, float %y) {
define double @mind(double %x, double %y) {
; MIPS32R6EL-LABEL: mind
; MIPS32R6EL: # %bb.0:
+; MIPS32R6EL-NEXT: min.d $f0, $f14, $f14
+; MIPS32R6EL-NEXT: min.d $f1, $f12, $f12
; MIPS32R6EL-NEXT: jr $ra
-; MIPS32R6EL-NEXT: min.d $f0, $f12, $f14
+; MIPS32R6EL-NEXT: min.d $f0, $f1, $f0
;
; MIPS64R6EL-LABEL: mind
; MIPS64R6EL: # %bb.0:
+; MIPS64R6EL-NEXT: min.d $f0, $f13, $f13
+; MIPS64R6EL-NEXT: min.d $f1, $f12, $f12
; MIPS64R6EL-NEXT: jr $ra
-; MIPS64R6EL-NEXT: min.d $f0, $f12, $f13
+; MIPS64R6EL-NEXT: min.d $f0, $f1, $f0
%r = tail call double @llvm.minnum.f64(double %x, double %y)
ret double %r
@@ -51,13 +63,17 @@ define double @mind(double %x, double %y) {
define double @maxd(double %x, double %y) {
; MIPS32R6EL-LABEL: maxd
; MIPS32R6EL: # %bb.0:
+; MIPS32R6EL-NEXT: min.d $f0, $f14, $f14
+; MIPS32R6EL-NEXT: min.d $f1, $f12, $f12
; MIPS32R6EL-NEXT: jr $ra
-; MIPS32R6EL-NEXT: max.d $f0, $f12, $f14
+; MIPS32R6EL-NEXT: max.d $f0, $f1, $f0
;
; MIPS64R6EL-LABEL: maxd
; MIPS64R6EL: # %bb.0:
+; MIPS64R6EL-NEXT: min.d $f0, $f13, $f13
+; MIPS64R6EL-NEXT: min.d $f1, $f12, $f12
; MIPS64R6EL-NEXT: jr $ra
-; MIPS64R6EL-NEXT: max.d $f0, $f12, $f13
+; MIPS64R6EL-NEXT: max.d $f0, $f1, $f0
%r = tail call double @llvm.maxnum.f64(double %x, double %y)
ret double %r
diff --git a/llvm/test/CodeGen/Mips/msa/f16-llvm-ir.ll b/llvm/test/CodeGen/Mips/msa/f16-llvm-ir.ll
index fe68bee4..42b0f69 100644
--- a/llvm/test/CodeGen/Mips/msa/f16-llvm-ir.ll
+++ b/llvm/test/CodeGen/Mips/msa/f16-llvm-ir.ll
@@ -2466,13 +2466,14 @@ define void @fminnum(float %b) {
; MIPSR6-O32-NEXT: lui $2, %hi(_gp_disp)
; MIPSR6-O32-NEXT: addiu $2, $2, %lo(_gp_disp)
; MIPSR6-O32-NEXT: addu $1, $2, $25
+; MIPSR6-O32-NEXT: min.s $f0, $f12, $f12
; MIPSR6-O32-NEXT: lw $1, %got(g)($1)
; MIPSR6-O32-NEXT: lh $2, 0($1)
-; MIPSR6-O32-NEXT: fill.h $w0, $2
-; MIPSR6-O32-NEXT: fexupr.w $w0, $w0
-; MIPSR6-O32-NEXT: copy_s.w $2, $w0[0]
-; MIPSR6-O32-NEXT: mtc1 $2, $f0
-; MIPSR6-O32-NEXT: min.s $f0, $f0, $f12
+; MIPSR6-O32-NEXT: fill.h $w1, $2
+; MIPSR6-O32-NEXT: fexupr.w $w1, $w1
+; MIPSR6-O32-NEXT: copy_s.w $2, $w1[0]
+; MIPSR6-O32-NEXT: mtc1 $2, $f1
+; MIPSR6-O32-NEXT: min.s $f0, $f1, $f0
; MIPSR6-O32-NEXT: mfc1 $2, $f0
; MIPSR6-O32-NEXT: fill.w $w0, $2
; MIPSR6-O32-NEXT: fexdo.h $w0, $w0, $w0
@@ -2485,13 +2486,14 @@ define void @fminnum(float %b) {
; MIPSR6-N32-NEXT: lui $1, %hi(%neg(%gp_rel(fminnum)))
; MIPSR6-N32-NEXT: addu $1, $1, $25
; MIPSR6-N32-NEXT: addiu $1, $1, %lo(%neg(%gp_rel(fminnum)))
+; MIPSR6-N32-NEXT: min.s $f0, $f12, $f12
; MIPSR6-N32-NEXT: lw $1, %got_disp(g)($1)
; MIPSR6-N32-NEXT: lh $2, 0($1)
-; MIPSR6-N32-NEXT: fill.h $w0, $2
-; MIPSR6-N32-NEXT: fexupr.w $w0, $w0
-; MIPSR6-N32-NEXT: copy_s.w $2, $w0[0]
-; MIPSR6-N32-NEXT: mtc1 $2, $f0
-; MIPSR6-N32-NEXT: min.s $f0, $f0, $f12
+; MIPSR6-N32-NEXT: fill.h $w1, $2
+; MIPSR6-N32-NEXT: fexupr.w $w1, $w1
+; MIPSR6-N32-NEXT: copy_s.w $2, $w1[0]
+; MIPSR6-N32-NEXT: mtc1 $2, $f1
+; MIPSR6-N32-NEXT: min.s $f0, $f1, $f0
; MIPSR6-N32-NEXT: mfc1 $2, $f0
; MIPSR6-N32-NEXT: fill.w $w0, $2
; MIPSR6-N32-NEXT: fexdo.h $w0, $w0, $w0
@@ -2504,20 +2506,20 @@ define void @fminnum(float %b) {
; MIPSR6-N64-NEXT: lui $1, %hi(%neg(%gp_rel(fminnum)))
; MIPSR6-N64-NEXT: daddu $1, $1, $25
; MIPSR6-N64-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(fminnum)))
+; MIPSR6-N64-NEXT: min.s $f0, $f12, $f12
; MIPSR6-N64-NEXT: ld $1, %got_disp(g)($1)
; MIPSR6-N64-NEXT: lh $2, 0($1)
-; MIPSR6-N64-NEXT: fill.h $w0, $2
-; MIPSR6-N64-NEXT: fexupr.w $w0, $w0
-; MIPSR6-N64-NEXT: copy_s.w $2, $w0[0]
-; MIPSR6-N64-NEXT: mtc1 $2, $f0
-; MIPSR6-N64-NEXT: min.s $f0, $f0, $f12
+; MIPSR6-N64-NEXT: fill.h $w1, $2
+; MIPSR6-N64-NEXT: fexupr.w $w1, $w1
+; MIPSR6-N64-NEXT: copy_s.w $2, $w1[0]
+; MIPSR6-N64-NEXT: mtc1 $2, $f1
+; MIPSR6-N64-NEXT: min.s $f0, $f1, $f0
; MIPSR6-N64-NEXT: mfc1 $2, $f0
; MIPSR6-N64-NEXT: fill.w $w0, $2
; MIPSR6-N64-NEXT: fexdo.h $w0, $w0, $w0
; MIPSR6-N64-NEXT: copy_u.h $2, $w0[0]
; MIPSR6-N64-NEXT: jr $ra
; MIPSR6-N64-NEXT: sh $2, 0($1)
-;
entry:
%0 = load i16, ptr @g, align 2
%1 = call float @llvm.convert.from.fp16.f32(i16 %0)
@@ -2632,17 +2634,18 @@ define void @fmaxnum(float %b) {
; MIPS64R5-N64-NEXT: daddiu $sp, $sp, 32
;
; MIPSR6-O32-LABEL: fmaxnum:
-; MIPSR6-O32: # %bb.0:
+; MIPSR6-O32: # %bb.0: # %entry
; MIPSR6-O32-NEXT: lui $2, %hi(_gp_disp)
; MIPSR6-O32-NEXT: addiu $2, $2, %lo(_gp_disp)
; MIPSR6-O32-NEXT: addu $1, $2, $25
+; MIPSR6-O32-NEXT: min.s $f0, $f12, $f12
; MIPSR6-O32-NEXT: lw $1, %got(g)($1)
; MIPSR6-O32-NEXT: lh $2, 0($1)
-; MIPSR6-O32-NEXT: fill.h $w0, $2
-; MIPSR6-O32-NEXT: fexupr.w $w0, $w0
-; MIPSR6-O32-NEXT: copy_s.w $2, $w0[0]
-; MIPSR6-O32-NEXT: mtc1 $2, $f0
-; MIPSR6-O32-NEXT: max.s $f0, $f0, $f12
+; MIPSR6-O32-NEXT: fill.h $w1, $2
+; MIPSR6-O32-NEXT: fexupr.w $w1, $w1
+; MIPSR6-O32-NEXT: copy_s.w $2, $w1[0]
+; MIPSR6-O32-NEXT: mtc1 $2, $f1
+; MIPSR6-O32-NEXT: max.s $f0, $f1, $f0
; MIPSR6-O32-NEXT: mfc1 $2, $f0
; MIPSR6-O32-NEXT: fill.w $w0, $2
; MIPSR6-O32-NEXT: fexdo.h $w0, $w0, $w0
@@ -2651,17 +2654,18 @@ define void @fmaxnum(float %b) {
; MIPSR6-O32-NEXT: sh $2, 0($1)
;
; MIPSR6-N32-LABEL: fmaxnum:
-; MIPSR6-N32: # %bb.0:
+; MIPSR6-N32: # %bb.0: # %entry
; MIPSR6-N32-NEXT: lui $1, %hi(%neg(%gp_rel(fmaxnum)))
; MIPSR6-N32-NEXT: addu $1, $1, $25
; MIPSR6-N32-NEXT: addiu $1, $1, %lo(%neg(%gp_rel(fmaxnum)))
+; MIPSR6-N32-NEXT: min.s $f0, $f12, $f12
; MIPSR6-N32-NEXT: lw $1, %got_disp(g)($1)
; MIPSR6-N32-NEXT: lh $2, 0($1)
-; MIPSR6-N32-NEXT: fill.h $w0, $2
-; MIPSR6-N32-NEXT: fexupr.w $w0, $w0
-; MIPSR6-N32-NEXT: copy_s.w $2, $w0[0]
-; MIPSR6-N32-NEXT: mtc1 $2, $f0
-; MIPSR6-N32-NEXT: max.s $f0, $f0, $f12
+; MIPSR6-N32-NEXT: fill.h $w1, $2
+; MIPSR6-N32-NEXT: fexupr.w $w1, $w1
+; MIPSR6-N32-NEXT: copy_s.w $2, $w1[0]
+; MIPSR6-N32-NEXT: mtc1 $2, $f1
+; MIPSR6-N32-NEXT: max.s $f0, $f1, $f0
; MIPSR6-N32-NEXT: mfc1 $2, $f0
; MIPSR6-N32-NEXT: fill.w $w0, $2
; MIPSR6-N32-NEXT: fexdo.h $w0, $w0, $w0
@@ -2670,17 +2674,18 @@ define void @fmaxnum(float %b) {
; MIPSR6-N32-NEXT: sh $2, 0($1)
;
; MIPSR6-N64-LABEL: fmaxnum:
-; MIPSR6-N64: # %bb.0:
+; MIPSR6-N64: # %bb.0: # %entry
; MIPSR6-N64-NEXT: lui $1, %hi(%neg(%gp_rel(fmaxnum)))
; MIPSR6-N64-NEXT: daddu $1, $1, $25
; MIPSR6-N64-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(fmaxnum)))
+; MIPSR6-N64-NEXT: min.s $f0, $f12, $f12
; MIPSR6-N64-NEXT: ld $1, %got_disp(g)($1)
; MIPSR6-N64-NEXT: lh $2, 0($1)
-; MIPSR6-N64-NEXT: fill.h $w0, $2
-; MIPSR6-N64-NEXT: fexupr.w $w0, $w0
-; MIPSR6-N64-NEXT: copy_s.w $2, $w0[0]
-; MIPSR6-N64-NEXT: mtc1 $2, $f0
-; MIPSR6-N64-NEXT: max.s $f0, $f0, $f12
+; MIPSR6-N64-NEXT: fill.h $w1, $2
+; MIPSR6-N64-NEXT: fexupr.w $w1, $w1
+; MIPSR6-N64-NEXT: copy_s.w $2, $w1[0]
+; MIPSR6-N64-NEXT: mtc1 $2, $f1
+; MIPSR6-N64-NEXT: max.s $f0, $f1, $f0
; MIPSR6-N64-NEXT: mfc1 $2, $f0
; MIPSR6-N64-NEXT: fill.w $w0, $2
; MIPSR6-N64-NEXT: fexdo.h $w0, $w0, $w0
diff --git a/llvm/test/CodeGen/PowerPC/aix-tocdata-fastisel.ll b/llvm/test/CodeGen/PowerPC/aix-tocdata-fastisel.ll
index 5a7fcd1..6533891 100644
--- a/llvm/test/CodeGen/PowerPC/aix-tocdata-fastisel.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-tocdata-fastisel.ll
@@ -1,9 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=powerpc64-ibm-aix-xcoff -fast-isel -verify-machineinstrs \
; RUN: -code-model=small | FileCheck %s --check-prefix=SMALL
-
-;; FIXME: when toc data for 64 big large code model is supported,
-;; add a run line for large code model too.
+; RUN: llc < %s -mtriple=powerpc64-ibm-aix-xcoff -fast-isel -verify-machineinstrs \
+; RUN: -code-model=large | FileCheck %s --check-prefix=LARGE
@a = global i32 0, align 4 #0
@@ -11,9 +10,15 @@ define signext i32 @foo() #1 {
; SMALL-LABEL: foo:
; SMALL: # %bb.0: # %entry
; SMALL-NEXT: la 3, a[TD](2)
-; SMALL-NEXT: lwz 3, 0(3)
-; SMALL-NEXT: extsw 3, 3
+; SMALL-NEXT: lwa 3, 0(3)
; SMALL-NEXT: blr
+;
+; LARGE-LABEL: foo:
+; LARGE: # %bb.0: # %entry
+; LARGE-NEXT: addis 3, a[TD]@u(2)
+; LARGE-NEXT: la 3, a[TD]@l(3)
+; LARGE-NEXT: lwa 3, 0(3)
+; LARGE-NEXT: blr
entry:
%0 = load i32, ptr @a, align 4
ret i32 %0
diff --git a/llvm/test/CodeGen/PowerPC/ctrloop-le.ll b/llvm/test/CodeGen/PowerPC/ctrloop-le.ll
index 599e540..08ecd89 100644
--- a/llvm/test/CodeGen/PowerPC/ctrloop-le.ll
+++ b/llvm/test/CodeGen/PowerPC/ctrloop-le.ll
@@ -293,8 +293,7 @@ for.end: ; preds = %for.body, %entry
; CHECK: test_pos1_rr_sle
-; FIXME: Support this loop!
-; CHECK-NOT: bdnz
+; CHECK: bdnz
; a < b
define void @test_pos1_rr_sle(ptr nocapture %p, i32 %a, i32 %b) nounwind {
entry:
@@ -323,8 +322,7 @@ for.end: ; preds = %for.body, %entry
; CHECK: test_pos2_rr_sle
-; FIXME: Support this loop!
-; CHECK-NOT: bdnz
+; CHECK: bdnz
; a < b
define void @test_pos2_rr_sle(ptr nocapture %p, i32 %a, i32 %b) nounwind {
entry:
@@ -353,8 +351,7 @@ for.end: ; preds = %for.body, %entry
; CHECK: test_pos4_rr_sle
-; FIXME: Support this loop!
-; CHECK-NOT: bdnz
+; CHECK: bdnz
; a < b
define void @test_pos4_rr_sle(ptr nocapture %p, i32 %a, i32 %b) nounwind {
entry:
@@ -383,8 +380,7 @@ for.end: ; preds = %for.body, %entry
; CHECK: test_pos8_rr_sle
-; FIXME: Support this loop!
-; CHECK-NOT: bdnz
+; CHECK: bdnz
; a < b
define void @test_pos8_rr_sle(ptr nocapture %p, i32 %a, i32 %b) nounwind {
entry:
@@ -413,8 +409,7 @@ for.end: ; preds = %for.body, %entry
; CHECK: test_pos16_rr_sle
-; FIXME: Support this loop!
-; CHECK-NOT: bdnz
+; CHECK: bdnz
; a < b
define void @test_pos16_rr_sle(ptr nocapture %p, i32 %a, i32 %b) nounwind {
entry:
diff --git a/llvm/test/CodeGen/PowerPC/toc-data-no-data-sections.ll b/llvm/test/CodeGen/PowerPC/toc-data-no-data-sections.ll
new file mode 100644
index 0000000..77851fb
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/toc-data-no-data-sections.ll
@@ -0,0 +1,18 @@
+; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -data-sections=false -verify-machineinstrs < %s | FileCheck %s
+
+@a1 = global i32 0, align 4 #0
+
+define void @foo() {
+entry:
+ store i32 1, ptr @a1, align 4
+ ret void
+}
+
+attributes #0 = { "toc-data" }
+
+; CHECK: .toc
+; CHECK-NEXT: .csect a1[TD],2
+; CHECK-NEXT: .globl a1[TD]
+; CHECK-NEXT: .align 2
+; CHECK-NOT: a1[TD]:
+; CHECK-NEXT: .vbyte 4, 0
diff --git a/llvm/test/CodeGen/PowerPC/toc-data.ll b/llvm/test/CodeGen/PowerPC/toc-data.ll
index 7f7afe7..1228665 100644
--- a/llvm/test/CodeGen/PowerPC/toc-data.ll
+++ b/llvm/test/CodeGen/PowerPC/toc-data.ll
@@ -16,6 +16,10 @@
; RUN: -stop-before=ppc-vsx-copy | FileCheck %s --check-prefix CHECK32LARGE
; RUN: llc -mtriple powerpc-ibm-aix-xcoff -code-model=large -verify-machineinstrs < %s | FileCheck %s --check-prefix TEST32LARGE
+; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -code-model=large -verify-machineinstrs < %s \
+; RUN: -stop-before=ppc-vsx-copy | FileCheck %s --check-prefix CHECK64LARGE
+; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -code-model=large -verify-machineinstrs < %s | FileCheck %s --check-prefix TEST64LARGE
+
; Global variables i and f have the toc-data attribute.
; In the following functions, those writing to or reading from
; variables i and f should use the toc-data access pattern.
@@ -45,8 +49,8 @@ define dso_local void @write_int(i32 signext %in) {
; CHECK64-NOOPT: name: write_int
; CHECK64-NOOPT: %[[SUBREG:[0-9]+]]:gprc = COPY %{{[0-9]}}.sub_32
-; CHECK64-NOOPT: %[[ADDR:[0-9]+]]:g8rc_and_g8rc_nox0 = ADDItoc8 @i, $x2 :: (load (s64) from got)
-; CHECK64-NOOPT: STW %[[SUBREG]], 0, killed %[[ADDR]] :: (store (s32) into @i)
+; CHECK64-NOOPT: %[[ADDR:[0-9]+]]:g8rc_and_g8rc_nox0 = ADDItoc8 @i, $x2
+; CHECK64-NOOPT: STW %[[SUBREG]], 0, %[[ADDR]]
; TEST64: .write_int:
; TEST64: la 4, i[TD](2)
@@ -63,6 +67,17 @@ define dso_local void @write_int(i32 signext %in) {
; TEST32LARGE-NEXT: la 4, i[TD]@l(4)
; TEST32LARGE-NEXT: stw 3, 0(4)
+
+; CHECK64LARGE: name: write_int
+; CHECK64LARGE: %[[SCRATCH1:[0-9]+]]:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @i
+; CHECK64LARGE-NEXT: %[[SCRATCH2:[0-9]+]]:g8rc_and_g8rc_nox0 = ADDItocL8 killed %[[SCRATCH1]], @i
+; CHECK64LARGE-NEXT: STW8 %{{[0-9]+}}, 0, killed %[[SCRATCH2]] :: (store (s32) into @i)
+
+; TEST64LARGE: .write_int:
+; TEST64LARGE: addis 4, i[TD]@u(2)
+; TEST64LARGE-NEXT: la 4, i[TD]@l(4)
+; TEST64LARGE-NEXT: stw 3, 0(4)
+
define dso_local i64 @read_ll() {
entry:
%0 = load i64, ptr @ll, align 8
@@ -98,6 +113,15 @@ define dso_local i64 @read_ll() {
; TEST32LARGE-NEXT: lwz 3, 0(4)
; TEST32LARGE-NEXT: lwz 4, 4(4)
+; CHECK64LARGE: name: read_ll
+; CHECK64LARGE: %[[SCRATCH1:[0-9]+]]:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @ll
+; CHECK64LARGE: LDtocL @ll, killed %[[SCRATCH1]] :: (load (s64) from got)
+
+; TEST64LARGE: .read_ll:
+; TEST64LARGE: addis 3, L..C0@u(2)
+; TEST64LARGE-NEXT: ld 3, L..C0@l(3)
+; TEST64LARGE-NEXT: ld 3, 0(3)
+
define dso_local float @read_float() {
entry:
%0 = load float, ptr @f, align 4
@@ -117,7 +141,7 @@ define dso_local float @read_float() {
; CHECK64-NOOPT: name: read_float
; CHECK64-NOOPT: %[[SCRATCH:[0-9]+]]:g8rc_and_g8rc_nox0 = ADDItoc8 @f, $x2
-; CHECK64-NOOPT: %{{[0-9]+}}:f4rc = LFS 0, killed %[[SCRATCH]]
+; CHECK64-NOOPT: %{{[0-9]+}}:f4rc = LFS 0, %[[SCRATCH]]
; TEST64: .read_float:
; TEST64: la 3, f[TD](2)
@@ -134,6 +158,18 @@ define dso_local float @read_float() {
; TEST32LARGE-NEXT: la 3, f[TD]@l(3)
; TEST32LARGE-NEXT: lfs 1, 0(3)
+
+; CHECK64LARGE: name: read_float
+; CHECK64LARGE: %[[SCRATCH1:[0-9]+]]:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @f
+; CHECK64LARGE-NEXT: %[[SCRATCH2:[0-9]+]]:g8rc_and_g8rc_nox0 = ADDItocL8 killed %[[SCRATCH1]], @f
+; CHECK64LARGE-NEXT: LFS 0, killed %[[SCRATCH2]] :: (dereferenceable load (s32) from @f)
+
+
+; TEST64LARGE: .read_float:
+; TEST64LARGE: addis 3, f[TD]@u(2)
+; TEST64LARGE-NEXT: la 3, f[TD]@l(3)
+; TEST64LARGE-NEXT: lfs 1, 0(3)
+
define dso_local void @write_double(double %in) {
entry:
store double %in, ptr @d, align 8
@@ -167,6 +203,15 @@ define dso_local void @write_double(double %in) {
; TEST32LARGE-NEXT: lwz 3, L..C1@l(3)
; TEST32LARGE-NEXT: stfd 1, 0(3)
+; CHECK64LARGE: name: write_double
+; CHECK64LARGE: %[[SCRATCH1:[0-9]+]]:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @d
+; CHECK64LARGE: LDtocL @d, killed %[[SCRATCH1]] :: (load (s64) from got)
+
+; TEST64LARGE: .write_double:
+; TEST64LARGE: addis 3, L..C1@u(2)
+; TEST64LARGE-NEXT: ld 3, L..C1@l(3)
+; TEST64LARGE-NEXT: stfd 1, 0(3)
+
define dso_local nonnull ptr @addr() {
entry:
ret ptr @i
@@ -183,7 +228,7 @@ define dso_local nonnull ptr @addr() {
; CHECK64-NEXT: $x3 = COPY %[[SCRATCH]]
; CHECK64-NOOPT: name: addr
-; CHECK64-NOOPT: %[[SCRATCH:[0-9]+]]:g8rc = ADDItoc8 @i, $x2
+; CHECK64-NOOPT: %[[SCRATCH:[0-9]+]]:g8rc_and_g8rc_nox0 = ADDItoc8 @i, $x2
; CHECK64-NOOPT: $x3 = COPY %[[SCRATCH]]
; TEST64: .addr
@@ -237,4 +282,26 @@ define dso_local nonnull ptr @addr() {
; TEST32LARGE-NEXT: .globl f[TD]
; TEST32LARGE-NOT: .tc f[TE],f[RW]
+; CHECK64LARGE: name: addr
+; CHECK64LARGE: %[[SCRATCH1:[0-9]+]]:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @i
+; CHECK64LARGE-NEXT: %[[SCRATCH2:[0-9]+]]:g8rc = ADDItocL8 killed %[[SCRATCH1]], @i
+; CHECK64LARGE-NEXT: $x3 = COPY %[[SCRATCH2]]
+
+; TEST64LARGE: .addr:
+; TEST64LARGE: addis 3, i[TD]@u(2)
+; TEST64LARGE: la 3, i[TD]@l(3)
+
+; TEST64LARGE: .toc
+; TEST64LARGE: .tc ll[TE],ll[RW]
+; TEST64LARGE-NOT: .csect ll[TD]
+; TEST64LARGE: .tc d[TE],d[RW]
+; TEST64LARGE-NOT: .csect d[TD],2
+; TEST64LARGE: .csect i[TD],2
+; TEST64LARGE-NEXT: .globl i[TD]
+; TEST64LARGE-NEXT: .align 2
+; TEST64LARGE-NOT: .tc i[TE],i[RW]
+; TEST64LARGE: .csect f[TD],2
+; TEST64LARGE-NEXT: .globl f[TD]
+; TEST64LARGE-NOT: .tc f[TE],f[RW]
+
attributes #0 = { "toc-data" }
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv32.ll b/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv32.ll
new file mode 100644
index 0000000..70d1b25
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv32.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -global-isel -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefixes=RV32
+
+define i16 @constant_fold_barrier_i16(i16 %x, i16 %y) {
+; RV32-LABEL: constant_fold_barrier_i16:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: li a1, 1
+; RV32-NEXT: slli a1, a1, 11
+; RV32-NEXT: and a0, a0, a1
+; RV32-NEXT: addi a1, a1, 289
+; RV32-NEXT: or a0, a0, a1
+; RV32-NEXT: ret
+entry:
+ %and = and i16 %x, 2048
+ %or = or i16 %and, 2337
+ ret i16 %or
+}
+
+define void @constant_fold_barrier_i128(ptr %p) {
+; RV32-LABEL: constant_fold_barrier_i128:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: li a1, 1
+; RV32-NEXT: slli a1, a1, 11
+; RV32-NEXT: lw a2, 0(a0)
+; RV32-NEXT: lw a3, 4(a0)
+; RV32-NEXT: lw a4, 8(a0)
+; RV32-NEXT: lw a5, 12(a0)
+; RV32-NEXT: and a2, a2, a1
+; RV32-NEXT: and a3, a3, zero
+; RV32-NEXT: and a4, a4, zero
+; RV32-NEXT: and a5, a5, zero
+; RV32-NEXT: add a2, a2, a1
+; RV32-NEXT: sltu a1, a2, a1
+; RV32-NEXT: add a6, a3, zero
+; RV32-NEXT: sltu a3, a6, a3
+; RV32-NEXT: add a6, a6, a1
+; RV32-NEXT: seqz a7, a6
+; RV32-NEXT: and a1, a7, a1
+; RV32-NEXT: or a1, a3, a1
+; RV32-NEXT: add a3, a4, zero
+; RV32-NEXT: sltu a4, a3, a4
+; RV32-NEXT: add a3, a3, a1
+; RV32-NEXT: seqz a7, a3
+; RV32-NEXT: and a1, a7, a1
+; RV32-NEXT: or a1, a4, a1
+; RV32-NEXT: add a5, a5, zero
+; RV32-NEXT: add a1, a5, a1
+; RV32-NEXT: sw a2, 0(a0)
+; RV32-NEXT: sw a6, 4(a0)
+; RV32-NEXT: sw a3, 8(a0)
+; RV32-NEXT: sw a1, 12(a0)
+; RV32-NEXT: ret
+entry:
+ %x = load i128, ptr %p
+ %and = and i128 %x, 2048
+ %add = add i128 %and, 2048
+ store i128 %add, ptr %p
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv64.ll
new file mode 100644
index 0000000..21d7b1d
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv64.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv64 -global-isel -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefixes=RV64
+
+define i16 @constant_fold_barrier_i16(i16 %x, i16 %y) {
+; RV64-LABEL: constant_fold_barrier_i16:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: li a1, 1
+; RV64-NEXT: slli a1, a1, 11
+; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: addiw a1, a1, 289
+; RV64-NEXT: or a0, a0, a1
+; RV64-NEXT: ret
+entry:
+ %and = and i16 %x, 2048
+ %or = or i16 %and, 2337
+ ret i16 %or
+}
+
+define i128 @constant_fold_barrier_i128(i128 %x) {
+; RV64-LABEL: constant_fold_barrier_i128:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: li a2, 1
+; RV64-NEXT: slli a2, a2, 11
+; RV64-NEXT: and a0, a0, a2
+; RV64-NEXT: and a1, a1, zero
+; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: sltu a2, a0, a2
+; RV64-NEXT: add a1, a1, zero
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: ret
+entry:
+ %and = and i128 %x, 2048
+ %add = add i128 %and, 2048
+ ret i128 %add
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-constbarrier-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-constbarrier-rv32.mir
index 6b1fc20..bbe8ef4 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-constbarrier-rv32.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-constbarrier-rv32.mir
@@ -16,6 +16,38 @@ body: |
...
---
+name: constbarrier_i16
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: constbarrier_i16
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2048
+ ; CHECK-NEXT: [[CONSTANT_FOLD_BARRIER:%[0-9]+]]:_(s32) = G_CONSTANT_FOLD_BARRIER [[C]]
+ ; CHECK-NEXT: $x10 = COPY [[CONSTANT_FOLD_BARRIER]](s32)
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %1:_(s16) = G_CONSTANT i16 2048
+ %2:_(s16) = G_CONSTANT_FOLD_BARRIER %1
+ %3:_(s32) = G_ANYEXT %2(s16)
+ $x10 = COPY %3(s32)
+ PseudoRET implicit $x10
+
+...
+---
+name: constbarrier_i128
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: constbarrier_i128
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2048
+ ; CHECK-NEXT: [[CONSTANT_FOLD_BARRIER:%[0-9]+]]:_(s32) = G_CONSTANT_FOLD_BARRIER [[C]]
+ ; CHECK-NEXT: $x10 = COPY [[CONSTANT_FOLD_BARRIER]](s32)
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %1:_(s128) = G_CONSTANT i128 2048
+ %2:_(s128) = G_CONSTANT_FOLD_BARRIER %1
+ %3:_(s32) = G_TRUNC %2(s128)
+ $x10 = COPY %3(s32)
+ PseudoRET implicit $x10
+
+...
+---
name: constbarrier_nxv2i1
body: |
bb.0.entry:
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-constbarrier-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-constbarrier-rv64.mir
index de6a82b..96b1aa5 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-constbarrier-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-constbarrier-rv64.mir
@@ -33,6 +33,39 @@ body: |
...
---
+name: constbarrier_i16
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: constbarrier_i16
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2048
+ ; CHECK-NEXT: [[CONSTANT_FOLD_BARRIER:%[0-9]+]]:_(s32) = G_CONSTANT_FOLD_BARRIER [[C]]
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[CONSTANT_FOLD_BARRIER]](s32)
+ ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %1:_(s16) = G_CONSTANT i16 2048
+ %2:_(s16) = G_CONSTANT_FOLD_BARRIER %1
+ %3:_(s64) = G_ANYEXT %2(s16)
+ $x10 = COPY %3(s64)
+ PseudoRET implicit $x10
+
+...
+---
+name: constbarrier_i128
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: constbarrier_i128
+ ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2048
+ ; CHECK-NEXT: [[CONSTANT_FOLD_BARRIER:%[0-9]+]]:_(s64) = G_CONSTANT_FOLD_BARRIER [[C]]
+ ; CHECK-NEXT: $x10 = COPY [[CONSTANT_FOLD_BARRIER]](s64)
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %1:_(s128) = G_CONSTANT i128 2048
+ %2:_(s128) = G_CONSTANT_FOLD_BARRIER %1
+ %3:_(s64) = G_TRUNC %2(s128)
+ $x10 = COPY %3(s64)
+ PseudoRET implicit $x10
+
+...
+---
name: constbarrier_nxv2i1
body: |
bb.0.entry:
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-div-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-div-rv32.mir
index 4177a40..26d8785 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-div-rv32.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-div-rv32.mir
@@ -555,3 +555,93 @@ body: |
PseudoRET implicit $x10, implicit $x11
...
+---
+name: udivrem_i32
+body: |
+ bb.1.entry:
+ liveins: $x10, $x11
+
+ ; CHECK-I-LABEL: name: udivrem_i32
+ ; CHECK-I: liveins: $x10, $x11
+ ; CHECK-I-NEXT: {{ $}}
+ ; CHECK-I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-I-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-I-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-I-NEXT: $x10 = COPY [[COPY]](s32)
+ ; CHECK-I-NEXT: $x11 = COPY [[COPY1]](s32)
+ ; CHECK-I-NEXT: PseudoCALL target-flags(riscv-call) &__udivsi3, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; CHECK-I-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-I-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-I-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-I-NEXT: $x10 = COPY [[COPY]](s32)
+ ; CHECK-I-NEXT: $x11 = COPY [[COPY1]](s32)
+ ; CHECK-I-NEXT: PseudoCALL target-flags(riscv-call) &__umodsi3, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; CHECK-I-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-I-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-I-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY2]], [[COPY3]]
+ ; CHECK-I-NEXT: $x10 = COPY [[ADD]](s32)
+ ; CHECK-I-NEXT: PseudoRET implicit $x10
+ ;
+ ; CHECK-M-LABEL: name: udivrem_i32
+ ; CHECK-M: liveins: $x10, $x11
+ ; CHECK-M-NEXT: {{ $}}
+ ; CHECK-M-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-M-NEXT: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[COPY]], [[COPY1]]
+ ; CHECK-M-NEXT: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[COPY]], [[COPY1]]
+ ; CHECK-M-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[UDIV]], [[UREM]]
+ ; CHECK-M-NEXT: $x10 = COPY [[ADD]](s32)
+ ; CHECK-M-NEXT: PseudoRET implicit $x10
+ %0:_(s32) = COPY $x10
+ %1:_(s32) = COPY $x11
+ %2:_(s32), %3:_(s32) = G_UDIVREM %0, %1
+ %4:_(s32) = G_ADD %2, %3
+ $x10 = COPY %4(s32)
+ PseudoRET implicit $x10
+
+...
+---
+name: sdivrem_i32
+body: |
+ bb.1.entry:
+ liveins: $x10, $x11
+
+ ; CHECK-I-LABEL: name: sdivrem_i32
+ ; CHECK-I: liveins: $x10, $x11
+ ; CHECK-I-NEXT: {{ $}}
+ ; CHECK-I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-I-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-I-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-I-NEXT: $x10 = COPY [[COPY]](s32)
+ ; CHECK-I-NEXT: $x11 = COPY [[COPY1]](s32)
+ ; CHECK-I-NEXT: PseudoCALL target-flags(riscv-call) &__divsi3, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; CHECK-I-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-I-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-I-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-I-NEXT: $x10 = COPY [[COPY]](s32)
+ ; CHECK-I-NEXT: $x11 = COPY [[COPY1]](s32)
+ ; CHECK-I-NEXT: PseudoCALL target-flags(riscv-call) &__modsi3, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; CHECK-I-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-I-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-I-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY2]], [[COPY3]]
+ ; CHECK-I-NEXT: $x10 = COPY [[ADD]](s32)
+ ; CHECK-I-NEXT: PseudoRET implicit $x10
+ ;
+ ; CHECK-M-LABEL: name: sdivrem_i32
+ ; CHECK-M: liveins: $x10, $x11
+ ; CHECK-M-NEXT: {{ $}}
+ ; CHECK-M-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-M-NEXT: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[COPY]], [[COPY1]]
+ ; CHECK-M-NEXT: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[COPY]], [[COPY1]]
+ ; CHECK-M-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SDIV]], [[SREM]]
+ ; CHECK-M-NEXT: $x10 = COPY [[ADD]](s32)
+ ; CHECK-M-NEXT: PseudoRET implicit $x10
+ %0:_(s32) = COPY $x10
+ %1:_(s32) = COPY $x11
+ %2:_(s32), %3:_(s32) = G_SDIVREM %0, %1
+ %4:_(s32) = G_ADD %2, %3
+ $x10 = COPY %4(s32)
+ PseudoRET implicit $x10
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-div-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-div-rv64.mir
index 492f953..bbbe38f 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-div-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-div-rv64.mir
@@ -655,3 +655,93 @@ body: |
PseudoRET implicit $x10, implicit $x11
...
+---
+name: udivrem_i64
+body: |
+ bb.1.entry:
+ liveins: $x10, $x11
+
+ ; CHECK-I-LABEL: name: udivrem_i64
+ ; CHECK-I: liveins: $x10, $x11
+ ; CHECK-I-NEXT: {{ $}}
+ ; CHECK-I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-I-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; CHECK-I-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-I-NEXT: $x10 = COPY [[COPY]](s64)
+ ; CHECK-I-NEXT: $x11 = COPY [[COPY1]](s64)
+ ; CHECK-I-NEXT: PseudoCALL target-flags(riscv-call) &__udivdi3, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; CHECK-I-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-I-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-I-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-I-NEXT: $x10 = COPY [[COPY]](s64)
+ ; CHECK-I-NEXT: $x11 = COPY [[COPY1]](s64)
+ ; CHECK-I-NEXT: PseudoCALL target-flags(riscv-call) &__umoddi3, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; CHECK-I-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-I-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY2]], [[COPY3]]
+ ; CHECK-I-NEXT: $x10 = COPY [[ADD]](s64)
+ ; CHECK-I-NEXT: PseudoRET implicit $x10
+ ;
+ ; CHECK-M-LABEL: name: udivrem_i64
+ ; CHECK-M: liveins: $x10, $x11
+ ; CHECK-M-NEXT: {{ $}}
+ ; CHECK-M-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; CHECK-M-NEXT: [[UDIV:%[0-9]+]]:_(s64) = G_UDIV [[COPY]], [[COPY1]]
+ ; CHECK-M-NEXT: [[UREM:%[0-9]+]]:_(s64) = G_UREM [[COPY]], [[COPY1]]
+ ; CHECK-M-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[UDIV]], [[UREM]]
+ ; CHECK-M-NEXT: $x10 = COPY [[ADD]](s64)
+ ; CHECK-M-NEXT: PseudoRET implicit $x10
+ %0:_(s64) = COPY $x10
+ %1:_(s64) = COPY $x11
+ %2:_(s64), %3:_(s64) = G_UDIVREM %0, %1
+ %4:_(s64) = G_ADD %2, %3
+ $x10 = COPY %4(s64)
+ PseudoRET implicit $x10
+
+...
+---
+name: sdivrem_i64
+body: |
+ bb.1.entry:
+ liveins: $x10, $x11
+
+ ; CHECK-I-LABEL: name: sdivrem_i64
+ ; CHECK-I: liveins: $x10, $x11
+ ; CHECK-I-NEXT: {{ $}}
+ ; CHECK-I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-I-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; CHECK-I-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-I-NEXT: $x10 = COPY [[COPY]](s64)
+ ; CHECK-I-NEXT: $x11 = COPY [[COPY1]](s64)
+ ; CHECK-I-NEXT: PseudoCALL target-flags(riscv-call) &__divdi3, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; CHECK-I-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-I-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-I-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-I-NEXT: $x10 = COPY [[COPY]](s64)
+ ; CHECK-I-NEXT: $x11 = COPY [[COPY1]](s64)
+ ; CHECK-I-NEXT: PseudoCALL target-flags(riscv-call) &__moddi3, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; CHECK-I-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-I-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY2]], [[COPY3]]
+ ; CHECK-I-NEXT: $x10 = COPY [[ADD]](s64)
+ ; CHECK-I-NEXT: PseudoRET implicit $x10
+ ;
+ ; CHECK-M-LABEL: name: sdivrem_i64
+ ; CHECK-M: liveins: $x10, $x11
+ ; CHECK-M-NEXT: {{ $}}
+ ; CHECK-M-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; CHECK-M-NEXT: [[SDIV:%[0-9]+]]:_(s64) = G_SDIV [[COPY]], [[COPY1]]
+ ; CHECK-M-NEXT: [[SREM:%[0-9]+]]:_(s64) = G_SREM [[COPY]], [[COPY1]]
+ ; CHECK-M-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[SDIV]], [[SREM]]
+ ; CHECK-M-NEXT: $x10 = COPY [[ADD]](s64)
+ ; CHECK-M-NEXT: PseudoRET implicit $x10
+ %0:_(s64) = COPY $x10
+ %1:_(s64) = COPY $x11
+ %2:_(s64), %3:_(s64) = G_SDIVREM %0, %1
+ %4:_(s64) = G_ADD %2, %3
+ $x10 = COPY %4(s64)
+ PseudoRET implicit $x10
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-frem-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-frem-rv32.mir
new file mode 100644
index 0000000..adf3f45
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-frem-rv32.mir
@@ -0,0 +1,130 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=riscv32 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck %s
+---
+name: frem_f32
+body: |
+ bb.1.entry:
+ liveins: $x10, $x11
+
+ ; CHECK-LABEL: name: frem_f32
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: $x10 = COPY [[COPY]](s32)
+ ; CHECK-NEXT: $x11 = COPY [[COPY1]](s32)
+ ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &fmodf, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: $x10 = COPY [[COPY2]](s32)
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %0:_(s32) = COPY $x10
+ %1:_(s32) = COPY $x11
+ %2:_(s32) = G_FREM %0, %1
+ $x10 = COPY %2(s32)
+ PseudoRET implicit $x10
+
+...
+---
+name: frem_f64
+body: |
+ bb.1.entry:
+ liveins: $x10, $x11, $x12, $x13
+
+ ; CHECK-LABEL: name: frem_f64
+ ; CHECK: liveins: $x10, $x11, $x12, $x13
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $x12
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $x13
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: $x10 = COPY [[COPY]](s32)
+ ; CHECK-NEXT: $x11 = COPY [[COPY1]](s32)
+ ; CHECK-NEXT: $x12 = COPY [[COPY2]](s32)
+ ; CHECK-NEXT: $x13 = COPY [[COPY3]](s32)
+ ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &fmod, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit-def $x10, implicit-def $x11
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-NEXT: $x10 = COPY [[COPY4]](s32)
+ ; CHECK-NEXT: $x11 = COPY [[COPY5]](s32)
+ ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11
+ %2:_(s32) = COPY $x10
+ %3:_(s32) = COPY $x11
+ %0:_(s64) = G_MERGE_VALUES %2(s32), %3(s32)
+ %4:_(s32) = COPY $x12
+ %5:_(s32) = COPY $x13
+ %1:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ %6:_(s64) = G_FREM %0, %1
+ %7:_(s32), %8:_(s32) = G_UNMERGE_VALUES %6(s64)
+ $x10 = COPY %7(s32)
+ $x11 = COPY %8(s32)
+ PseudoRET implicit $x10, implicit $x11
+
+...
+---
+name: frem_f16
+body: |
+ bb.0.entry:
+
+ ; CHECK-LABEL: name: frem_f16
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; CHECK-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
+ ; CHECK-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: $x10 = COPY [[FPEXT]](s32)
+ ; CHECK-NEXT: $x11 = COPY [[FPEXT1]](s32)
+ ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &fmodf, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[COPY2]](s32)
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16)
+ ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s32)
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %0:_(s32) = COPY $x10
+ %1:_(s32) = COPY $x11
+ %2:_(s16) = G_TRUNC %0(s32)
+ %3:_(s16) = G_TRUNC %1(s32)
+ %4:_(s16) = G_FREM %2, %3
+ %5:_(s32) = G_ANYEXT %4(s16)
+ $x10 = COPY %5(s32)
+ PseudoRET implicit $x10
+
+...
+---
+name: frem_v2f32
+body: |
+ bb.0.entry:
+
+ ; CHECK-LABEL: name: frem_v2f32
+ ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $v8
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $v9
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
+ ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: $x10 = COPY [[UV]](s32)
+ ; CHECK-NEXT: $x11 = COPY [[UV2]](s32)
+ ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &fmodf, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: $x10 = COPY [[UV1]](s32)
+ ; CHECK-NEXT: $x11 = COPY [[UV3]](s32)
+ ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &fmodf, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[BUILD_VECTOR]](<2 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<2 x s32>) = COPY $v8
+ %1:_(<2 x s32>) = COPY $v9
+ %2:_(<2 x s32>) = G_FREM %0, %1
+ $v8 = COPY %2(<2 x s32>)
+ PseudoRET implicit $v8
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-frem-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-frem-rv64.mir
new file mode 100644
index 0000000..5db66bb
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-frem-rv64.mir
@@ -0,0 +1,130 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=riscv64 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck %s
+---
+name: frem_f32
+body: |
+ bb.1.entry:
+ liveins: $x10, $x11
+
+ ; CHECK-LABEL: name: frem_f32
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: $x10 = COPY [[COPY]](s64)
+ ; CHECK-NEXT: $x11 = COPY [[COPY1]](s64)
+ ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &fmodf, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: $x10 = COPY [[COPY2]](s64)
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %2:_(s64) = COPY $x10
+ %0:_(s32) = G_TRUNC %2(s64)
+ %3:_(s64) = COPY $x11
+ %1:_(s32) = G_TRUNC %3(s64)
+ %4:_(s32) = G_FREM %0, %1
+ %5:_(s64) = G_ANYEXT %4(s32)
+ $x10 = COPY %5(s64)
+ PseudoRET implicit $x10
+
+...
+---
+name: frem_f64
+body: |
+ bb.1.entry:
+ liveins: $x10, $x11
+
+ ; CHECK-LABEL: name: frem_f64
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: $x10 = COPY [[COPY]](s64)
+ ; CHECK-NEXT: $x11 = COPY [[COPY1]](s64)
+ ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &fmod, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: $x10 = COPY [[COPY2]](s64)
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %0:_(s64) = COPY $x10
+ %1:_(s64) = COPY $x11
+ %2:_(s64) = G_FREM %0, %1
+ $x10 = COPY %2(s64)
+ PseudoRET implicit $x10
+
+...
+---
+name: frem_f16
+body: |
+ bb.0.entry:
+
+ ; CHECK-LABEL: name: frem_f16
+ ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s64)
+ ; CHECK-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
+ ; CHECK-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[FPEXT]](s32)
+ ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[FPEXT1]](s32)
+ ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
+ ; CHECK-NEXT: $x11 = COPY [[ANYEXT1]](s64)
+ ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &fmodf, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s32) = G_TRUNC [[COPY2]](s64)
+ ; CHECK-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[TRUNC2]](s32)
+ ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[FPTRUNC]](s16)
+ ; CHECK-NEXT: $x10 = COPY [[ANYEXT2]](s64)
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %0:_(s64) = COPY $x10
+ %1:_(s64) = COPY $x11
+ %2:_(s16) = G_TRUNC %0(s64)
+ %3:_(s16) = G_TRUNC %1(s64)
+ %4:_(s16) = G_FREM %2, %3
+ %5:_(s64) = G_ANYEXT %4(s16)
+ $x10 = COPY %5(s64)
+ PseudoRET implicit $x10
+
+...
+---
+name: frem_v2f32
+body: |
+ bb.0.entry:
+
+ ; CHECK-LABEL: name: frem_v2f32
+ ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $v8
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $v9
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
+ ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UV]](s32)
+ ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[UV2]](s32)
+ ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
+ ; CHECK-NEXT: $x11 = COPY [[ANYEXT1]](s64)
+ ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &fmodf, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY2]](s64)
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[UV1]](s32)
+ ; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[UV3]](s32)
+ ; CHECK-NEXT: $x10 = COPY [[ANYEXT2]](s64)
+ ; CHECK-NEXT: $x11 = COPY [[ANYEXT3]](s64)
+ ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &fmodf, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY3]](s64)
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[TRUNC]](s32), [[TRUNC1]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[BUILD_VECTOR]](<2 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<2 x s32>) = COPY $v8
+ %1:_(<2 x s32>) = COPY $v9
+ %2:_(<2 x s32>) = G_FREM %0, %1
+ $v8 = COPY %2(<2 x s32>)
+ PseudoRET implicit $v8
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-lshr-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-lshr-rv64.mir
index 8cbae0f..4331811 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-lshr-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-lshr-rv64.mir
@@ -336,3 +336,29 @@ body: |
PseudoRET implicit $x10
...
+---
+name: lshr_i32_i48
+body: |
+ bb.1:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: lshr_i32_i48
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[TRUNC]], [[C]](s64)
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LSHR]](s32)
+ ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %1:_(s64) = COPY $x10
+ %0:_(s48) = G_TRUNC %1(s64)
+ %2:_(s48) = G_CONSTANT i48 16
+ %6:_(s32) = G_TRUNC %0(s48)
+ %7:_(s32) = G_LSHR %6, %2(s48)
+ %5:_(s64) = G_ANYEXT %7(s32)
+ $x10 = COPY %5(s64)
+ PseudoRET implicit $x10
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv32.mir
new file mode 100644
index 0000000..08aa92e
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv32.mir
@@ -0,0 +1,404 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=riscv32 -run-pass=legalizer %s -o - \
+# RUN: | FileCheck %s --check-prefixes=CHECK,RV32I
+# RUN: llc -mtriple=riscv32 -mattr=+zbb -run-pass=legalizer %s -o - \
+# RUN: | FileCheck %s --check-prefixes=CHECK,RV32ZBB
+
+---
+name: uaddsat_i32
+body: |
+ bb.1:
+ liveins: $x10, $x11
+
+ ; RV32I-LABEL: name: uaddsat_i32
+ ; RV32I: liveins: $x10, $x11
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; RV32I-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]]
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD]](s32), [[COPY1]]
+ ; RV32I-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[ADD]](s32)
+ ; RV32I-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32I-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s32), [[C]], [[COPY2]]
+ ; RV32I-NEXT: $x10 = COPY [[SELECT]](s32)
+ ; RV32I-NEXT: PseudoRET implicit $x10
+ ;
+ ; RV32ZBB-LABEL: name: uaddsat_i32
+ ; RV32ZBB: liveins: $x10, $x11
+ ; RV32ZBB-NEXT: {{ $}}
+ ; RV32ZBB-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32ZBB-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; RV32ZBB-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32ZBB-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY]], [[C]]
+ ; RV32ZBB-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[XOR]], [[COPY1]]
+ ; RV32ZBB-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[UMIN]]
+ ; RV32ZBB-NEXT: $x10 = COPY [[ADD]](s32)
+ ; RV32ZBB-NEXT: PseudoRET implicit $x10
+ %0:_(s32) = COPY $x10
+ %1:_(s32) = COPY $x11
+ %2:_(s32) = G_UADDSAT %0, %1(s32)
+ $x10 = COPY %2(s32)
+ PseudoRET implicit $x10
+
+...
+---
+name: uaddsat_i64
+body: |
+ bb.1:
+ liveins: $x10, $x11
+ ; CHECK-LABEL: name: uaddsat_i64
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $x12
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $x13
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY2]]
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD]](s32), [[COPY2]]
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[ADD]](s32)
+ ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY1]], [[COPY3]]
+ ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[ICMP]]
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[ADD2]](s32)
+ ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY5]](s32), [[COPY3]]
+ ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY5]](s32), [[COPY3]]
+ ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY4]](s32), [[COPY2]]
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP3]], [[ICMP1]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[SELECT]](s32), [[C]], [[COPY4]]
+ ; CHECK-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[SELECT]](s32), [[C1]], [[COPY5]]
+ ; CHECK-NEXT: $x10 = COPY [[SELECT1]](s32)
+ ; CHECK-NEXT: $x11 = COPY [[SELECT2]](s32)
+ ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11
+ %2:_(s32) = COPY $x10
+ %3:_(s32) = COPY $x11
+ %0:_(s64) = G_MERGE_VALUES %2(s32), %3(s32)
+ %4:_(s32) = COPY $x12
+ %5:_(s32) = COPY $x13
+ %1:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ %6:_(s64) = G_UADDSAT %0, %1(s64)
+ %7:_(s32), %8:_(s32) = G_UNMERGE_VALUES %6(s64)
+ $x10 = COPY %7(s32)
+ $x11 = COPY %8(s32)
+ PseudoRET implicit $x10, implicit $x11
+
+...
+---
+name: saddsat_i32
+body: |
+ bb.1:
+ liveins: $x10, $x11
+
+ ; RV32I-LABEL: name: saddsat_i32
+ ; RV32I: liveins: $x10, $x11
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; RV32I-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]]
+ ; RV32I-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[ADD]](s32), [[COPY]]
+ ; RV32I-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[COPY1]](s32), [[C]]
+ ; RV32I-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[ICMP1]], [[ICMP]]
+ ; RV32I-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[ADD]](s32)
+ ; RV32I-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; RV32I-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[COPY2]], [[C1]](s32)
+ ; RV32I-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648
+ ; RV32I-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ASHR]], [[C2]]
+ ; RV32I-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[XOR]](s32), [[ADD1]], [[COPY2]]
+ ; RV32I-NEXT: $x10 = COPY [[SELECT]](s32)
+ ; RV32I-NEXT: PseudoRET implicit $x10
+ ;
+ ; RV32ZBB-LABEL: name: saddsat_i32
+ ; RV32ZBB: liveins: $x10, $x11
+ ; RV32ZBB-NEXT: {{ $}}
+ ; RV32ZBB-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32ZBB-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; RV32ZBB-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647
+ ; RV32ZBB-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648
+ ; RV32ZBB-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32ZBB-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[COPY]], [[C2]]
+ ; RV32ZBB-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[SMAX]]
+ ; RV32ZBB-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[COPY]], [[C2]]
+ ; RV32ZBB-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[SMIN]]
+ ; RV32ZBB-NEXT: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB1]], [[COPY1]]
+ ; RV32ZBB-NEXT: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[SUB]]
+ ; RV32ZBB-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[SMIN1]]
+ ; RV32ZBB-NEXT: $x10 = COPY [[ADD]](s32)
+ ; RV32ZBB-NEXT: PseudoRET implicit $x10
+ %0:_(s32) = COPY $x10
+ %1:_(s32) = COPY $x11
+ %2:_(s32) = G_SADDSAT %0, %1(s32)
+ $x10 = COPY %2(s32)
+ PseudoRET implicit $x10
+
+...
+---
+name: saddsat_i64
+body: |
+ bb.1:
+ liveins: $x10, $x11
+ ; CHECK-LABEL: name: saddsat_i64
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $x12
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $x13
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY2]]
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD]](s32), [[COPY2]]
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[ADD]](s32)
+ ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY1]], [[COPY3]]
+ ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[ICMP]]
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[ADD2]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[COPY5]](s32), [[COPY1]]
+ ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY5]](s32), [[COPY1]]
+ ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY4]](s32), [[COPY]]
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP3]], [[ICMP1]]
+ ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[COPY3]](s32), [[C1]]
+ ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY3]](s32), [[C1]]
+ ; CHECK-NEXT: [[ICMP6:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY2]](s32), [[C]]
+ ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s32), [[ICMP6]], [[ICMP4]]
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[SELECT1]], [[SELECT]]
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[COPY5]], [[C2]](s32)
+ ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[COPY5]], [[C3]](s32)
+ ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648
+ ; CHECK-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[ASHR]], [[C4]]
+ ; CHECK-NEXT: [[ICMP7:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD3]](s32), [[C4]]
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[ADD3]](s32)
+ ; CHECK-NEXT: [[ADD4:%[0-9]+]]:_(s32) = G_ADD [[ASHR1]], [[C5]]
+ ; CHECK-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[ADD4]], [[ICMP7]]
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY [[ADD5]](s32)
+ ; CHECK-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[XOR]](s32), [[COPY6]], [[COPY4]]
+ ; CHECK-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[XOR]](s32), [[COPY7]], [[COPY5]]
+ ; CHECK-NEXT: $x10 = COPY [[SELECT2]](s32)
+ ; CHECK-NEXT: $x11 = COPY [[SELECT3]](s32)
+ ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11
+ %2:_(s32) = COPY $x10
+ %3:_(s32) = COPY $x11
+ %0:_(s64) = G_MERGE_VALUES %2(s32), %3(s32)
+ %4:_(s32) = COPY $x12
+ %5:_(s32) = COPY $x13
+ %1:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ %6:_(s64) = G_SADDSAT %0, %1(s64)
+ %7:_(s32), %8:_(s32) = G_UNMERGE_VALUES %6(s64)
+ $x10 = COPY %7(s32)
+ $x11 = COPY %8(s32)
+ PseudoRET implicit $x10, implicit $x11
+
+...
+---
+name: usubsat_i32
+body: |
+ bb.1:
+ liveins: $x10, $x11
+
+ ; RV32I-LABEL: name: usubsat_i32
+ ; RV32I: liveins: $x10, $x11
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; RV32I-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[COPY1]]
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s32), [[COPY1]]
+ ; RV32I-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32I-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s32), [[C]], [[SUB]]
+ ; RV32I-NEXT: $x10 = COPY [[SELECT]](s32)
+ ; RV32I-NEXT: PseudoRET implicit $x10
+ ;
+ ; RV32ZBB-LABEL: name: usubsat_i32
+ ; RV32ZBB: liveins: $x10, $x11
+ ; RV32ZBB-NEXT: {{ $}}
+ ; RV32ZBB-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32ZBB-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; RV32ZBB-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[COPY]], [[COPY1]]
+ ; RV32ZBB-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[UMIN]]
+ ; RV32ZBB-NEXT: $x10 = COPY [[SUB]](s32)
+ ; RV32ZBB-NEXT: PseudoRET implicit $x10
+ %0:_(s32) = COPY $x10
+ %1:_(s32) = COPY $x11
+ %2:_(s32) = G_USUBSAT %0, %1(s32)
+ $x10 = COPY %2(s32)
+ PseudoRET implicit $x10
+
+...
+---
+name: usubsat_i64
+body: |
+ bb.1:
+ liveins: $x10, $x11
+ ; CHECK-LABEL: name: usubsat_i64
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $x12
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $x13
+ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[COPY2]]
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s32), [[COPY2]]
+ ; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[COPY1]], [[COPY3]]
+ ; CHECK-NEXT: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SUB1]], [[ICMP]]
+ ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY1]](s32), [[COPY3]]
+ ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]]
+ ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s32), [[COPY2]]
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP3]], [[ICMP1]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[SELECT]](s32), [[C]], [[SUB]]
+ ; CHECK-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[SELECT]](s32), [[C1]], [[SUB2]]
+ ; CHECK-NEXT: $x10 = COPY [[SELECT1]](s32)
+ ; CHECK-NEXT: $x11 = COPY [[SELECT2]](s32)
+ ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11
+ %2:_(s32) = COPY $x10
+ %3:_(s32) = COPY $x11
+ %0:_(s64) = G_MERGE_VALUES %2(s32), %3(s32)
+ %4:_(s32) = COPY $x12
+ %5:_(s32) = COPY $x13
+ %1:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ %6:_(s64) = G_USUBSAT %0, %1(s64)
+ %7:_(s32), %8:_(s32) = G_UNMERGE_VALUES %6(s64)
+ $x10 = COPY %7(s32)
+ $x11 = COPY %8(s32)
+ PseudoRET implicit $x10, implicit $x11
+
+...
+---
+name: ssubsat_i32
+body: |
+ bb.1:
+ liveins: $x10, $x11
+
+ ; RV32I-LABEL: name: ssubsat_i32
+ ; RV32I: liveins: $x10, $x11
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; RV32I-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[COPY1]]
+ ; RV32I-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[SUB]](s32), [[COPY]]
+ ; RV32I-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[COPY1]](s32), [[C]]
+ ; RV32I-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[ICMP1]], [[ICMP]]
+ ; RV32I-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SUB]](s32)
+ ; RV32I-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; RV32I-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[COPY2]], [[C1]](s32)
+ ; RV32I-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648
+ ; RV32I-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[ASHR]], [[C2]]
+ ; RV32I-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[XOR]](s32), [[ADD]], [[COPY2]]
+ ; RV32I-NEXT: $x10 = COPY [[SELECT]](s32)
+ ; RV32I-NEXT: PseudoRET implicit $x10
+ ;
+ ; RV32ZBB-LABEL: name: ssubsat_i32
+ ; RV32ZBB: liveins: $x10, $x11
+ ; RV32ZBB-NEXT: {{ $}}
+ ; RV32ZBB-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32ZBB-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; RV32ZBB-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647
+ ; RV32ZBB-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648
+ ; RV32ZBB-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32ZBB-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[COPY]], [[C2]]
+ ; RV32ZBB-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[SMAX]], [[C]]
+ ; RV32ZBB-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[COPY]], [[C2]]
+ ; RV32ZBB-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[SMIN]], [[C1]]
+ ; RV32ZBB-NEXT: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB]], [[COPY1]]
+ ; RV32ZBB-NEXT: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[SUB1]]
+ ; RV32ZBB-NEXT: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[SMIN1]]
+ ; RV32ZBB-NEXT: $x10 = COPY [[SUB2]](s32)
+ ; RV32ZBB-NEXT: PseudoRET implicit $x10
+ %0:_(s32) = COPY $x10
+ %1:_(s32) = COPY $x11
+ %2:_(s32) = G_SSUBSAT %0, %1(s32)
+ $x10 = COPY %2(s32)
+ PseudoRET implicit $x10
+
+...
+---
+name: ssubsat_i64
+body: |
+ bb.1:
+ liveins: $x10, $x11
+ ; CHECK-LABEL: name: ssubsat_i64
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $x12
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $x13
+ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[COPY2]]
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s32), [[COPY2]]
+ ; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[COPY1]], [[COPY3]]
+ ; CHECK-NEXT: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SUB1]], [[ICMP]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[SUB2]](s32), [[COPY1]]
+ ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[SUB2]](s32), [[COPY1]]
+ ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[SUB]](s32), [[COPY]]
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP3]], [[ICMP1]]
+ ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[COPY3]](s32), [[C1]]
+ ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY3]](s32), [[C1]]
+ ; CHECK-NEXT: [[ICMP6:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[COPY2]](s32), [[C]]
+ ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s32), [[ICMP6]], [[ICMP4]]
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[SELECT1]], [[SELECT]]
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SUB2]], [[C2]](s32)
+ ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[SUB2]], [[C3]](s32)
+ ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[ASHR]], [[C4]]
+ ; CHECK-NEXT: [[ICMP7:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD]](s32), [[C4]]
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[ADD]](s32)
+ ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ASHR1]], [[C5]]
+ ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[ICMP7]]
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[ADD2]](s32)
+ ; CHECK-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[XOR]](s32), [[COPY4]], [[SUB]]
+ ; CHECK-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[XOR]](s32), [[COPY5]], [[SUB2]]
+ ; CHECK-NEXT: $x10 = COPY [[SELECT2]](s32)
+ ; CHECK-NEXT: $x11 = COPY [[SELECT3]](s32)
+ ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11
+ %2:_(s32) = COPY $x10
+ %3:_(s32) = COPY $x11
+ %0:_(s64) = G_MERGE_VALUES %2(s32), %3(s32)
+ %4:_(s32) = COPY $x12
+ %5:_(s32) = COPY $x13
+ %1:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ %6:_(s64) = G_SSUBSAT %0, %1(s64)
+ %7:_(s32), %8:_(s32) = G_UNMERGE_VALUES %6(s64)
+ $x10 = COPY %7(s32)
+ $x11 = COPY %8(s32)
+ PseudoRET implicit $x10, implicit $x11
+
+...
+---
+name: uaddsat_i8
+body: |
+ bb.1:
+ liveins: $x10, $x11
+
+ ; CHECK-LABEL: name: uaddsat_i8
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ADD]], [[C]]
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]]
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[AND]](s32), [[AND1]]
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s32), [[C2]], [[ADD]]
+ ; CHECK-NEXT: $x10 = COPY [[SELECT]](s32)
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %2:_(s32) = COPY $x10
+ %0:_(s8) = G_TRUNC %2(s32)
+ %3:_(s32) = COPY $x11
+ %1:_(s8) = G_TRUNC %3(s32)
+ %4:_(s8) = G_UADDSAT %0, %1(s8)
+ %5:_(s32) = G_ANYEXT %4(s8)
+ $x10 = COPY %5(s32)
+ PseudoRET implicit $x10
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv64.mir
new file mode 100644
index 0000000..5eaf8b3
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv64.mir
@@ -0,0 +1,358 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=riscv64 -run-pass=legalizer %s -o - \
+# RUN: | FileCheck %s --check-prefixes=CHECK,RV64I
+# RUN: llc -mtriple=riscv64 -mattr=+zbb -run-pass=legalizer %s -o - \
+# RUN: | FileCheck %s --check-prefixes=CHECK,RV64ZBB
+
+---
+name: uaddsat_i32
+body: |
+ bb.1:
+ liveins: $x10, $x11
+
+ ; CHECK-LABEL: name: uaddsat_i32
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[TRUNC]], [[TRUNC1]]
+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[ADD]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]]
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[ZEXT]](s64), [[AND]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[ADD]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s64), [[C1]], [[COPY2]]
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SELECT]](s32)
+ ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %2:_(s64) = COPY $x10
+ %0:_(s32) = G_TRUNC %2(s64)
+ %3:_(s64) = COPY $x11
+ %1:_(s32) = G_TRUNC %3(s64)
+ %4:_(s32) = G_UADDSAT %0, %1(s32)
+ %5:_(s64) = G_ANYEXT %4(s32)
+ $x10 = COPY %5(s64)
+ PseudoRET implicit $x10
+
+...
+---
+name: uaddsat_i64
+body: |
+ bb.1:
+ liveins: $x10, $x11
+
+ ; RV64I-LABEL: name: uaddsat_i64
+ ; RV64I: liveins: $x10, $x11
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[COPY1]]
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[ADD]](s64), [[COPY1]]
+ ; RV64I-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[ADD]](s64)
+ ; RV64I-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; RV64I-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s64), [[C]], [[COPY2]]
+ ; RV64I-NEXT: $x10 = COPY [[SELECT]](s64)
+ ; RV64I-NEXT: PseudoRET implicit $x10
+ ;
+ ; RV64ZBB-LABEL: name: uaddsat_i64
+ ; RV64ZBB: liveins: $x10, $x11
+ ; RV64ZBB-NEXT: {{ $}}
+ ; RV64ZBB-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64ZBB-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; RV64ZBB-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; RV64ZBB-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[COPY]], [[C]]
+ ; RV64ZBB-NEXT: [[UMIN:%[0-9]+]]:_(s64) = G_UMIN [[XOR]], [[COPY1]]
+ ; RV64ZBB-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[UMIN]]
+ ; RV64ZBB-NEXT: $x10 = COPY [[ADD]](s64)
+ ; RV64ZBB-NEXT: PseudoRET implicit $x10
+ %0:_(s64) = COPY $x10
+ %1:_(s64) = COPY $x11
+ %2:_(s64) = G_UADDSAT %0, %1(s64)
+ $x10 = COPY %2(s64)
+ PseudoRET implicit $x10
+
+...
+---
+name: saddsat_i32
+body: |
+ bb.1:
+ liveins: $x10, $x11
+
+ ; CHECK-LABEL: name: saddsat_i32
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY]], 32
+ ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY1]], 32
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[SEXT_INREG]], [[SEXT_INREG1]]
+ ; CHECK-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ne), [[ADD]](s64), [[SEXT_INREG2]]
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ADD]](s64)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 31
+ ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[TRUNC]], [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648
+ ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ASHR]], [[C1]]
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s64), [[ADD1]], [[TRUNC]]
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SELECT]](s32)
+ ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %2:_(s64) = COPY $x10
+ %0:_(s32) = G_TRUNC %2(s64)
+ %3:_(s64) = COPY $x11
+ %1:_(s32) = G_TRUNC %3(s64)
+ %4:_(s32) = G_SADDSAT %0, %1(s32)
+ %5:_(s64) = G_ANYEXT %4(s32)
+ $x10 = COPY %5(s64)
+ PseudoRET implicit $x10
+
+...
+---
+name: saddsat_i64
+body: |
+ bb.1:
+ liveins: $x10, $x11
+
+ ; RV64I-LABEL: name: saddsat_i64
+ ; RV64I: liveins: $x10, $x11
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[COPY1]]
+ ; RV64I-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(slt), [[ADD]](s64), [[COPY]]
+ ; RV64I-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(slt), [[COPY1]](s64), [[C]]
+ ; RV64I-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP1]](s64)
+ ; RV64I-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64)
+ ; RV64I-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[TRUNC]], [[TRUNC1]]
+ ; RV64I-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[ADD]](s64)
+ ; RV64I-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 63
+ ; RV64I-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s64)
+ ; RV64I-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808
+ ; RV64I-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[ASHR]], [[C2]]
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[XOR]](s32)
+ ; RV64I-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; RV64I-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ANYEXT]], [[C3]]
+ ; RV64I-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[AND]](s64), [[ADD1]], [[COPY2]]
+ ; RV64I-NEXT: $x10 = COPY [[SELECT]](s64)
+ ; RV64I-NEXT: PseudoRET implicit $x10
+ ;
+ ; RV64ZBB-LABEL: name: saddsat_i64
+ ; RV64ZBB: liveins: $x10, $x11
+ ; RV64ZBB-NEXT: {{ $}}
+ ; RV64ZBB-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64ZBB-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; RV64ZBB-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 9223372036854775807
+ ; RV64ZBB-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808
+ ; RV64ZBB-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; RV64ZBB-NEXT: [[SMAX:%[0-9]+]]:_(s64) = G_SMAX [[COPY]], [[C2]]
+ ; RV64ZBB-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[C]], [[SMAX]]
+ ; RV64ZBB-NEXT: [[SMIN:%[0-9]+]]:_(s64) = G_SMIN [[COPY]], [[C2]]
+ ; RV64ZBB-NEXT: [[SUB1:%[0-9]+]]:_(s64) = G_SUB [[C1]], [[SMIN]]
+ ; RV64ZBB-NEXT: [[SMAX1:%[0-9]+]]:_(s64) = G_SMAX [[SUB1]], [[COPY1]]
+ ; RV64ZBB-NEXT: [[SMIN1:%[0-9]+]]:_(s64) = G_SMIN [[SMAX1]], [[SUB]]
+ ; RV64ZBB-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[SMIN1]]
+ ; RV64ZBB-NEXT: $x10 = COPY [[ADD]](s64)
+ ; RV64ZBB-NEXT: PseudoRET implicit $x10
+ %0:_(s64) = COPY $x10
+ %1:_(s64) = COPY $x11
+ %2:_(s64) = G_SADDSAT %0, %1(s64)
+ $x10 = COPY %2(s64)
+ PseudoRET implicit $x10
+
+...
+---
+name: usubsat_i32
+body: |
+ bb.1:
+ liveins: $x10, $x11
+
+ ; CHECK-LABEL: name: usubsat_i32
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
+ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[TRUNC]], [[TRUNC1]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]]
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C1]]
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[AND]](s64), [[AND1]]
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s64), [[C2]], [[SUB]]
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SELECT]](s32)
+ ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %2:_(s64) = COPY $x10
+ %0:_(s32) = G_TRUNC %2(s64)
+ %3:_(s64) = COPY $x11
+ %1:_(s32) = G_TRUNC %3(s64)
+ %4:_(s32) = G_USUBSAT %0, %1(s32)
+ %5:_(s64) = G_ANYEXT %4(s32)
+ $x10 = COPY %5(s64)
+ PseudoRET implicit $x10
+
+...
+---
+name: usubsat_i64
+body: |
+ bb.1:
+ liveins: $x10, $x11
+
+ ; RV64I-LABEL: name: usubsat_i64
+ ; RV64I: liveins: $x10, $x11
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; RV64I-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[COPY]], [[COPY1]]
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[COPY]](s64), [[COPY1]]
+ ; RV64I-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; RV64I-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s64), [[C]], [[SUB]]
+ ; RV64I-NEXT: $x10 = COPY [[SELECT]](s64)
+ ; RV64I-NEXT: PseudoRET implicit $x10
+ ;
+ ; RV64ZBB-LABEL: name: usubsat_i64
+ ; RV64ZBB: liveins: $x10, $x11
+ ; RV64ZBB-NEXT: {{ $}}
+ ; RV64ZBB-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64ZBB-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; RV64ZBB-NEXT: [[UMIN:%[0-9]+]]:_(s64) = G_UMIN [[COPY]], [[COPY1]]
+ ; RV64ZBB-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[COPY]], [[UMIN]]
+ ; RV64ZBB-NEXT: $x10 = COPY [[SUB]](s64)
+ ; RV64ZBB-NEXT: PseudoRET implicit $x10
+ %0:_(s64) = COPY $x10
+ %1:_(s64) = COPY $x11
+ %2:_(s64) = G_USUBSAT %0, %1(s64)
+ $x10 = COPY %2(s64)
+ PseudoRET implicit $x10
+
+...
+---
+name: ssubsat_i32
+body: |
+ bb.1:
+ liveins: $x10, $x11
+
+ ; CHECK-LABEL: name: ssubsat_i32
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY]], 32
+ ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY1]], 32
+ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[SEXT_INREG]], [[SEXT_INREG1]]
+ ; CHECK-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ne), [[SUB]](s64), [[SEXT_INREG2]]
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[SUB]](s64)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 31
+ ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[TRUNC]], [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[ASHR]], [[C1]]
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s64), [[ADD]], [[TRUNC]]
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SELECT]](s32)
+ ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %2:_(s64) = COPY $x10
+ %0:_(s32) = G_TRUNC %2(s64)
+ %3:_(s64) = COPY $x11
+ %1:_(s32) = G_TRUNC %3(s64)
+ %4:_(s32) = G_SSUBSAT %0, %1(s32)
+ %5:_(s64) = G_ANYEXT %4(s32)
+ $x10 = COPY %5(s64)
+ PseudoRET implicit $x10
+
+...
+---
+name: ssubsat_i64
+body: |
+ bb.1:
+ liveins: $x10, $x11
+
+ ; RV64I-LABEL: name: ssubsat_i64
+ ; RV64I: liveins: $x10, $x11
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; RV64I-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[COPY]], [[COPY1]]
+ ; RV64I-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(slt), [[SUB]](s64), [[COPY]]
+ ; RV64I-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(sgt), [[COPY1]](s64), [[C]]
+ ; RV64I-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP1]](s64)
+ ; RV64I-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64)
+ ; RV64I-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[TRUNC]], [[TRUNC1]]
+ ; RV64I-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[SUB]](s64)
+ ; RV64I-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 63
+ ; RV64I-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s64)
+ ; RV64I-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808
+ ; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[ASHR]], [[C2]]
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[XOR]](s32)
+ ; RV64I-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; RV64I-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ANYEXT]], [[C3]]
+ ; RV64I-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[AND]](s64), [[ADD]], [[COPY2]]
+ ; RV64I-NEXT: $x10 = COPY [[SELECT]](s64)
+ ; RV64I-NEXT: PseudoRET implicit $x10
+ ;
+ ; RV64ZBB-LABEL: name: ssubsat_i64
+ ; RV64ZBB: liveins: $x10, $x11
+ ; RV64ZBB-NEXT: {{ $}}
+ ; RV64ZBB-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64ZBB-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; RV64ZBB-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 9223372036854775807
+ ; RV64ZBB-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808
+ ; RV64ZBB-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; RV64ZBB-NEXT: [[SMAX:%[0-9]+]]:_(s64) = G_SMAX [[COPY]], [[C2]]
+ ; RV64ZBB-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[SMAX]], [[C]]
+ ; RV64ZBB-NEXT: [[SMIN:%[0-9]+]]:_(s64) = G_SMIN [[COPY]], [[C2]]
+ ; RV64ZBB-NEXT: [[SUB1:%[0-9]+]]:_(s64) = G_SUB [[SMIN]], [[C1]]
+ ; RV64ZBB-NEXT: [[SMAX1:%[0-9]+]]:_(s64) = G_SMAX [[SUB]], [[COPY1]]
+ ; RV64ZBB-NEXT: [[SMIN1:%[0-9]+]]:_(s64) = G_SMIN [[SMAX1]], [[SUB1]]
+ ; RV64ZBB-NEXT: [[SUB2:%[0-9]+]]:_(s64) = G_SUB [[COPY]], [[SMIN1]]
+ ; RV64ZBB-NEXT: $x10 = COPY [[SUB2]](s64)
+ ; RV64ZBB-NEXT: PseudoRET implicit $x10
+ %0:_(s64) = COPY $x10
+ %1:_(s64) = COPY $x11
+ %2:_(s64) = G_SSUBSAT %0, %1(s64)
+ $x10 = COPY %2(s64)
+ PseudoRET implicit $x10
+
+...
+---
+name: uaddsat_i8
+body: |
+ bb.1:
+ liveins: $x10, $x11
+
+ ; CHECK-LABEL: name: uaddsat_i8
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[TRUNC]], [[TRUNC1]]
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[ADD]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 255
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ANYEXT]], [[C]]
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 255
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C1]]
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[AND]](s64), [[AND1]]
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s64), [[C2]], [[ADD]]
+ ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[SELECT]](s32)
+ ; CHECK-NEXT: $x10 = COPY [[ANYEXT1]](s64)
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %2:_(s64) = COPY $x10
+ %0:_(s8) = G_TRUNC %2(s64)
+ %3:_(s64) = COPY $x11
+ %1:_(s8) = G_TRUNC %3(s64)
+ %4:_(s8) = G_UADDSAT %0, %1(s8)
+ %5:_(s64) = G_ANYEXT %4(s8)
+ $x10 = COPY %5(s64)
+ PseudoRET implicit $x10
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/libcalls.ll b/llvm/test/CodeGen/RISCV/GlobalISel/libcalls.ll
new file mode 100644
index 0000000..aaef8d9
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/libcalls.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -global-isel -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefix=RV32
+; RUN: llc -mtriple=riscv64 -global-isel -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefix=RV64
+
+define float @test_f32(float %x, float %y) nounwind {
+; RV32-LABEL: test_f32:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: call fmodf
+; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: test_f32:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: call fmodf
+; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: ret
+entry:
+ %z = frem float %x, %y
+ ret float %z
+}
+
+define double @test_f64(double %x, double %y) nounwind {
+; RV32-LABEL: test_f64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: call fmod
+; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: test_f64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: call fmod
+; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: ret
+entry:
+ %z = frem double %x, %y
+ ret double %z
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/shift.ll b/llvm/test/CodeGen/RISCV/GlobalISel/shift.ll
new file mode 100644
index 0000000..b75cbf8
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/shift.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -global-isel -global-isel-abort=1 -verify-machineinstrs < %s 2>&1 | FileCheck %s --check-prefixes=RV32
+; RUN: llc -mtriple=riscv64 -global-isel -global-isel-abort=1 -verify-machineinstrs < %s 2>&1 | FileCheck %s --check-prefixes=RV64
+
+define i16 @test_lshr_i48(i48 %x) {
+; RV32-LABEL: test_lshr_i48:
+; RV32: # %bb.0:
+; RV32-NEXT: srli a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: test_lshr_i48:
+; RV64: # %bb.0:
+; RV64-NEXT: srliw a0, a0, 16
+; RV64-NEXT: ret
+ %lshr = lshr i48 %x, 16
+ %trunc = trunc i48 %lshr to i16
+ ret i16 %trunc
+}
+
+define i16 @test_ashr_i48(i48 %x) {
+; RV32-LABEL: test_ashr_i48:
+; RV32: # %bb.0:
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: test_ashr_i48:
+; RV64: # %bb.0:
+; RV64-NEXT: sraiw a0, a0, 16
+; RV64-NEXT: ret
+ %ashr = ashr i48 %x, 16
+ %trunc = trunc i48 %ashr to i16
+ ret i16 %trunc
+}
+
+define i16 @test_shl_i48(i48 %x) {
+; RV32-LABEL: test_shl_i48:
+; RV32: # %bb.0:
+; RV32-NEXT: slli a0, a0, 8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: test_shl_i48:
+; RV64: # %bb.0:
+; RV64-NEXT: slliw a0, a0, 8
+; RV64-NEXT: ret
+ %shl = shl i48 %x, 8
+ %trunc = trunc i48 %shl to i16
+ ret i16 %trunc
+}
diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll
index a1eb179..c90bb03 100644
--- a/llvm/test/CodeGen/RISCV/attributes.ll
+++ b/llvm/test/CodeGen/RISCV/attributes.ll
@@ -389,7 +389,7 @@
; RV32ZACAS: .attribute 5, "rv32i2p1_a2p1_zacas1p0"
; RV32ZALASR: .attribute 5, "rv32i2p1_zalasr0p1"
; RV32ZAMA16B: .attribute 5, "rv32i2p1_zama16b1p0"
-; RV32ZICFILP: .attribute 5, "rv32i2p1_zicfilp0p4"
+; RV32ZICFILP: .attribute 5, "rv32i2p1_zicfilp0p4_zicsr2p0"
; RV32ZABHA: .attribute 5, "rv32i2p1_a2p1_zabha1p0"
; RV32SSNPM: .attribute 5, "rv32i2p1_ssnpm0p8"
; RV32SMNPM: .attribute 5, "rv32i2p1_smnpm0p8"
@@ -520,7 +520,7 @@
; RV64ZVFBFWMA: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zfbfmin1p0_zve32f1p0_zve32x1p0_zvfbfmin1p0_zvfbfwma1p0_zvl32b1p0"
; RV64ZACAS: .attribute 5, "rv64i2p1_a2p1_zacas1p0"
; RV64ZALASR: .attribute 5, "rv64i2p1_zalasr0p1"
-; RV64ZICFILP: .attribute 5, "rv64i2p1_zicfilp0p4"
+; RV64ZICFILP: .attribute 5, "rv64i2p1_zicfilp0p4_zicsr2p0"
; RV64ZABHA: .attribute 5, "rv64i2p1_a2p1_zabha1p0"
; RV64SSNPM: .attribute 5, "rv64i2p1_ssnpm0p8"
; RV64SMNPM: .attribute 5, "rv64i2p1_smnpm0p8"
diff --git a/llvm/test/CodeGen/RISCV/loop-strength-reduce-loop-invar.ll b/llvm/test/CodeGen/RISCV/loop-strength-reduce-loop-invar.ll
index 8b22046..8693283 100644
--- a/llvm/test/CodeGen/RISCV/loop-strength-reduce-loop-invar.ll
+++ b/llvm/test/CodeGen/RISCV/loop-strength-reduce-loop-invar.ll
@@ -53,26 +53,24 @@ define void @test(i32 signext %row, i32 signext %N.in) nounwind {
; RV64: # %bb.0: # %entry
; RV64-NEXT: blez a1, .LBB0_3
; RV64-NEXT: # %bb.1: # %cond_true.preheader
-; RV64-NEXT: negw a1, a1
; RV64-NEXT: slli a0, a0, 6
; RV64-NEXT: lui a2, %hi(A)
; RV64-NEXT: addi a2, a2, %lo(A)
; RV64-NEXT: add a0, a0, a2
; RV64-NEXT: addi a2, a0, 4
+; RV64-NEXT: addiw a1, a1, 2
; RV64-NEXT: li a3, 2
; RV64-NEXT: li a4, 4
; RV64-NEXT: li a5, 5
-; RV64-NEXT: li a6, 2
; RV64-NEXT: .LBB0_2: # %cond_true
; RV64-NEXT: # =>This Inner Loop Header: Depth=1
; RV64-NEXT: sw a4, 0(a2)
-; RV64-NEXT: slli a7, a6, 2
-; RV64-NEXT: add a7, a0, a7
-; RV64-NEXT: sw a5, 0(a7)
-; RV64-NEXT: addiw a6, a6, 1
-; RV64-NEXT: addw a7, a1, a6
+; RV64-NEXT: slli a6, a3, 2
+; RV64-NEXT: add a6, a0, a6
+; RV64-NEXT: sw a5, 0(a6)
+; RV64-NEXT: addiw a3, a3, 1
; RV64-NEXT: addi a2, a2, 4
-; RV64-NEXT: bne a7, a3, .LBB0_2
+; RV64-NEXT: bne a3, a1, .LBB0_2
; RV64-NEXT: .LBB0_3: # %return
; RV64-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll
index 364e8c7..42ea425 100644
--- a/llvm/test/CodeGen/RISCV/mul.ll
+++ b/llvm/test/CodeGen/RISCV/mul.ll
@@ -1843,3 +1843,152 @@ define i8 @mulsub_demand_2(i8 %x, i8 %y) nounwind {
%r = or i8 %a, 240
ret i8 %r
}
+
+define i64 @muland_demand(i64 %x) nounwind {
+; RV32I-LABEL: muland_demand:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: andi a0, a0, -8
+; RV32I-NEXT: slli a1, a1, 2
+; RV32I-NEXT: srli a1, a1, 2
+; RV32I-NEXT: li a2, 12
+; RV32I-NEXT: li a3, 0
+; RV32I-NEXT: call __muldi3
+; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+;
+; RV32IM-LABEL: muland_demand:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: andi a0, a0, -8
+; RV32IM-NEXT: li a2, 12
+; RV32IM-NEXT: mul a1, a1, a2
+; RV32IM-NEXT: mulhu a3, a0, a2
+; RV32IM-NEXT: add a1, a3, a1
+; RV32IM-NEXT: mul a0, a0, a2
+; RV32IM-NEXT: ret
+;
+; RV64I-LABEL: muland_demand:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a1, -29
+; RV64I-NEXT: srli a1, a1, 2
+; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: li a1, 12
+; RV64I-NEXT: tail __muldi3
+;
+; RV64IM-LABEL: muland_demand:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: andi a0, a0, -8
+; RV64IM-NEXT: li a1, 12
+; RV64IM-NEXT: mul a0, a0, a1
+; RV64IM-NEXT: ret
+ %and = and i64 %x, 4611686018427387896
+ %mul = mul i64 %and, 12
+ ret i64 %mul
+}
+
+define i64 @mulzext_demand(i32 signext %x) nounwind {
+; RV32I-LABEL: mulzext_demand:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: li a3, 3
+; RV32I-NEXT: li a2, 0
+; RV32I-NEXT: call __muldi3
+; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+;
+; RV32IM-LABEL: mulzext_demand:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: slli a1, a0, 1
+; RV32IM-NEXT: add a1, a1, a0
+; RV32IM-NEXT: li a0, 0
+; RV32IM-NEXT: ret
+;
+; RV64I-LABEL: mulzext_demand:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a1, 3
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: tail __muldi3
+;
+; RV64IM-LABEL: mulzext_demand:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: li a1, 3
+; RV64IM-NEXT: slli a1, a1, 32
+; RV64IM-NEXT: mul a0, a0, a1
+; RV64IM-NEXT: ret
+ %ext = zext i32 %x to i64
+ %mul = mul i64 %ext, 12884901888
+ ret i64 %mul
+}
+
+define i32 @mulfshl_demand(i32 signext %x) nounwind {
+; RV32I-LABEL: mulfshl_demand:
+; RV32I: # %bb.0:
+; RV32I-NEXT: srli a0, a0, 11
+; RV32I-NEXT: lui a1, 92808
+; RV32I-NEXT: tail __mulsi3
+;
+; RV32IM-LABEL: mulfshl_demand:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: srli a0, a0, 11
+; RV32IM-NEXT: lui a1, 92808
+; RV32IM-NEXT: mul a0, a0, a1
+; RV32IM-NEXT: ret
+;
+; RV64I-LABEL: mulfshl_demand:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: srliw a0, a0, 11
+; RV64I-NEXT: lui a1, 92808
+; RV64I-NEXT: call __muldi3
+; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: ret
+;
+; RV64IM-LABEL: mulfshl_demand:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: srliw a0, a0, 11
+; RV64IM-NEXT: lui a1, 92808
+; RV64IM-NEXT: mulw a0, a0, a1
+; RV64IM-NEXT: ret
+ %fshl = tail call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 21)
+ %mul = mul i32 %fshl, 380141568
+ ret i32 %mul
+}
+
+define i32 @mulor_demand(i32 signext %x, i32 signext %y) nounwind {
+; RV32I-LABEL: mulor_demand:
+; RV32I: # %bb.0:
+; RV32I-NEXT: lui a1, 92808
+; RV32I-NEXT: tail __mulsi3
+;
+; RV32IM-LABEL: mulor_demand:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: lui a1, 92808
+; RV32IM-NEXT: mul a0, a0, a1
+; RV32IM-NEXT: ret
+;
+; RV64I-LABEL: mulor_demand:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lui a1, 92808
+; RV64I-NEXT: call __muldi3
+; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: ret
+;
+; RV64IM-LABEL: mulor_demand:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: lui a1, 92808
+; RV64IM-NEXT: mulw a0, a0, a1
+; RV64IM-NEXT: ret
+ %mul1 = mul i32 %y, 10485760
+ %or = or disjoint i32 %mul1, %x
+ %mul2 = mul i32 %or, 380141568
+ ret i32 %mul2
+}
diff --git a/llvm/test/CodeGen/RISCV/pr90730.ll b/llvm/test/CodeGen/RISCV/pr90730.ll
new file mode 100644
index 0000000..7c3f4b4
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/pr90730.ll
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=riscv64 -mattr=+zbb | FileCheck %s
+
+define i32 @pr90730(i32 %x, i1 %y, ptr %p) {
+; CHECK-LABEL: pr90730:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lui a1, 8
+; CHECK-NEXT: addiw a1, a1, -960
+; CHECK-NEXT: andn a0, a1, a0
+; CHECK-NEXT: sw zero, 0(a2)
+; CHECK-NEXT: ret
+entry:
+ %ext = zext i1 %y to i32
+ %xor1 = xor i32 %ext, 31817
+ %and1 = and i32 %xor1, %x
+ store i32 %and1, ptr %p, align 4
+ %v = load i32, ptr %p, align 4
+ %and2 = and i32 %v, 31808
+ %xor2 = xor i32 %and2, 31808
+ store i32 0, ptr %p, align 4
+ ret i32 %xor2
+}
diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll
index 2db0d40..cf7be57 100644
--- a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll
@@ -637,8 +637,6 @@ define i64 @zext_mul288(i32 signext %a) {
define i64 @zext_mul12884901888(i32 signext %a) {
; RV64I-LABEL: zext_mul12884901888:
; RV64I: # %bb.0:
-; RV64I-NEXT: slli a0, a0, 32
-; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: li a1, 3
; RV64I-NEXT: slli a1, a1, 32
; RV64I-NEXT: mul a0, a0, a1
@@ -646,8 +644,8 @@ define i64 @zext_mul12884901888(i32 signext %a) {
;
; RV64ZBA-LABEL: zext_mul12884901888:
; RV64ZBA: # %bb.0:
-; RV64ZBA-NEXT: slli a0, a0, 32
; RV64ZBA-NEXT: sh1add a0, a0, a0
+; RV64ZBA-NEXT: slli a0, a0, 32
; RV64ZBA-NEXT: ret
%b = zext i32 %a to i64
%c = mul i64 %b, 12884901888
@@ -658,8 +656,6 @@ define i64 @zext_mul12884901888(i32 signext %a) {
define i64 @zext_mul21474836480(i32 signext %a) {
; RV64I-LABEL: zext_mul21474836480:
; RV64I: # %bb.0:
-; RV64I-NEXT: slli a0, a0, 32
-; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: li a1, 5
; RV64I-NEXT: slli a1, a1, 32
; RV64I-NEXT: mul a0, a0, a1
@@ -667,8 +663,8 @@ define i64 @zext_mul21474836480(i32 signext %a) {
;
; RV64ZBA-LABEL: zext_mul21474836480:
; RV64ZBA: # %bb.0:
-; RV64ZBA-NEXT: slli a0, a0, 32
; RV64ZBA-NEXT: sh2add a0, a0, a0
+; RV64ZBA-NEXT: slli a0, a0, 32
; RV64ZBA-NEXT: ret
%b = zext i32 %a to i64
%c = mul i64 %b, 21474836480
@@ -679,8 +675,6 @@ define i64 @zext_mul21474836480(i32 signext %a) {
define i64 @zext_mul38654705664(i32 signext %a) {
; RV64I-LABEL: zext_mul38654705664:
; RV64I: # %bb.0:
-; RV64I-NEXT: slli a0, a0, 32
-; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: li a1, 9
; RV64I-NEXT: slli a1, a1, 32
; RV64I-NEXT: mul a0, a0, a1
@@ -688,8 +682,8 @@ define i64 @zext_mul38654705664(i32 signext %a) {
;
; RV64ZBA-LABEL: zext_mul38654705664:
; RV64ZBA: # %bb.0:
-; RV64ZBA-NEXT: slli a0, a0, 32
; RV64ZBA-NEXT: sh3add a0, a0, a0
+; RV64ZBA-NEXT: slli a0, a0, 32
; RV64ZBA-NEXT: ret
%b = zext i32 %a to i64
%c = mul i64 %b, 38654705664
diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll
index dc93c02..4a568fb 100644
--- a/llvm/test/CodeGen/RISCV/rv64zba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zba.ll
@@ -856,8 +856,6 @@ define i64 @zext_mul288(i32 signext %a) {
define i64 @zext_mul12884901888(i32 signext %a) {
; RV64I-LABEL: zext_mul12884901888:
; RV64I: # %bb.0:
-; RV64I-NEXT: slli a0, a0, 32
-; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: li a1, 3
; RV64I-NEXT: slli a1, a1, 32
; RV64I-NEXT: mul a0, a0, a1
@@ -865,8 +863,8 @@ define i64 @zext_mul12884901888(i32 signext %a) {
;
; RV64ZBA-LABEL: zext_mul12884901888:
; RV64ZBA: # %bb.0:
-; RV64ZBA-NEXT: slli a0, a0, 32
; RV64ZBA-NEXT: sh1add a0, a0, a0
+; RV64ZBA-NEXT: slli a0, a0, 32
; RV64ZBA-NEXT: ret
%b = zext i32 %a to i64
%c = mul i64 %b, 12884901888
@@ -877,8 +875,6 @@ define i64 @zext_mul12884901888(i32 signext %a) {
define i64 @zext_mul21474836480(i32 signext %a) {
; RV64I-LABEL: zext_mul21474836480:
; RV64I: # %bb.0:
-; RV64I-NEXT: slli a0, a0, 32
-; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: li a1, 5
; RV64I-NEXT: slli a1, a1, 32
; RV64I-NEXT: mul a0, a0, a1
@@ -886,8 +882,8 @@ define i64 @zext_mul21474836480(i32 signext %a) {
;
; RV64ZBA-LABEL: zext_mul21474836480:
; RV64ZBA: # %bb.0:
-; RV64ZBA-NEXT: slli a0, a0, 32
; RV64ZBA-NEXT: sh2add a0, a0, a0
+; RV64ZBA-NEXT: slli a0, a0, 32
; RV64ZBA-NEXT: ret
%b = zext i32 %a to i64
%c = mul i64 %b, 21474836480
@@ -898,8 +894,6 @@ define i64 @zext_mul21474836480(i32 signext %a) {
define i64 @zext_mul38654705664(i32 signext %a) {
; RV64I-LABEL: zext_mul38654705664:
; RV64I: # %bb.0:
-; RV64I-NEXT: slli a0, a0, 32
-; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: li a1, 9
; RV64I-NEXT: slli a1, a1, 32
; RV64I-NEXT: mul a0, a0, a1
@@ -907,8 +901,8 @@ define i64 @zext_mul38654705664(i32 signext %a) {
;
; RV64ZBA-LABEL: zext_mul38654705664:
; RV64ZBA: # %bb.0:
-; RV64ZBA-NEXT: slli a0, a0, 32
; RV64ZBA-NEXT: sh3add a0, a0, a0
+; RV64ZBA-NEXT: slli a0, a0, 32
; RV64ZBA-NEXT: ret
%b = zext i32 %a to i64
%c = mul i64 %b, 38654705664
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-costrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll
index f189354..f189354 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-costrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll
diff --git a/llvm/test/CodeGen/RISCV/rvv/fround-costrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll
index 3276f48..3276f48 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fround-costrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll
diff --git a/llvm/test/CodeGen/RISCV/sextw-removal.ll b/llvm/test/CodeGen/RISCV/sextw-removal.ll
index f707cb3..8cf7855 100644
--- a/llvm/test/CodeGen/RISCV/sextw-removal.ll
+++ b/llvm/test/CodeGen/RISCV/sextw-removal.ll
@@ -1047,25 +1047,25 @@ define signext i32 @bug(i32 signext %x) {
; CHECK-NEXT: seqz a2, a2
; CHECK-NEXT: slli a3, a2, 3
; CHECK-NEXT: sllw a1, a1, a3
-; CHECK-NEXT: neg a2, a2
+; CHECK-NEXT: negw a2, a2
; CHECK-NEXT: andi a2, a2, -8
; CHECK-NEXT: add a0, a0, a2
; CHECK-NEXT: srliw a2, a1, 28
; CHECK-NEXT: seqz a2, a2
; CHECK-NEXT: slli a3, a2, 2
; CHECK-NEXT: sllw a1, a1, a3
-; CHECK-NEXT: neg a2, a2
+; CHECK-NEXT: negw a2, a2
; CHECK-NEXT: andi a2, a2, -4
; CHECK-NEXT: add a0, a0, a2
; CHECK-NEXT: srliw a2, a1, 30
; CHECK-NEXT: seqz a2, a2
; CHECK-NEXT: slli a3, a2, 1
; CHECK-NEXT: sllw a1, a1, a3
-; CHECK-NEXT: neg a2, a2
+; CHECK-NEXT: negw a2, a2
; CHECK-NEXT: andi a2, a2, -2
; CHECK-NEXT: add a0, a0, a2
-; CHECK-NEXT: srai a1, a1, 31
; CHECK-NEXT: not a1, a1
+; CHECK-NEXT: srli a1, a1, 31
; CHECK-NEXT: addw a0, a0, a1
; CHECK-NEXT: .LBB18_4: # %cleanup
; CHECK-NEXT: ret
@@ -1087,28 +1087,27 @@ define signext i32 @bug(i32 signext %x) {
; NOREMOVAL-NEXT: seqz a2, a2
; NOREMOVAL-NEXT: slli a3, a2, 3
; NOREMOVAL-NEXT: sllw a1, a1, a3
-; NOREMOVAL-NEXT: neg a2, a2
+; NOREMOVAL-NEXT: negw a2, a2
; NOREMOVAL-NEXT: andi a2, a2, -8
; NOREMOVAL-NEXT: add a0, a0, a2
; NOREMOVAL-NEXT: srliw a2, a1, 28
; NOREMOVAL-NEXT: seqz a2, a2
; NOREMOVAL-NEXT: slli a3, a2, 2
; NOREMOVAL-NEXT: sllw a1, a1, a3
-; NOREMOVAL-NEXT: neg a2, a2
+; NOREMOVAL-NEXT: negw a2, a2
; NOREMOVAL-NEXT: andi a2, a2, -4
; NOREMOVAL-NEXT: add a0, a0, a2
; NOREMOVAL-NEXT: srliw a2, a1, 30
; NOREMOVAL-NEXT: seqz a2, a2
; NOREMOVAL-NEXT: slli a3, a2, 1
; NOREMOVAL-NEXT: sllw a1, a1, a3
-; NOREMOVAL-NEXT: neg a2, a2
+; NOREMOVAL-NEXT: negw a2, a2
; NOREMOVAL-NEXT: andi a2, a2, -2
; NOREMOVAL-NEXT: add a0, a0, a2
-; NOREMOVAL-NEXT: srai a1, a1, 31
; NOREMOVAL-NEXT: not a1, a1
-; NOREMOVAL-NEXT: add a0, a0, a1
+; NOREMOVAL-NEXT: srli a1, a1, 31
+; NOREMOVAL-NEXT: addw a0, a0, a1
; NOREMOVAL-NEXT: .LBB18_4: # %cleanup
-; NOREMOVAL-NEXT: sext.w a0, a0
; NOREMOVAL-NEXT: ret
entry:
%tobool.not = icmp eq i32 %x, 0
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_inline_assembly/inline_asm.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_inline_assembly/inline_asm.ll
new file mode 100644
index 0000000..449dd71
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_inline_assembly/inline_asm.ll
@@ -0,0 +1,93 @@
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s --spirv-ext=+SPV_INTEL_inline_assembly -o - | FileCheck %s
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s --spirv-ext=+SPV_INTEL_inline_assembly -o - -filetype=obj | spirv-val %}
+
+; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
+; CHECK-ERROR: Inline assembly instructions require the following SPIR-V extension: SPV_INTEL_inline_assembly
+
+; CHECK: OpCapability AsmINTEL
+; CHECK: OpExtension "SPV_INTEL_inline_assembly"
+
+; CHECK-COUNT-8: OpDecorate %[[#]] SideEffectsINTEL
+
+; CHECK-DAG: %[[#VoidTy:]] = OpTypeVoid
+; CHECK-DAG: %[[#Int8Ty:]] = OpTypeInt 8 0
+; CHECK-DAG: %[[#Int32Ty:]] = OpTypeInt 32 0
+; CHECK-DAG: %[[#Int64Ty:]] = OpTypeInt 64 0
+; CHECK-DAG: %[[#HalfTy:]] = OpTypeFloat 16
+; CHECK-DAG: %[[#FloatTy:]] = OpTypeFloat 32
+; CHECK-DAG: %[[#DoubleTy:]] = OpTypeFloat 64
+
+; CHECK-DAG: OpTypeFunction %[[#VoidTy]] %[[#]] %[[#]] %[[#]] %[[#Int64Ty]]
+; CHECK-DAG: %[[#Fun1Ty:]] = OpTypeFunction %[[#VoidTy]]
+; CHECK-DAG: %[[#Fun2Ty:]] = OpTypeFunction %[[#Int32Ty]]
+; CHECK-DAG: %[[#Fun3Ty:]] = OpTypeFunction %[[#Int32Ty]] %[[#Int32Ty]]
+; CHECK-DAG: %[[#Fun4Ty:]] = OpTypeFunction %[[#FloatTy]] %[[#FloatTy]]
+; CHECK-DAG: %[[#Fun5Ty:]] = OpTypeFunction %[[#HalfTy]] %[[#FloatTy]] %[[#FloatTy]]
+; CHECK-DAG: %[[#Fun6Ty:]] = OpTypeFunction %[[#Int8Ty]] %[[#FloatTy]] %[[#Int32Ty]] %[[#Int8Ty]]
+; CHECK-DAG: %[[#Fun7Ty:]] = OpTypeFunction %[[#Int64Ty]] %[[#Int64Ty]] %[[#Int32Ty]] %[[#Int8Ty]]
+; CHECK-DAG: %[[#Fun8Ty:]] = OpTypeFunction %[[#VoidTy]] %[[#Int32Ty]] %[[#DoubleTy]]
+
+; CHECK-DAG: %[[#Const2:]] = OpConstant %[[#FloatTy]] 2
+; CHECK-DAG: %[[#Const123:]] = OpConstant %[[#Int32Ty]] 123
+; CHECK-DAG: %[[#Const42:]] = OpConstant %[[#DoubleTy:]] 42
+
+; CHECK: %[[#Dialect:]] = OpAsmTargetINTEL "spirv64-unknown-unknown"
+; CHECK-NOT: OpAsmTargetINTEL
+
+; CHECK: %[[#Asm1:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun1Ty]] %[[#Dialect]] "" ""
+; CHECK: %[[#Asm2:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun1Ty]] %[[#Dialect]] "nop" ""
+; CHECK: %[[#Asm3:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun1Ty]] %[[#Dialect]] "" "~{cc},~{memory}"
+; CHECK: %[[#Asm4:]] = OpAsmINTEL %[[#Int32Ty]] %[[#Fun2Ty:]] %[[#Dialect]] "clobber_out $0" "=&r"
+; CHECK: %[[#Asm5:]] = OpAsmINTEL %[[#Int32Ty]] %[[#Fun3Ty]] %[[#Dialect]] "icmd $0 $1" "=r,r"
+; CHECK: %[[#Asm6:]] = OpAsmINTEL %[[#FloatTy]] %[[#Fun4Ty]] %[[#Dialect]] "fcmd $0 $1" "=r,r"
+; CHECK: %[[#Asm7:]] = OpAsmINTEL %[[#HalfTy]] %[[#Fun5Ty]] %[[#Dialect]] "fcmdext $0 $1 $2" "=r,r,r"
+; CHECK: %[[#Asm8:]] = OpAsmINTEL %[[#Int8Ty]] %[[#Fun6Ty]] %[[#Dialect]] "cmdext $0 $3 $1 $2" "=r,r,r,r"
+; CHECK: %[[#Asm9:]] = OpAsmINTEL %[[#Int64Ty]] %[[#Fun7Ty]] %[[#Dialect]] "icmdext $0 $3 $1 $2" "=r,r,r,r"
+; CHECK: %[[#Asm10:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun8Ty]] %[[#Dialect]] "constcmd $0 $1" "r,r"
+; CHECK: %[[#Asm11:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun8Ty]] %[[#Dialect]] "constcmd $0 $1" "i,i"
+; CHECK-NOT: OpAsmINTEL
+
+; CHECK: OpFunction
+; CHECK: OpAsmCallINTEL %[[#VoidTy]] %[[#Asm1]]
+; CHECK: OpAsmCallINTEL %[[#VoidTy]] %[[#Asm2]]
+; CHECK: OpAsmCallINTEL %[[#VoidTy]] %[[#Asm3]]
+; CHECK: OpAsmCallINTEL %[[#Int32Ty]] %[[#Asm4]]
+; CHECK: OpAsmCallINTEL %[[#Int32Ty]] %[[#Asm5]] %[[#]]
+; CHECK: OpAsmCallINTEL %[[#FloatTy]] %[[#Asm6]] %[[#]]
+; CHECK: OpAsmCallINTEL %[[#HalfTy]] %[[#Asm7]] %[[#Const2]] %[[#]]
+; CHECK: OpAsmCallINTEL %[[#Int8Ty]] %[[#Asm8]] %[[#]] %[[#Const123]] %[[#]]
+; CHECK: OpAsmCallINTEL %[[#Int64Ty]] %[[#Asm9]] %[[#]] %[[#]] %[[#]]
+; CHECK: OpAsmCallINTEL %[[#VoidTy]] %[[#Asm10]] %[[#Const123]] %[[#Const42]]
+; CHECK: OpAsmCallINTEL %[[#VoidTy]] %[[#Asm11]] %[[#Const123]] %[[#Const42]]
+; CHECK-NOT: OpAsmCallINTEL
+
+define spir_kernel void @foo(ptr addrspace(1) %_arg_int, ptr addrspace(1) %_arg_float, ptr addrspace(1) %_arg_half, i64 %_lng) {
+ %i1 = load i32, ptr addrspace(1) %_arg_int
+ %i2 = load i8, ptr addrspace(1) %_arg_int
+ %f1 = load float, ptr addrspace(1) %_arg_float
+ %h1 = load half, ptr addrspace(1) %_arg_half
+ ; inline asm
+ call void asm sideeffect "", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "", "~{cc},~{memory}"()
+ %res_i0 = call i32 asm "clobber_out $0", "=&r"()
+ store i32 %res_i0, ptr addrspace(1) %_arg_int
+ ; inline asm: integer
+ %res_i1 = call i32 asm sideeffect "icmd $0 $1", "=r,r"(i32 %i1)
+ store i32 %res_i1, ptr addrspace(1) %_arg_int
+ ; inline asm: float
+ %res_f1 = call float asm sideeffect "fcmd $0 $1", "=r,r"(float %f1)
+ store float %res_f1, ptr addrspace(1) %_arg_float
+ ; inline asm: mixed floats
+ %res_f2 = call half asm sideeffect "fcmdext $0 $1 $2", "=r,r,r"(float 2.0, float %f1)
+ store half %res_f2, ptr addrspace(1) %_arg_half
+ ; inline asm: mixed operands of different types
+ call i8 asm sideeffect "cmdext $0 $3 $1 $2", "=r,r,r,r"(float %f1, i32 123, i8 %i2)
+ ; inline asm: mixed integers
+ %res_i2 = call i64 asm sideeffect "icmdext $0 $3 $1 $2", "=r,r,r,r"(i64 %_lng, i32 %i1, i8 %i2)
+ store i64 %res_i2, ptr addrspace(1) %_arg_int
+ ; inline asm: constant arguments, misc constraints
+ call void asm "constcmd $0 $1", "r,r"(i32 123, double 42.0)
+ call void asm "constcmd $0 $1", "i,i"(i32 123, double 42.0)
+ ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll
new file mode 100644
index 0000000..e219f61b
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll
@@ -0,0 +1,59 @@
+; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_shader_clock %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_shader_clock %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-ERROR: LLVM ERROR: clock_read_device: the builtin requires the following SPIR-V extension: SPV_KHR_shader_clock
+
+; CHECK: OpCapability ShaderClockKHR
+; CHECK: OpExtension "SPV_KHR_shader_clock"
+; CHECK-DAG: [[uint:%[a-z0-9_]+]] = OpTypeInt 32
+; CHECK-DAG: [[ulong:%[a-z0-9_]+]] = OpTypeInt 64
+; CHECK-DAG: [[v2uint:%[a-z0-9_]+]] = OpTypeVector [[uint]] 2
+; CHECK-DAG: [[uint_1:%[a-z0-9_]+]] = OpConstant [[uint]] 1
+; CHECK-DAG: [[uint_2:%[a-z0-9_]+]] = OpConstant [[uint]] 2
+; CHECK-DAG: [[uint_3:%[a-z0-9_]+]] = OpConstant [[uint]] 3
+; CHECK: OpReadClockKHR [[ulong]] [[uint_1]]
+; CHECK: OpReadClockKHR [[ulong]] [[uint_2]]
+; CHECK: OpReadClockKHR [[ulong]] [[uint_3]]
+; CHECK: OpReadClockKHR [[v2uint]] [[uint_1]]
+; CHECK: OpReadClockKHR [[v2uint]] [[uint_2]]
+; CHECK: OpReadClockKHR [[v2uint]] [[uint_3]]
+
+define dso_local spir_kernel void @test_clocks(ptr addrspace(1) nocapture noundef writeonly align 8 %out64, ptr addrspace(1) nocapture noundef writeonly align 8 %outv2) {
+entry:
+ %call = tail call spir_func i64 @_Z17clock_read_devicev()
+ store i64 %call, ptr addrspace(1) %out64, align 8
+ %call1 = tail call spir_func i64 @_Z21clock_read_work_groupv()
+ %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %out64, i32 8
+ store i64 %call1, ptr addrspace(1) %arrayidx2, align 8
+ %call3 = tail call spir_func i64 @_Z20clock_read_sub_groupv()
+ %arrayidx4 = getelementptr inbounds i8, ptr addrspace(1) %out64, i32 16
+ store i64 %call3, ptr addrspace(1) %arrayidx4, align 8
+ %call5 = tail call spir_func <2 x i32> @_Z22clock_read_hilo_devicev()
+ store <2 x i32> %call5, ptr addrspace(1) %outv2, align 8
+ %call7 = tail call spir_func <2 x i32> @_Z26clock_read_hilo_work_groupv()
+ %arrayidx8 = getelementptr inbounds i8, ptr addrspace(1) %outv2, i32 8
+ store <2 x i32> %call7, ptr addrspace(1) %arrayidx8, align 8
+ %call9 = tail call spir_func <2 x i32> @_Z25clock_read_hilo_sub_groupv()
+ %arrayidx10 = getelementptr inbounds i8, ptr addrspace(1) %outv2, i32 16
+ store <2 x i32> %call9, ptr addrspace(1) %arrayidx10, align 8
+ ret void
+}
+
+; Function Attrs: convergent nounwind
+declare spir_func i64 @_Z17clock_read_devicev() local_unnamed_addr
+
+; Function Attrs: convergent nounwind
+declare spir_func i64 @_Z21clock_read_work_groupv() local_unnamed_addr
+
+; Function Attrs: convergent nounwind
+declare spir_func i64 @_Z20clock_read_sub_groupv() local_unnamed_addr
+
+; Function Attrs: convergent nounwind
+declare spir_func <2 x i32> @_Z22clock_read_hilo_devicev() local_unnamed_addr
+
+; Function Attrs: convergent nounwind
+declare spir_func <2 x i32> @_Z26clock_read_hilo_work_groupv() local_unnamed_addr
+
+; Function Attrs: convergent nounwind
+declare spir_func <2 x i32> @_Z25clock_read_hilo_sub_groupv() local_unnamed_addr
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll
index 83d7275..3300d46 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll
@@ -130,26 +130,26 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_zext(<8 x i16> %x) {
; CHECK-NEXT: vmov.i64 q1, #0xffff
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov.u16 r3, q0[2]
-; CHECK-NEXT: vmov r0, r1, d5
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov.u16 r2, q0[3]
-; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vmov r1, r2, d4
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov.u16 r1, q0[3]
+; CHECK-NEXT: vmov q2[2], q2[0], r3, r1
; CHECK-NEXT: vmov.u16 r3, q0[4]
; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov r2, s10
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov.u16 r2, q0[5]
-; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov r1, s10
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov.u16 r1, q0[5]
+; CHECK-NEXT: vmov q2[2], q2[0], r3, r1
; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov r2, r3, d5
-; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov r1, r3, d5
+; CHECK-NEXT: adds r0, r0, r1
+; CHECK-NEXT: adc.w r1, r2, r3
; CHECK-NEXT: vmov.u16 r2, q0[7]
-; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.u16 r3, q0[6]
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: vand q0, q0, q1
@@ -228,8 +228,8 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i64 q1, #0xffff
; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vmov r0, r1, d1
-; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: vmov r0, s2
+; CHECK-NEXT: vmov r2, r1, d0
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: bx lr
entry:
@@ -397,26 +397,26 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x) {
; CHECK-NEXT: vmov.i64 q1, #0xff
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov.u8 r3, q0[2]
-; CHECK-NEXT: vmov r0, r1, d5
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov.u8 r2, q0[3]
-; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vmov r1, r2, d4
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov.u8 r1, q0[3]
+; CHECK-NEXT: vmov q2[2], q2[0], r3, r1
; CHECK-NEXT: vmov.u8 r3, q0[4]
; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov r2, s10
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov.u8 r2, q0[5]
-; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov r1, s10
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov.u8 r1, q0[5]
+; CHECK-NEXT: vmov q2[2], q2[0], r3, r1
; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov r2, r3, d5
-; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov r1, r3, d5
+; CHECK-NEXT: adds r0, r0, r1
+; CHECK-NEXT: adc.w r1, r2, r3
; CHECK-NEXT: vmov.u8 r2, q0[7]
-; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.u8 r3, q0[6]
; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
; CHECK-NEXT: vand q2, q2, q1
@@ -540,26 +540,26 @@ define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_zext(<8 x i8> %x) {
; CHECK-NEXT: vmov q2[2], q2[0], r1, r0
; CHECK-NEXT: vmov.u16 r3, q0[2]
; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r0, r1, d5
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov.u16 r2, q0[3]
-; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vmov r1, r2, d4
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov.u16 r1, q0[3]
+; CHECK-NEXT: vmov q2[2], q2[0], r3, r1
; CHECK-NEXT: vmov.u16 r3, q0[4]
; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov r2, s10
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov.u16 r2, q0[5]
-; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov r1, s10
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov.u16 r1, q0[5]
+; CHECK-NEXT: vmov q2[2], q2[0], r3, r1
; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov r2, r3, d5
-; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov r1, r3, d5
+; CHECK-NEXT: adds r0, r0, r1
+; CHECK-NEXT: adc.w r1, r2, r3
; CHECK-NEXT: vmov.u16 r2, q0[7]
-; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.u16 r3, q0[6]
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: vand q0, q0, q1
@@ -648,8 +648,8 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i64 q1, #0xff
; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vmov r0, r1, d1
-; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: vmov r0, s2
+; CHECK-NEXT: vmov r2, r1, d0
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: bx lr
entry:
@@ -834,8 +834,8 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, i64 %a) {
; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
; CHECK-NEXT: vmov.i64 q1, #0xffff
; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, r12, d5
-; CHECK-NEXT: vmov r3, s8
+; CHECK-NEXT: vmov r2, s10
+; CHECK-NEXT: vmov r3, r12, d4
; CHECK-NEXT: add.w lr, r3, r2
; CHECK-NEXT: vmov.u16 r3, q0[3]
; CHECK-NEXT: vmov.u16 r2, q0[2]
@@ -943,8 +943,8 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, i64 %a) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i64 q1, #0xffff
; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vmov r2, r12, d1
-; CHECK-NEXT: vmov r3, s0
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: vmov r3, r12, d0
; CHECK-NEXT: add r2, r3
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r12
@@ -1130,8 +1130,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) {
; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
; CHECK-NEXT: vmov.i64 q1, #0xff
; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, r12, d5
-; CHECK-NEXT: vmov r3, s8
+; CHECK-NEXT: vmov r2, s10
+; CHECK-NEXT: vmov r3, r12, d4
; CHECK-NEXT: add.w lr, r3, r2
; CHECK-NEXT: vmov.u8 r3, q0[3]
; CHECK-NEXT: vmov.u8 r2, q0[2]
@@ -1283,8 +1283,8 @@ define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_acc_zext(<8 x i8> %x, i64 %a) {
; CHECK-NEXT: vmov.u16 r3, q0[0]
; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, r12, d5
-; CHECK-NEXT: vmov r3, s8
+; CHECK-NEXT: vmov r2, s10
+; CHECK-NEXT: vmov r3, r12, d4
; CHECK-NEXT: add.w lr, r3, r2
; CHECK-NEXT: vmov.u16 r3, q0[3]
; CHECK-NEXT: vmov.u16 r2, q0[2]
@@ -1402,8 +1402,8 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, i64 %a) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i64 q1, #0xff
; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vmov r2, r12, d1
-; CHECK-NEXT: vmov r3, s0
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: vmov r3, r12, d0
; CHECK-NEXT: add r2, r3
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r12
diff --git a/llvm/test/CodeGen/WebAssembly/fast-isel-call-indirect64.ll b/llvm/test/CodeGen/WebAssembly/fast-isel-call-indirect64.ll
deleted file mode 100644
index 8224c3b..0000000
--- a/llvm/test/CodeGen/WebAssembly/fast-isel-call-indirect64.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: llc < %s -fast-isel --mtriple=wasm64 -asm-verbose=false -wasm-keep-registers | FileCheck %s
-
-target triple = "wasm64"
-
-; Ensure fast isel also lowers function pointers to 32-bit.
-
-; CHECK: local.get $push[[L0:[0-9]+]]=, 0
-; CHECK-NEXT: i32.wrap_i64 $push[[L1:[0-9]+]]=, $pop[[L0]]
-; CHECK-NEXT: call_indirect $pop[[L1]]
-
-define hidden void @f(ptr %g) {
- call void %g()
- ret void
-}
diff --git a/llvm/test/CodeGen/WebAssembly/function-pointer64.ll b/llvm/test/CodeGen/WebAssembly/function-pointer64.ll
index c7c90f6..7f98d3e 100644
--- a/llvm/test/CodeGen/WebAssembly/function-pointer64.ll
+++ b/llvm/test/CodeGen/WebAssembly/function-pointer64.ll
@@ -34,7 +34,6 @@ entry:
; CHECK: .functype foo (i64) -> ()
; CHECK-NEXT: i32.const 1
; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i32.wrap_i64
; CHECK-NEXT: call_indirect (i32) -> ()
; REF: call_indirect __indirect_function_table, (i32) -> ()
@@ -53,10 +52,10 @@ entry:
; YAML: - Type: CODE
; YAML: - Type: R_WASM_TABLE_INDEX_SLEB64
; YAML-NEXT: Index: 0
-; YAML-NEXT: Offset: 0x16
+; YAML-NEXT: Offset: 0x15
; YAML: - Type: R_WASM_TABLE_INDEX_SLEB64
; YAML-NEXT: Index: 0
-; YAML-NEXT: Offset: 0x29
+; YAML-NEXT: Offset: 0x28
; YAML: - Type: DATA
; YAML: - Type: R_WASM_TABLE_INDEX_I64
diff --git a/llvm/test/CodeGen/WebAssembly/half-precision.ll b/llvm/test/CodeGen/WebAssembly/half-precision.ll
index 89e9c42..d9d3f6b 100644
--- a/llvm/test/CodeGen/WebAssembly/half-precision.ll
+++ b/llvm/test/CodeGen/WebAssembly/half-precision.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s --mtriple=wasm32-unknown-unknown -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+half-precision | FileCheck %s
-; RUN: llc < %s --mtriple=wasm64-unknown-unknown -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+half-precision | FileCheck %s
+; RUN: llc < %s --mtriple=wasm32-unknown-unknown -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+half-precision,+simd128 | FileCheck %s
+; RUN: llc < %s --mtriple=wasm64-unknown-unknown -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+half-precision,+simd128 | FileCheck %s
declare float @llvm.wasm.loadf32.f16(ptr)
declare void @llvm.wasm.storef16.f32(float, ptr)
@@ -19,3 +19,19 @@ define void @stf16_32(float %v, ptr %p) {
tail call void @llvm.wasm.storef16.f32(float %v, ptr %p)
ret void
}
+
+; CHECK-LABEL: splat_v8f16:
+; CHECK: f16x8.splat $push0=, $0
+; CHECK-NEXT: return $pop0
+define <8 x half> @splat_v8f16(float %x) {
+ %v = call <8 x half> @llvm.wasm.splat.f16x8(float %x)
+ ret <8 x half> %v
+}
+
+; CHECK-LABEL: extract_lane_v8f16:
+; CHECK: f16x8.extract_lane $push0=, $0, 1
+; CHECK-NEXT: return $pop0
+define float @extract_lane_v8f16(<8 x half> %v) {
+ %r = call float @llvm.wasm.extract.lane.f16x8(<8 x half> %v, i32 1)
+ ret float %r
+}
diff --git a/llvm/test/CodeGen/X86/abds-vector-128.ll b/llvm/test/CodeGen/X86/abds-vector-128.ll
index 3143bf6..bcb4200 100644
--- a/llvm/test/CodeGen/X86/abds-vector-128.ll
+++ b/llvm/test/CodeGen/X86/abds-vector-128.ll
@@ -12,14 +12,12 @@
define <16 x i8> @abd_ext_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: abd_ext_v16i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psubb %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: psubb %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubb %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_ext_v16i8:
@@ -47,14 +45,12 @@ define <16 x i8> @abd_ext_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
define <16 x i8> @abd_ext_v16i8_undef(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: abd_ext_v16i8_undef:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psubb %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: psubb %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubb %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_ext_v16i8_undef:
@@ -128,14 +124,12 @@ define <8 x i16> @abd_ext_v8i16_undef(<8 x i16> %a, <8 x i16> %b) nounwind {
define <4 x i32> @abd_ext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: abd_ext_v4i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psubd %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: psubd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubd %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_ext_v4i32:
@@ -163,14 +157,12 @@ define <4 x i32> @abd_ext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
define <4 x i32> @abd_ext_v4i32_undef(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: abd_ext_v4i32_undef:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psubd %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: psubd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubd %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_ext_v4i32_undef:
@@ -198,61 +190,48 @@ define <4 x i32> @abd_ext_v4i32_undef(<4 x i32> %a, <4 x i32> %b) nounwind {
define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: abd_ext_v2i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; SSE2-NEXT: movq %xmm2, %rax
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: sarq $63, %rcx
-; SSE2-NEXT: movq %xmm0, %rdx
-; SSE2-NEXT: movq %rdx, %rsi
-; SSE2-NEXT: sarq $63, %rsi
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE2-NEXT: movq %xmm0, %rdi
-; SSE2-NEXT: movq %rdi, %r8
-; SSE2-NEXT: sarq $63, %r8
-; SSE2-NEXT: movq %xmm1, %r9
-; SSE2-NEXT: movq %r9, %r10
-; SSE2-NEXT: sarq $63, %r10
-; SSE2-NEXT: subq %r9, %rdx
-; SSE2-NEXT: sbbq %r10, %rsi
-; SSE2-NEXT: subq %rdi, %rax
-; SSE2-NEXT: sbbq %r8, %rcx
-; SSE2-NEXT: sarq $63, %rcx
-; SSE2-NEXT: xorq %rcx, %rax
-; SSE2-NEXT: subq %rcx, %rax
-; SSE2-NEXT: sarq $63, %rsi
-; SSE2-NEXT: xorq %rsi, %rdx
-; SSE2-NEXT: subq %rsi, %rdx
-; SSE2-NEXT: movq %rdx, %xmm0
-; SSE2-NEXT: movq %rax, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: psubq %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubq %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_ext_v2i64:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: pcmpgtq %xmm1, %xmm2
-; SSE42-NEXT: movdqa %xmm0, %xmm3
-; SSE42-NEXT: psubq %xmm1, %xmm3
-; SSE42-NEXT: psubq %xmm0, %xmm1
+; SSE42-NEXT: psubq %xmm1, %xmm0
+; SSE42-NEXT: pxor %xmm2, %xmm0
+; SSE42-NEXT: psubq %xmm0, %xmm2
; SSE42-NEXT: movdqa %xmm2, %xmm0
-; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: abd_ext_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_ext_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_ext_v2i64:
@@ -272,61 +251,48 @@ define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: abd_ext_v2i64_undef:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; SSE2-NEXT: movq %xmm2, %rax
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: sarq $63, %rcx
-; SSE2-NEXT: movq %xmm0, %rdx
-; SSE2-NEXT: movq %rdx, %rsi
-; SSE2-NEXT: sarq $63, %rsi
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE2-NEXT: movq %xmm0, %rdi
-; SSE2-NEXT: movq %rdi, %r8
-; SSE2-NEXT: sarq $63, %r8
-; SSE2-NEXT: movq %xmm1, %r9
-; SSE2-NEXT: movq %r9, %r10
-; SSE2-NEXT: sarq $63, %r10
-; SSE2-NEXT: subq %r9, %rdx
-; SSE2-NEXT: sbbq %r10, %rsi
-; SSE2-NEXT: subq %rdi, %rax
-; SSE2-NEXT: sbbq %r8, %rcx
-; SSE2-NEXT: sarq $63, %rcx
-; SSE2-NEXT: xorq %rcx, %rax
-; SSE2-NEXT: subq %rcx, %rax
-; SSE2-NEXT: sarq $63, %rsi
-; SSE2-NEXT: xorq %rsi, %rdx
-; SSE2-NEXT: subq %rsi, %rdx
-; SSE2-NEXT: movq %rdx, %xmm0
-; SSE2-NEXT: movq %rax, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: psubq %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubq %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_ext_v2i64_undef:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: pcmpgtq %xmm1, %xmm2
-; SSE42-NEXT: movdqa %xmm0, %xmm3
-; SSE42-NEXT: psubq %xmm1, %xmm3
-; SSE42-NEXT: psubq %xmm0, %xmm1
+; SSE42-NEXT: psubq %xmm1, %xmm0
+; SSE42-NEXT: pxor %xmm2, %xmm0
+; SSE42-NEXT: psubq %xmm0, %xmm2
; SSE42-NEXT: movdqa %xmm2, %xmm0
-; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: abd_ext_v2i64_undef:
; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_ext_v2i64_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_ext_v2i64_undef:
@@ -350,14 +316,12 @@ define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind {
define <16 x i8> @abd_minmax_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: abd_minmax_v16i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psubb %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: psubb %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubb %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_minmax_v16i8:
@@ -404,14 +368,12 @@ define <8 x i16> @abd_minmax_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
define <4 x i32> @abd_minmax_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: abd_minmax_v4i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psubd %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: psubd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubd %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_minmax_v4i32:
@@ -445,47 +407,40 @@ define <2 x i64> @abd_minmax_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: psubq %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubq %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_minmax_v2i64:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: pcmpgtq %xmm1, %xmm2
-; SSE42-NEXT: movdqa %xmm0, %xmm3
-; SSE42-NEXT: psubq %xmm1, %xmm3
-; SSE42-NEXT: psubq %xmm0, %xmm1
+; SSE42-NEXT: psubq %xmm1, %xmm0
+; SSE42-NEXT: pxor %xmm2, %xmm0
+; SSE42-NEXT: psubq %xmm0, %xmm2
; SSE42-NEXT: movdqa %xmm2, %xmm0
-; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: abd_minmax_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_minmax_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_minmax_v2i64:
@@ -507,14 +462,12 @@ define <2 x i64> @abd_minmax_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
define <16 x i8> @abd_cmp_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: abd_cmp_v16i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psubb %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: psubb %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubb %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_cmp_v16i8:
@@ -563,14 +516,12 @@ define <8 x i16> @abd_cmp_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
define <4 x i32> @abd_cmp_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: abd_cmp_v4i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psubd %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: psubd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubd %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_cmp_v4i32:
@@ -598,9 +549,9 @@ define <2 x i64> @abd_cmp_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: abd_cmp_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
@@ -609,12 +560,9 @@ define <2 x i64> @abd_cmp_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-NEXT: pand %xmm5, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psubq %xmm1, %xmm3
-; SSE2-NEXT: psubq %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: psubq %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubq %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
@@ -622,28 +570,26 @@ define <2 x i64> @abd_cmp_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: pcmpgtq %xmm1, %xmm2
-; SSE42-NEXT: movdqa %xmm0, %xmm3
-; SSE42-NEXT: psubq %xmm1, %xmm3
-; SSE42-NEXT: psubq %xmm0, %xmm1
+; SSE42-NEXT: psubq %xmm1, %xmm0
+; SSE42-NEXT: pxor %xmm2, %xmm0
+; SSE42-NEXT: psubq %xmm0, %xmm2
; SSE42-NEXT: movdqa %xmm2, %xmm0
-; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: abd_cmp_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_cmp_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_cmp_v2i64:
@@ -790,50 +736,52 @@ define <2 x i64> @abd_subnsw_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
define <2 x i64> @abd_cmp_v2i64_multiuse_cmp(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: abd_cmp_v2i64_multiuse_cmp:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psubq %xmm1, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: psubq %xmm0, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: psubq %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: psubq %xmm0, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: paddq %xmm1, %xmm0
+; SSE2-NEXT: paddq %xmm4, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_cmp_v2i64_multiuse_cmp:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm2
-; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE42-NEXT: movdqa %xmm2, %xmm3
+; SSE42-NEXT: pcmpgtq %xmm1, %xmm2
+; SSE42-NEXT: movdqa %xmm0, %xmm3
; SSE42-NEXT: psubq %xmm1, %xmm3
-; SSE42-NEXT: movdqa %xmm1, %xmm4
-; SSE42-NEXT: psubq %xmm2, %xmm4
-; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm4
-; SSE42-NEXT: pcmpgtq %xmm2, %xmm1
+; SSE42-NEXT: pxor %xmm2, %xmm3
+; SSE42-NEXT: psubq %xmm3, %xmm2
+; SSE42-NEXT: pcmpgtq %xmm0, %xmm1
; SSE42-NEXT: pcmpeqd %xmm0, %xmm0
; SSE42-NEXT: pxor %xmm1, %xmm0
-; SSE42-NEXT: paddq %xmm4, %xmm0
+; SSE42-NEXT: paddq %xmm2, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: abd_cmp_v2i64_multiuse_cmp:
; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm4
-; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm4, %xmm2
+; AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm3
+; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -844,8 +792,8 @@ define <2 x i64> @abd_cmp_v2i64_multiuse_cmp(<2 x i64> %a, <2 x i64> %b) nounwin
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm4
-; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm4, %xmm2
+; AVX2-NEXT: vpxor %xmm2, %xmm3, %xmm3
+; AVX2-NEXT: vpsubq %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/abds-vector-256.ll b/llvm/test/CodeGen/X86/abds-vector-256.ll
index 78190d2..cc63ad0 100644
--- a/llvm/test/CodeGen/X86/abds-vector-256.ll
+++ b/llvm/test/CodeGen/X86/abds-vector-256.ll
@@ -223,22 +223,22 @@ define <4 x i64> @abd_ext_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm5
-; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vblendvpd %xmm4, %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_ext_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm3
-; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_ext_v4i64:
@@ -261,22 +261,22 @@ define <4 x i64> @abd_ext_v4i64_undef(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm5
-; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vblendvpd %xmm4, %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_ext_v4i64_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm3
-; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_ext_v4i64_undef:
@@ -402,22 +402,22 @@ define <4 x i64> @abd_minmax_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm5
-; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vblendvpd %xmm4, %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_minmax_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm3
-; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_minmax_v4i64:
@@ -544,22 +544,22 @@ define <4 x i64> @abd_cmp_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm5
-; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vblendvpd %xmm4, %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_cmp_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm3
-; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_cmp_v4i64:
diff --git a/llvm/test/CodeGen/X86/abdu-vector-128.ll b/llvm/test/CodeGen/X86/abdu-vector-128.ll
index 0c33e89..78b315a 100644
--- a/llvm/test/CodeGen/X86/abdu-vector-128.ll
+++ b/llvm/test/CodeGen/X86/abdu-vector-128.ll
@@ -125,12 +125,10 @@ define <4 x i32> @abd_ext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: psubd %xmm0, %xmm3
; SSE2-NEXT: psubd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubd %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_ext_v4i32:
@@ -163,12 +161,10 @@ define <4 x i32> @abd_ext_v4i32_undef(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: psubd %xmm0, %xmm3
; SSE2-NEXT: psubd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubd %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_ext_v4i32_undef:
@@ -196,27 +192,22 @@ define <4 x i32> @abd_ext_v4i32_undef(<4 x i32> %a, <4 x i32> %b) nounwind {
define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: abd_ext_v2i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; SSE2-NEXT: movq %xmm2, %rax
-; SSE2-NEXT: movq %xmm0, %rcx
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE2-NEXT: movq %xmm0, %rdx
-; SSE2-NEXT: movq %xmm1, %rsi
-; SSE2-NEXT: xorl %edi, %edi
-; SSE2-NEXT: subq %rsi, %rcx
-; SSE2-NEXT: movl $0, %esi
-; SSE2-NEXT: sbbq %rsi, %rsi
-; SSE2-NEXT: subq %rdx, %rax
-; SSE2-NEXT: sbbq %rdi, %rdi
-; SSE2-NEXT: sarq $63, %rdi
-; SSE2-NEXT: xorq %rdi, %rax
-; SSE2-NEXT: subq %rdi, %rax
-; SSE2-NEXT: sarq $63, %rsi
-; SSE2-NEXT: xorq %rsi, %rcx
-; SSE2-NEXT: subq %rsi, %rcx
-; SSE2-NEXT: movq %rcx, %xmm0
-; SSE2-NEXT: movq %rax, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: psubq %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubq %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_ext_v2i64:
@@ -226,12 +217,10 @@ define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE42-NEXT: pxor %xmm2, %xmm3
; SSE42-NEXT: pxor %xmm0, %xmm2
; SSE42-NEXT: pcmpgtq %xmm3, %xmm2
-; SSE42-NEXT: movdqa %xmm0, %xmm3
-; SSE42-NEXT: psubq %xmm1, %xmm3
-; SSE42-NEXT: psubq %xmm0, %xmm1
+; SSE42-NEXT: psubq %xmm1, %xmm0
+; SSE42-NEXT: pxor %xmm2, %xmm0
+; SSE42-NEXT: psubq %xmm0, %xmm2
; SSE42-NEXT: movdqa %xmm2, %xmm0
-; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: abd_ext_v2i64:
@@ -241,9 +230,9 @@ define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_ext_v2i64:
@@ -252,9 +241,9 @@ define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_ext_v2i64:
@@ -274,27 +263,22 @@ define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: abd_ext_v2i64_undef:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; SSE2-NEXT: movq %xmm2, %rax
-; SSE2-NEXT: movq %xmm0, %rcx
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE2-NEXT: movq %xmm0, %rdx
-; SSE2-NEXT: movq %xmm1, %rsi
-; SSE2-NEXT: xorl %edi, %edi
-; SSE2-NEXT: subq %rsi, %rcx
-; SSE2-NEXT: movl $0, %esi
-; SSE2-NEXT: sbbq %rsi, %rsi
-; SSE2-NEXT: subq %rdx, %rax
-; SSE2-NEXT: sbbq %rdi, %rdi
-; SSE2-NEXT: sarq $63, %rdi
-; SSE2-NEXT: xorq %rdi, %rax
-; SSE2-NEXT: subq %rdi, %rax
-; SSE2-NEXT: sarq $63, %rsi
-; SSE2-NEXT: xorq %rsi, %rcx
-; SSE2-NEXT: subq %rsi, %rcx
-; SSE2-NEXT: movq %rcx, %xmm0
-; SSE2-NEXT: movq %rax, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: psubq %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubq %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_ext_v2i64_undef:
@@ -304,12 +288,10 @@ define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE42-NEXT: pxor %xmm2, %xmm3
; SSE42-NEXT: pxor %xmm0, %xmm2
; SSE42-NEXT: pcmpgtq %xmm3, %xmm2
-; SSE42-NEXT: movdqa %xmm0, %xmm3
-; SSE42-NEXT: psubq %xmm1, %xmm3
-; SSE42-NEXT: psubq %xmm0, %xmm1
+; SSE42-NEXT: psubq %xmm1, %xmm0
+; SSE42-NEXT: pxor %xmm2, %xmm0
+; SSE42-NEXT: psubq %xmm0, %xmm2
; SSE42-NEXT: movdqa %xmm2, %xmm0
-; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: abd_ext_v2i64_undef:
@@ -319,9 +301,9 @@ define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_ext_v2i64_undef:
@@ -330,9 +312,9 @@ define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_ext_v2i64_undef:
@@ -411,12 +393,10 @@ define <4 x i32> @abd_minmax_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: psubd %xmm0, %xmm3
; SSE2-NEXT: psubd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubd %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_minmax_v4i32:
@@ -450,19 +430,14 @@ define <2 x i64> @abd_minmax_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: psubq %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubq %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_minmax_v2i64:
@@ -472,12 +447,10 @@ define <2 x i64> @abd_minmax_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE42-NEXT: pxor %xmm2, %xmm3
; SSE42-NEXT: pxor %xmm0, %xmm2
; SSE42-NEXT: pcmpgtq %xmm3, %xmm2
-; SSE42-NEXT: movdqa %xmm0, %xmm3
-; SSE42-NEXT: psubq %xmm1, %xmm3
-; SSE42-NEXT: psubq %xmm0, %xmm1
+; SSE42-NEXT: psubq %xmm1, %xmm0
+; SSE42-NEXT: pxor %xmm2, %xmm0
+; SSE42-NEXT: psubq %xmm0, %xmm2
; SSE42-NEXT: movdqa %xmm2, %xmm0
-; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: abd_minmax_v2i64:
@@ -487,9 +460,9 @@ define <2 x i64> @abd_minmax_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_minmax_v2i64:
@@ -498,9 +471,9 @@ define <2 x i64> @abd_minmax_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_minmax_v2i64:
@@ -579,12 +552,10 @@ define <4 x i32> @abd_cmp_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: psubd %xmm0, %xmm3
; SSE2-NEXT: psubd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubd %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_cmp_v4i32:
@@ -612,9 +583,9 @@ define <2 x i64> @abd_cmp_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: abd_cmp_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
@@ -623,12 +594,9 @@ define <2 x i64> @abd_cmp_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-NEXT: pand %xmm5, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psubq %xmm1, %xmm3
-; SSE2-NEXT: psubq %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: psubq %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubq %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
@@ -639,12 +607,10 @@ define <2 x i64> @abd_cmp_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE42-NEXT: pxor %xmm2, %xmm3
; SSE42-NEXT: pxor %xmm0, %xmm2
; SSE42-NEXT: pcmpgtq %xmm3, %xmm2
-; SSE42-NEXT: movdqa %xmm0, %xmm3
-; SSE42-NEXT: psubq %xmm1, %xmm3
-; SSE42-NEXT: psubq %xmm0, %xmm1
+; SSE42-NEXT: psubq %xmm1, %xmm0
+; SSE42-NEXT: pxor %xmm2, %xmm0
+; SSE42-NEXT: psubq %xmm0, %xmm2
; SSE42-NEXT: movdqa %xmm2, %xmm0
-; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: abd_cmp_v2i64:
@@ -654,9 +620,9 @@ define <2 x i64> @abd_cmp_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_cmp_v2i64:
@@ -665,9 +631,9 @@ define <2 x i64> @abd_cmp_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_cmp_v2i64:
@@ -692,63 +658,59 @@ define <2 x i64> @abd_cmp_v2i64_multiuse_cmp(<2 x i64> %a, <2 x i64> %b) nounwin
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psubq %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: psubq %xmm0, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: pxor %xmm3, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pandn %xmm3, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: paddq %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: psubq %xmm2, %xmm0
+; SSE2-NEXT: paddq %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_cmp_v2i64_multiuse_cmp:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: psubq %xmm1, %xmm2
-; SSE42-NEXT: movdqa %xmm1, %xmm3
-; SSE42-NEXT: psubq %xmm0, %xmm3
-; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
-; SSE42-NEXT: pxor %xmm4, %xmm1
-; SSE42-NEXT: pxor %xmm4, %xmm0
+; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; SSE42-NEXT: pxor %xmm3, %xmm1
+; SSE42-NEXT: pxor %xmm3, %xmm0
; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE42-NEXT: paddq %xmm3, %xmm0
+; SSE42-NEXT: pxor %xmm0, %xmm2
+; SSE42-NEXT: movdqa %xmm0, %xmm1
+; SSE42-NEXT: psubq %xmm2, %xmm1
+; SSE42-NEXT: paddq %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: abd_cmp_v2i64_multiuse_cmp:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm3
-; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: # xmm4 = mem[0,0]
-; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm3 = mem[0,0]
+; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm1
+; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm1
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_cmp_v2i64_multiuse_cmp:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm3
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm1
-; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm1
+; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm1
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/abdu-vector-256.ll b/llvm/test/CodeGen/X86/abdu-vector-256.ll
index 884515c..080fb77 100644
--- a/llvm/test/CodeGen/X86/abdu-vector-256.ll
+++ b/llvm/test/CodeGen/X86/abdu-vector-256.ll
@@ -227,15 +227,15 @@ define <4 x i64> @abd_ext_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm6
-; AVX1-NEXT: vpsubq %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -245,9 +245,9 @@ define <4 x i64> @abd_ext_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm3
-; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_ext_v4i64:
@@ -274,15 +274,15 @@ define <4 x i64> @abd_ext_v4i64_undef(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm6
-; AVX1-NEXT: vpsubq %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -292,9 +292,9 @@ define <4 x i64> @abd_ext_v4i64_undef(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm3
-; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_ext_v4i64_undef:
@@ -424,15 +424,15 @@ define <4 x i64> @abd_minmax_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm6
-; AVX1-NEXT: vpsubq %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -442,9 +442,9 @@ define <4 x i64> @abd_minmax_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm3
-; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_minmax_v4i64:
@@ -575,15 +575,15 @@ define <4 x i64> @abd_cmp_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm6
-; AVX1-NEXT: vpsubq %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -593,9 +593,9 @@ define <4 x i64> @abd_cmp_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm3
-; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_cmp_v4i64:
diff --git a/llvm/test/CodeGen/X86/avx512-cmp-kor-sequence.ll b/llvm/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
index bb86f30..b4ba239 100644
--- a/llvm/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
+++ b/llvm/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
@@ -48,5 +48,5 @@ entry:
; Function Attrs: nounwind readnone
declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, <16 x i1>, i32) #1
-attributes #0 = { nounwind readnone uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="knl" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512er,+avx512f,+avx512pf,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prefetchwt1,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readnone uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="broadwell" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512f,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+evex512,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-vzeroupper" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll
index 8d09497..77053e2 100644
--- a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll
+++ b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll
@@ -268,30 +268,6 @@ define void @gather_qps(<8 x i64> %ind, <8 x float> %src, ptr %base, ptr %stbuf)
ret void
}
-declare void @llvm.x86.avx512.gatherpf.qps.512(i8, <8 x i64>, ptr , i32, i32);
-declare void @llvm.x86.avx512.scatterpf.qps.512(i8, <8 x i64>, ptr , i32, i32);
-define void @prefetch(<8 x i64> %ind, ptr %base) {
-; CHECK-LABEL: prefetch:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k1
-; CHECK-NEXT: vgatherpf0qps (%rdi,%zmm0,4) {%k1}
-; CHECK-NEXT: kxorw %k0, %k0, %k1
-; CHECK-NEXT: vgatherpf1qps (%rdi,%zmm0,4) {%k1}
-; CHECK-NEXT: movb $1, %al
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vscatterpf0qps (%rdi,%zmm0,2) {%k1}
-; CHECK-NEXT: movb $120, %al
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vscatterpf1qps (%rdi,%zmm0,2) {%k1}
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
- call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, ptr %base, i32 4, i32 3)
- call void @llvm.x86.avx512.gatherpf.qps.512(i8 0, <8 x i64> %ind, ptr %base, i32 4, i32 2)
- call void @llvm.x86.avx512.scatterpf.qps.512(i8 1, <8 x i64> %ind, ptr %base, i32 2, i32 3)
- call void @llvm.x86.avx512.scatterpf.qps.512(i8 120, <8 x i64> %ind, ptr %base, i32 2, i32 2)
- ret void
-}
-
declare <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double>, ptr, <2 x i64>, i8, i32)
define <2 x double>@test_int_x86_avx512_gather3div2_df(<2 x double> %x0, ptr %x1, <2 x i64> %x2, i8 %x3) {
diff --git a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
index acbf438..df71e3c 100644
--- a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
+++ b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
@@ -265,30 +265,6 @@ define dso_local void @gather_qps(<8 x i64> %ind, <8 x float> %src, ptr %base, p
ret void
}
-declare void @llvm.x86.avx512.gatherpf.qps.512(i8, <8 x i64>, ptr , i32, i32);
-declare void @llvm.x86.avx512.scatterpf.qps.512(i8, <8 x i64>, ptr , i32, i32);
-define dso_local void @prefetch(<8 x i64> %ind, ptr %base) {
-; CHECK-LABEL: prefetch:
-; CHECK: # %bb.0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k1
-; CHECK-NEXT: vgatherpf0qps (%rdi,%zmm0,4) {%k1}
-; CHECK-NEXT: kxorw %k0, %k0, %k1
-; CHECK-NEXT: vgatherpf1qps (%rdi,%zmm0,4) {%k1}
-; CHECK-NEXT: movb $1, %al
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vscatterpf0qps (%rdi,%zmm0,2) {%k1}
-; CHECK-NEXT: movb $120, %al
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vscatterpf1qps (%rdi,%zmm0,2) {%k1}
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
- call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, ptr %base, i32 4, i32 3)
- call void @llvm.x86.avx512.gatherpf.qps.512(i8 0, <8 x i64> %ind, ptr %base, i32 4, i32 2)
- call void @llvm.x86.avx512.scatterpf.qps.512(i8 1, <8 x i64> %ind, ptr %base, i32 2, i32 3)
- call void @llvm.x86.avx512.scatterpf.qps.512(i8 120, <8 x i64> %ind, ptr %base, i32 2, i32 2)
- ret void
-}
-
define <2 x double> @test_int_x86_avx512_mask_gather3div2_df(<2 x double> %x0, ptr %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3div2_df:
; CHECK: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/avx512er-intrinsics.ll b/llvm/test/CodeGen/X86/avx512er-intrinsics.ll
deleted file mode 100644
index fa4025f..0000000
--- a/llvm/test/CodeGen/X86/avx512er-intrinsics.ll
+++ /dev/null
@@ -1,306 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512er --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512er --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
-
-define <16 x float> @test_rsqrt28_ps(<16 x float> %a0) {
-; CHECK-LABEL: test_rsqrt28_ps:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vrsqrt28ps {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xcc,0xc0]
-; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
- ret <16 x float> %res
-}
-
-define <16 x float> @test1_rsqrt28_ps(<16 x float> %a0, <16 x float> %a1) {
-; CHECK-LABEL: test1_rsqrt28_ps:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movw $6, %ax # encoding: [0x66,0xb8,0x06,0x00]
-; CHECK-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; CHECK-NEXT: vrsqrt28ps {sae}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x19,0xcc,0xc8]
-; CHECK-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
-; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> %a1, i16 6, i32 8)
- ret <16 x float> %res
-}
-
-define <16 x float> @test2_rsqrt28_ps(<16 x float> %a0) {
-; CHECK-LABEL: test2_rsqrt28_ps:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movw $6, %ax # encoding: [0x66,0xb8,0x06,0x00]
-; CHECK-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; CHECK-NEXT: vrsqrt28ps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcc,0xc0]
-; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> undef, i16 6, i32 4)
- ret <16 x float> %res
-}
-
-define <16 x float> @test3_rsqrt28_ps(<16 x float> %a0) {
-; CHECK-LABEL: test3_rsqrt28_ps:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movw $6, %ax # encoding: [0x66,0xb8,0x06,0x00]
-; CHECK-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; CHECK-NEXT: vrsqrt28ps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcc,0xc0]
-; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 6, i32 4)
- ret <16 x float> %res
-}
-
-define <16 x float> @test4_rsqrt28_ps(<16 x float> %a0) {
-; CHECK-LABEL: test4_rsqrt28_ps:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movw $6, %ax # encoding: [0x66,0xb8,0x06,0x00]
-; CHECK-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; CHECK-NEXT: vrsqrt28ps {sae}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x99,0xcc,0xc0]
-; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> undef, i16 6, i32 8)
- ret <16 x float> %res
-}
-
-declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
-
-define <16 x float> @test_rcp28_ps_512(<16 x float> %a0) {
-; CHECK-LABEL: test_rcp28_ps_512:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vrcp28ps {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xca,0xc0]
-; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
- ret <16 x float> %res
-}
-declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
-
-define <8 x double> @test_rcp28_pd_512(<8 x double> %a0) {
-; CHECK-LABEL: test_rcp28_pd_512:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vrcp28pd {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x18,0xca,0xc0]
-; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8)
- ret <8 x double> %res
-}
-declare <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone
-
-define <16 x float> @test_exp2_ps_512(<16 x float> %a0) {
-; CHECK-LABEL: test_exp2_ps_512:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vexp2ps {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xc8,0xc0]
-; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <16 x float> @llvm.x86.avx512.exp2.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
- ret <16 x float> %res
-}
-declare <16 x float> @llvm.x86.avx512.exp2.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
-
-define <8 x double> @test_exp2_pd_512(<8 x double> %a0) {
-; CHECK-LABEL: test_exp2_pd_512:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vexp2pd {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x18,0xc8,0xc0]
-; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8)
- ret <8 x double> %res
-}
-declare <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone
-
-define <4 x float> @test_rsqrt28_ss(<4 x float> %a0) {
-; CHECK-LABEL: test_rsqrt28_ss:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vrsqrt28ss {sae}, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0]
-; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
- ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
-
-define <4 x float> @test_rcp28_ss(<4 x float> %a0) {
-; CHECK-LABEL: test_rcp28_ss:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vrcp28ss {sae}, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xcb,0xc0]
-; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
- ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
-
-define <4 x float> @test_rcp28_ss_load(<4 x float> %a0, ptr %a1ptr) {
-; X86-LABEL: test_rcp28_ss_load:
-; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT: vrcp28ss (%eax), %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x08,0xcb,0x00]
-; X86-NEXT: retl # encoding: [0xc3]
-;
-; X64-LABEL: test_rcp28_ss_load:
-; X64: # %bb.0:
-; X64-NEXT: vrcp28ss (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x08,0xcb,0x07]
-; X64-NEXT: retq # encoding: [0xc3]
- %a1 = load <4 x float>, ptr %a1ptr
- %res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4) ; <<4 x float>> [#uses=1]
- ret <4 x float> %res
-}
-
-define <4 x float> @test_rsqrt28_ss_load(<4 x float> %a0, ptr %a1ptr) {
-; X86-LABEL: test_rsqrt28_ss_load:
-; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT: vrsqrt28ss (%eax), %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x08,0xcd,0x00]
-; X86-NEXT: retl # encoding: [0xc3]
-;
-; X64-LABEL: test_rsqrt28_ss_load:
-; X64: # %bb.0:
-; X64-NEXT: vrsqrt28ss (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x08,0xcd,0x07]
-; X64-NEXT: retq # encoding: [0xc3]
- %a1 = load <4 x float>, ptr %a1ptr
- %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4) ; <<4 x float>> [#uses=1]
- ret <4 x float> %res
-}
-
-define <4 x float> @test_rsqrt28_ss_maskz(<4 x float> %a0, i8 %mask) {
-; X86-LABEL: test_rsqrt28_ss_maskz:
-; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
-; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT: vrsqrt28ss {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x99,0xcd,0xc0]
-; X86-NEXT: retl # encoding: [0xc3]
-;
-; X64-LABEL: test_rsqrt28_ss_maskz:
-; X64: # %bb.0:
-; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
-; X64-NEXT: vrsqrt28ss {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x99,0xcd,0xc0]
-; X64-NEXT: retq # encoding: [0xc3]
- %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 %mask, i32 8) ;
- ret <4 x float> %res
-}
-
-define <4 x float> @test_rsqrt28_ss_mask(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0, i8 %mask) {
-; X86-LABEL: test_rsqrt28_ss_mask:
-; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
-; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT: vrsqrt28ss {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x19,0xcd,0xd1]
-; X86-NEXT: vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2]
-; X86-NEXT: retl # encoding: [0xc3]
-;
-; X64-LABEL: test_rsqrt28_ss_mask:
-; X64: # %bb.0:
-; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
-; X64-NEXT: vrsqrt28ss {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x19,0xcd,0xd1]
-; X64-NEXT: vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2]
-; X64-NEXT: retq # encoding: [0xc3]
- %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0, i8 %mask, i32 8) ;
- ret <4 x float> %res
-}
-
-define <2 x double> @test_rcp28_sd_mask_load(<2 x double> %a0, ptr %a1ptr, <2 x double> %a2, i8 %mask) {
-; X86-LABEL: test_rcp28_sd_mask_load:
-; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT: vrcp28sd %xmm0, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xcb,0xc8]
-; X86-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1]
-; X86-NEXT: retl # encoding: [0xc3]
-;
-; X64-LABEL: test_rcp28_sd_mask_load:
-; X64: # %bb.0:
-; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
-; X64-NEXT: vrcp28sd %xmm0, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xcb,0xc8]
-; X64-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1]
-; X64-NEXT: retq # encoding: [0xc3]
- %a1 = load <2 x double>, ptr %a1ptr
- %res = call <2 x double> @llvm.x86.avx512.rcp28.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> %a2, i8 %mask, i32 4) ;
- ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.avx512.rcp28.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
-
-define <2 x double> @test_rsqrt28_sd_maskz_load(<2 x double> %a0, ptr %a1ptr, i8 %mask) {
-; X86-LABEL: test_rsqrt28_sd_maskz_load:
-; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT: vrsqrt28sd %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0xc0]
-; X86-NEXT: retl # encoding: [0xc3]
-;
-; X64-LABEL: test_rsqrt28_sd_maskz_load:
-; X64: # %bb.0:
-; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
-; X64-NEXT: vrsqrt28sd %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0xc0]
-; X64-NEXT: retq # encoding: [0xc3]
- %a1 = load <2 x double>, ptr %a1ptr
- %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> zeroinitializer, i8 %mask, i32 4) ;
- ret <2 x double> %res
-}
-
-define <2 x double> @test_rsqrt28_sd_maskz(<2 x double> %a0, i8 %mask) {
-; X86-LABEL: test_rsqrt28_sd_maskz:
-; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
-; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT: vrsqrt28sd {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x99,0xcd,0xc0]
-; X86-NEXT: retl # encoding: [0xc3]
-;
-; X64-LABEL: test_rsqrt28_sd_maskz:
-; X64: # %bb.0:
-; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
-; X64-NEXT: vrsqrt28sd {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x99,0xcd,0xc0]
-; X64-NEXT: retq # encoding: [0xc3]
- %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> zeroinitializer, i8 %mask, i32 8) ;
- ret <2 x double> %res
-}
-
-define <2 x double> @test_rsqrt28_sd_mask(<2 x double> %a0, <2 x double> %b0, <2 x double> %c0, i8 %mask) {
-; X86-LABEL: test_rsqrt28_sd_mask:
-; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
-; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT: vrsqrt28sd {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x19,0xcd,0xd1]
-; X86-NEXT: vmovapd %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc2]
-; X86-NEXT: retl # encoding: [0xc3]
-;
-; X64-LABEL: test_rsqrt28_sd_mask:
-; X64: # %bb.0:
-; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
-; X64-NEXT: vrsqrt28sd {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x19,0xcd,0xd1]
-; X64-NEXT: vmovapd %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc2]
-; X64-NEXT: retq # encoding: [0xc3]
- %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %b0, <2 x double> %c0, i8 %mask, i32 8) ;
- ret <2 x double> %res
-}
-
-declare <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
-
-define <2 x double> @test_rsqrt28_sd_maskz_mem(<2 x double> %a0, ptr %ptr, i8 %mask) {
-; X86-LABEL: test_rsqrt28_sd_maskz_mem:
-; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT: vrsqrt28sd (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x00]
-; X86-NEXT: retl # encoding: [0xc3]
-;
-; X64-LABEL: test_rsqrt28_sd_maskz_mem:
-; X64: # %bb.0:
-; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
-; X64-NEXT: vrsqrt28sd (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x07]
-; X64-NEXT: retq # encoding: [0xc3]
- %mem = load double , ptr %ptr, align 8
- %mem_v = insertelement <2 x double> undef, double %mem, i32 0
- %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %mem_v, <2 x double> zeroinitializer, i8 %mask, i32 4) ;
- ret <2 x double> %res
-}
-
-define <2 x double> @test_rsqrt28_sd_maskz_mem_offset(<2 x double> %a0, ptr %ptr, i8 %mask) {
-; X86-LABEL: test_rsqrt28_sd_maskz_mem_offset:
-; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT: vrsqrt28sd 144(%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x40,0x12]
-; X86-NEXT: retl # encoding: [0xc3]
-;
-; X64-LABEL: test_rsqrt28_sd_maskz_mem_offset:
-; X64: # %bb.0:
-; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
-; X64-NEXT: vrsqrt28sd 144(%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x47,0x12]
-; X64-NEXT: retq # encoding: [0xc3]
- %ptr1 = getelementptr double, ptr %ptr, i32 18
- %mem = load double , ptr %ptr1, align 8
- %mem_v = insertelement <2 x double> undef, double %mem, i32 0
- %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %mem_v, <2 x double> zeroinitializer, i8 %mask, i32 4) ;
- ret <2 x double> %res
-}
-
diff --git a/llvm/test/CodeGen/X86/coalescer-add-implicit-def-subreg-to-reg-regression.ll b/llvm/test/CodeGen/X86/coalescer-add-implicit-def-subreg-to-reg-regression.ll
new file mode 100644
index 0000000..0e6cb7a
--- /dev/null
+++ b/llvm/test/CodeGen/X86/coalescer-add-implicit-def-subreg-to-reg-regression.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+; Not from issue 76416, but a separate testcase reported against the same
+; regressing commit.
+define void @other_regression(i1 %cmp.not.i.i.i) {
+; CHECK-LABEL: other_regression:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: movl 0, %eax
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: sarl %cl, %eax
+; CHECK-NEXT: movl $1, %edx
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: shrl %cl, %edx
+; CHECK-NEXT: imull %eax, %edx
+; CHECK-NEXT: movslq %edx, %rsi
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: xorl %edi, %edi
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: callq *%rax
+entry:
+ br label %for.cond10.preheader
+
+trap: ; preds = %for.body13
+ unreachable
+
+for.cond10.preheader: ; preds = %while.cond.i.i.i, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ 1, %while.cond.i.i.i ]
+ %i = trunc i64 %indvars.iv to i32
+ br label %for.body13
+
+for.body13: ; preds = %for.cond10.preheader
+ %i1 = load i32, ptr null, align 4
+ %shr = ashr i32 %i1, %i
+ %shr15 = ashr i32 1, %i
+ %mul16 = mul i32 %shr15, %shr
+ %conv = sext i32 %mul16 to i64
+ call void null(ptr null, i64 %conv, ptr null)
+ br i1 false, label %while.cond.i.i.i, label %trap
+
+while.cond.i.i.i: ; preds = %while.cond.i.i.i, %for.body13
+ br i1 %cmp.not.i.i.i, label %for.cond10.preheader, label %while.cond.i.i.i
+}
diff --git a/llvm/test/CodeGen/X86/combine-srem.ll b/llvm/test/CodeGen/X86/combine-srem.ll
index 49ce245..4ed00a9 100644
--- a/llvm/test/CodeGen/X86/combine-srem.ll
+++ b/llvm/test/CodeGen/X86/combine-srem.ll
@@ -329,7 +329,7 @@ define <4 x i32> @combine_vec_srem_by_pow2b_neg(<4 x i32> %x) {
; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psrad $3, %xmm2
-; SSE-NEXT: psrad $1, %xmm1
+; SSE-NEXT: psrld $1, %xmm1
; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
@@ -351,7 +351,7 @@ define <4 x i32> @combine_vec_srem_by_pow2b_neg(<4 x i32> %x) {
; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsrad $3, %xmm1, %xmm3
-; AVX1-NEXT: vpsrad $1, %xmm1, %xmm1
+; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/crc32-target-feature.ll b/llvm/test/CodeGen/X86/crc32-target-feature.ll
index ef4fafc..9dfe27e 100644
--- a/llvm/test/CodeGen/X86/crc32-target-feature.ll
+++ b/llvm/test/CodeGen/X86/crc32-target-feature.ll
@@ -25,5 +25,5 @@ define i32 @test3(i32 %a, i8 %b) nounwind #2 {
declare i32 @llvm.x86.sse42.crc32.32.8(i32, i8) nounwind
attributes #0 = { "target-features"="+crc32" }
-attributes #1 = { "target-features"="+cx8,+fxsr,-3dnow,-3dnowa,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-gfni,-kl,-mmx,-pclmul,-sha,-sse,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-x87,-xop,+crc32" }
-attributes #2 = { "target-features"="+crc32,+cx8,+fxsr,-3dnow,-3dnowa,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-gfni,-kl,-mmx,-pclmul,-sha,-sse,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-x87,-xop" }
+attributes #1 = { "target-features"="+cx8,+fxsr,-3dnow,-3dnowa,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-gfni,-kl,-mmx,-pclmul,-sha,-sse,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-x87,-xop,+crc32" }
+attributes #2 = { "target-features"="+crc32,+cx8,+fxsr,-3dnow,-3dnowa,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-gfni,-kl,-mmx,-pclmul,-sha,-sse,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-x87,-xop" }
diff --git a/llvm/test/CodeGen/X86/fat-lto-section.ll b/llvm/test/CodeGen/X86/fat-lto-section.ll
index 30c5622..f3ca843 100644
--- a/llvm/test/CodeGen/X86/fat-lto-section.ll
+++ b/llvm/test/CodeGen/X86/fat-lto-section.ll
@@ -5,6 +5,6 @@
; RUN: | FileCheck %s --check-prefix=EXCLUDE
; EXCLUDE: Name Type {{.*}} ES Flg Lk Inf Al
-; EXCLUDE: .llvm.lto PROGBITS {{.*}} 00 E 0 0 1
+; EXCLUDE: .llvm.lto LLVM_LTO {{.*}} 00 E 0 0 1
@a = global i32 1
diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll
index dbc0274..1209e26 100644
--- a/llvm/test/CodeGen/X86/freeze-binary.ll
+++ b/llvm/test/CodeGen/X86/freeze-binary.ll
@@ -546,9 +546,8 @@ define <8 x i16> @freeze_ashr_vec(<8 x i16> %a0) nounwind {
define <4 x i32> @freeze_ashr_vec_outofrange(<4 x i32> %a0) nounwind {
; X86-LABEL: freeze_ashr_vec_outofrange:
; X86: # %bb.0:
-; X86-NEXT: psrad $1, %xmm0
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; X86-NEXT: psrad $2, %xmm0
+; X86-NEXT: psrad $3, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: freeze_ashr_vec_outofrange:
@@ -660,9 +659,8 @@ define <8 x i16> @freeze_lshr_vec(<8 x i16> %a0) nounwind {
define <4 x i32> @freeze_lshr_vec_outofrange(<4 x i32> %a0) nounwind {
; X86-LABEL: freeze_lshr_vec_outofrange:
; X86: # %bb.0:
-; X86-NEXT: psrld $1, %xmm0
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; X86-NEXT: psrld $2, %xmm0
+; X86-NEXT: psrld $3, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: freeze_lshr_vec_outofrange:
diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll
index c6f0662..a464d78 100644
--- a/llvm/test/CodeGen/X86/funnel-shift.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,X86-SSE2
-; RUN: llc < %s -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,X64-AVX,X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512vbmi,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64-AVX,X64-VBMI2
declare i8 @llvm.fshl.i8(i8, i8, i8)
declare i16 @llvm.fshl.i16(i16, i16, i16)
@@ -26,13 +27,13 @@ define i32 @fshl_i32(i32 %x, i32 %y, i32 %z) nounwind {
; X86-SSE2-NEXT: shldl %cl, %edx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i32:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edx, %ecx
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shldl %cl, %esi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i32:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edx, %ecx
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shldl %cl, %esi, %eax
+; X64-AVX-NEXT: retq
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
ret i32 %f
}
@@ -58,13 +59,13 @@ define i64 @fshl_i64(i64 %x, i64 %y, i64 %z) nounwind {
; X86-SSE2-NEXT: popl %edi
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i64:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movq %rdx, %rcx
-; X64-AVX2-NEXT: movq %rdi, %rax
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-AVX2-NEXT: shldq %cl, %rsi, %rax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i64:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movq %rdx, %rcx
+; X64-AVX-NEXT: movq %rdi, %rax
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-AVX-NEXT: shldq %cl, %rsi, %rax
+; X64-AVX-NEXT: retq
%f = call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 %z)
ret i64 %f
}
@@ -116,18 +117,18 @@ define i128 @fshl_i128(i128 %x, i128 %y, i128 %z) nounwind {
; X86-SSE2-NEXT: popl %ebp
; X86-SSE2-NEXT: retl $4
;
-; X64-AVX2-LABEL: fshl_i128:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: testb $64, %r8b
-; X64-AVX2-NEXT: cmovneq %rdi, %rsi
-; X64-AVX2-NEXT: cmoveq %rcx, %rdx
-; X64-AVX2-NEXT: cmovneq %rcx, %rdi
-; X64-AVX2-NEXT: movq %rdi, %rax
-; X64-AVX2-NEXT: movl %r8d, %ecx
-; X64-AVX2-NEXT: shldq %cl, %rdx, %rax
-; X64-AVX2-NEXT: shldq %cl, %rdi, %rsi
-; X64-AVX2-NEXT: movq %rsi, %rdx
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i128:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: testb $64, %r8b
+; X64-AVX-NEXT: cmovneq %rdi, %rsi
+; X64-AVX-NEXT: cmoveq %rcx, %rdx
+; X64-AVX-NEXT: cmovneq %rcx, %rdi
+; X64-AVX-NEXT: movq %rdi, %rax
+; X64-AVX-NEXT: movl %r8d, %ecx
+; X64-AVX-NEXT: shldq %cl, %rdx, %rax
+; X64-AVX-NEXT: shldq %cl, %rdi, %rsi
+; X64-AVX-NEXT: movq %rsi, %rdx
+; X64-AVX-NEXT: retq
%f = call i128 @llvm.fshl.i128(i128 %x, i128 %y, i128 %z)
ret i128 %f
}
@@ -173,21 +174,21 @@ define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) nounwind {
; X86-SSE2-NEXT: popl %ebx
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i37:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movq %rdx, %rcx
-; X64-AVX2-NEXT: movabsq $137438953471, %rax # imm = 0x1FFFFFFFFF
-; X64-AVX2-NEXT: andq %rdx, %rax
-; X64-AVX2-NEXT: movabsq $498560650640798693, %rdx # imm = 0x6EB3E45306EB3E5
-; X64-AVX2-NEXT: mulq %rdx
-; X64-AVX2-NEXT: leal (%rdx,%rdx,8), %eax
-; X64-AVX2-NEXT: leal (%rdx,%rax,4), %eax
-; X64-AVX2-NEXT: subl %eax, %ecx
-; X64-AVX2-NEXT: shlq $27, %rsi
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-AVX2-NEXT: shldq %cl, %rsi, %rdi
-; X64-AVX2-NEXT: movq %rdi, %rax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i37:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movq %rdx, %rcx
+; X64-AVX-NEXT: movabsq $137438953471, %rax # imm = 0x1FFFFFFFFF
+; X64-AVX-NEXT: andq %rdx, %rax
+; X64-AVX-NEXT: movabsq $498560650640798693, %rdx # imm = 0x6EB3E45306EB3E5
+; X64-AVX-NEXT: mulq %rdx
+; X64-AVX-NEXT: leal (%rdx,%rdx,8), %eax
+; X64-AVX-NEXT: leal (%rdx,%rax,4), %eax
+; X64-AVX-NEXT: subl %eax, %ecx
+; X64-AVX-NEXT: shlq $27, %rsi
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-AVX-NEXT: shldq %cl, %rsi, %rdi
+; X64-AVX-NEXT: movq %rdi, %rax
+; X64-AVX-NEXT: retq
%f = call i37 @llvm.fshl.i37(i37 %x, i37 %y, i37 %z)
ret i37 %f
}
@@ -214,11 +215,11 @@ define i32 @fshl_i32_const_shift(i32 %x, i32 %y) nounwind {
; X86-SSE2-NEXT: shldl $9, %ecx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i32_const_shift:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: shldl $9, %esi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i32_const_shift:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: shldl $9, %esi, %eax
+; X64-AVX-NEXT: retq
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 9)
ret i32 %f
}
@@ -233,11 +234,11 @@ define i32 @fshl_i32_const_overshift(i32 %x, i32 %y) nounwind {
; X86-SSE2-NEXT: shldl $9, %ecx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i32_const_overshift:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: shldl $9, %esi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i32_const_overshift:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: shldl $9, %esi, %eax
+; X64-AVX-NEXT: retq
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 41)
ret i32 %f
}
@@ -254,11 +255,11 @@ define i64 @fshl_i64_const_overshift(i64 %x, i64 %y) nounwind {
; X86-SSE2-NEXT: shrdl $23, %ecx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i64_const_overshift:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movq %rdi, %rax
-; X64-AVX2-NEXT: shldq $41, %rsi, %rax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i64_const_overshift:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movq %rdi, %rax
+; X64-AVX-NEXT: shldq $41, %rsi, %rax
+; X64-AVX-NEXT: retq
%f = call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 105)
ret i64 %f
}
@@ -287,13 +288,13 @@ define i32 @fshr_i32(i32 %x, i32 %y, i32 %z) nounwind {
; X86-SSE2-NEXT: shrdl %cl, %edx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i32:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edx, %ecx
-; X64-AVX2-NEXT: movl %esi, %eax
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shrdl %cl, %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i32:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edx, %ecx
+; X64-AVX-NEXT: movl %esi, %eax
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shrdl %cl, %edi, %eax
+; X64-AVX-NEXT: retq
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
ret i32 %f
}
@@ -340,22 +341,22 @@ define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) nounwind {
; X86-SSE2-NEXT: popl %ebx
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i37:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movq %rdx, %rcx
-; X64-AVX2-NEXT: movabsq $137438953471, %rax # imm = 0x1FFFFFFFFF
-; X64-AVX2-NEXT: andq %rdx, %rax
-; X64-AVX2-NEXT: movabsq $498560650640798693, %rdx # imm = 0x6EB3E45306EB3E5
-; X64-AVX2-NEXT: mulq %rdx
-; X64-AVX2-NEXT: leal (%rdx,%rdx,8), %eax
-; X64-AVX2-NEXT: leal (%rdx,%rax,4), %eax
-; X64-AVX2-NEXT: subl %eax, %ecx
-; X64-AVX2-NEXT: addl $27, %ecx
-; X64-AVX2-NEXT: shlq $27, %rsi
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-AVX2-NEXT: shrdq %cl, %rdi, %rsi
-; X64-AVX2-NEXT: movq %rsi, %rax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i37:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movq %rdx, %rcx
+; X64-AVX-NEXT: movabsq $137438953471, %rax # imm = 0x1FFFFFFFFF
+; X64-AVX-NEXT: andq %rdx, %rax
+; X64-AVX-NEXT: movabsq $498560650640798693, %rdx # imm = 0x6EB3E45306EB3E5
+; X64-AVX-NEXT: mulq %rdx
+; X64-AVX-NEXT: leal (%rdx,%rdx,8), %eax
+; X64-AVX-NEXT: leal (%rdx,%rax,4), %eax
+; X64-AVX-NEXT: subl %eax, %ecx
+; X64-AVX-NEXT: addl $27, %ecx
+; X64-AVX-NEXT: shlq $27, %rsi
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-AVX-NEXT: shrdq %cl, %rdi, %rsi
+; X64-AVX-NEXT: movq %rsi, %rax
+; X64-AVX-NEXT: retq
%f = call i37 @llvm.fshr.i37(i37 %x, i37 %y, i37 %z)
ret i37 %f
}
@@ -382,11 +383,11 @@ define i32 @fshl_i32_demandedbits(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: shldl $9, %ecx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i32_demandedbits:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: shldl $9, %esi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i32_demandedbits:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: shldl $9, %esi, %eax
+; X64-AVX-NEXT: retq
%x = or i32 %a0, 2147483648
%y = or i32 %a1, 1
%res = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 9)
@@ -401,11 +402,11 @@ define i32 @fshr_i32_demandedbits(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: shrdl $9, %ecx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i32_demandedbits:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: shldl $23, %esi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i32_demandedbits:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: shldl $23, %esi, %eax
+; X64-AVX-NEXT: retq
%x = or i32 %a0, 2147483648
%y = or i32 %a1, 1
%res = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 9)
@@ -422,12 +423,12 @@ define i32 @fshl_i32_undef0(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: shldl %cl, %eax, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i32_undef0:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %esi, %ecx
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shldl %cl, %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i32_undef0:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %esi, %ecx
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shldl %cl, %edi, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshl.i32(i32 undef, i32 %a0, i32 %a1)
ret i32 %res
}
@@ -442,13 +443,13 @@ define i32 @fshl_i32_undef0_msk(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: shldl %cl, %eax, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i32_undef0_msk:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %esi, %ecx
-; X64-AVX2-NEXT: andl $7, %ecx
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shldl %cl, %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i32_undef0_msk:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %esi, %ecx
+; X64-AVX-NEXT: andl $7, %ecx
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shldl %cl, %edi, %eax
+; X64-AVX-NEXT: retq
%m = and i32 %a1, 7
%res = call i32 @llvm.fshl.i32(i32 undef, i32 %a0, i32 %m)
ret i32 %res
@@ -461,15 +462,43 @@ define i32 @fshl_i32_undef0_cst(i32 %a0) nounwind {
; X86-SSE2-NEXT: shrl $23, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i32_undef0_cst:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: shrl $23, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i32_undef0_cst:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: shrl $23, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshl.i32(i32 undef, i32 %a0, i32 9)
ret i32 %res
}
+define <4 x i32> @fshl_v4i32_undef0_cst(<4 x i32> %a0) nounwind {
+; X86-SSE2-LABEL: fshl_v4i32_undef0_cst:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $20, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: psrld $21, %xmm2
+; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $22, %xmm1
+; X86-SSE2-NEXT: psrld $23, %xmm0
+; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
+; X86-SSE2-NEXT: retl
+;
+; X64-AVX2-LABEL: fshl_v4i32_undef0_cst:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
+;
+; X64-VBMI2-LABEL: fshl_v4i32_undef0_cst:
+; X64-VBMI2: # %bb.0:
+; X64-VBMI2-NEXT: vpshldvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-VBMI2-NEXT: retq
+ %res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> undef, <4 x i32> %a0, <4 x i32> <i32 9, i32 10, i32 11, i32 12>)
+ ret <4 x i32> %res
+}
+
define i32 @fshl_i32_undef1(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-LABEL: fshl_i32_undef1:
; X86-SSE2: # %bb.0:
@@ -478,13 +507,13 @@ define i32 @fshl_i32_undef1(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: shldl %cl, %eax, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i32_undef1:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %esi, %ecx
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shldl %cl, %eax, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i32_undef1:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %esi, %ecx
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shldl %cl, %eax, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshl.i32(i32 %a0, i32 undef, i32 %a1)
ret i32 %res
}
@@ -498,14 +527,14 @@ define i32 @fshl_i32_undef1_msk(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: shll %cl, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i32_undef1_msk:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %esi, %ecx
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: andb $7, %cl
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shll %cl, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i32_undef1_msk:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %esi, %ecx
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: andb $7, %cl
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shll %cl, %eax
+; X64-AVX-NEXT: retq
%m = and i32 %a1, 7
%res = call i32 @llvm.fshl.i32(i32 %a0, i32 undef, i32 %m)
ret i32 %res
@@ -518,15 +547,34 @@ define i32 @fshl_i32_undef1_cst(i32 %a0) nounwind {
; X86-SSE2-NEXT: shll $9, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i32_undef1_cst:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: shll $9, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i32_undef1_cst:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: shll $9, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshl.i32(i32 %a0, i32 undef, i32 9)
ret i32 %res
}
+define <4 x i32> @fshl_v4i32_undef1_cst(<4 x i32> %a0) nounwind {
+; X86-SSE2-LABEL: fshl_v4i32_undef1_cst:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-SSE2-NEXT: retl
+;
+; X64-AVX-LABEL: fshl_v4i32_undef1_cst:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: retq
+ %res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 9, i32 10, i32 11, i32 12>)
+ ret <4 x i32> %res
+}
+
define i32 @fshl_i32_undef2(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-LABEL: fshl_i32_undef2:
; X86-SSE2: # %bb.0:
@@ -535,11 +583,11 @@ define i32 @fshl_i32_undef2(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: shldl %cl, %ecx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i32_undef2:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: shldl %cl, %esi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i32_undef2:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: shldl %cl, %esi, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshl.i32(i32 %a0, i32 %a1, i32 undef)
ret i32 %res
}
@@ -552,13 +600,13 @@ define i32 @fshr_i32_undef0(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: shrdl %cl, %eax, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i32_undef0:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %esi, %ecx
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shrdl %cl, %eax, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i32_undef0:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %esi, %ecx
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shrdl %cl, %eax, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshr.i32(i32 undef, i32 %a0, i32 %a1)
ret i32 %res
}
@@ -572,14 +620,14 @@ define i32 @fshr_i32_undef0_msk(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: shrl %cl, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i32_undef0_msk:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %esi, %ecx
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: andb $7, %cl
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shrl %cl, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i32_undef0_msk:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %esi, %ecx
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: andb $7, %cl
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shrl %cl, %eax
+; X64-AVX-NEXT: retq
%m = and i32 %a1, 7
%res = call i32 @llvm.fshr.i32(i32 undef, i32 %a0, i32 %m)
ret i32 %res
@@ -592,15 +640,38 @@ define i32 @fshr_i32_undef0_cst(i32 %a0) nounwind {
; X86-SSE2-NEXT: shrl $9, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i32_undef0_cst:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: shrl $9, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i32_undef0_cst:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: shrl $9, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshr.i32(i32 undef, i32 %a0, i32 9)
ret i32 %res
}
+define <4 x i32> @fshr_v4i32_undef0_cst(<4 x i32> %a0) nounwind {
+; X86-SSE2-LABEL: fshr_v4i32_undef0_cst:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $12, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: psrld $11, %xmm2
+; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $10, %xmm1
+; X86-SSE2-NEXT: psrld $9, %xmm0
+; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
+; X86-SSE2-NEXT: retl
+;
+; X64-AVX-LABEL: fshr_v4i32_undef0_cst:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: retq
+ %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> undef, <4 x i32> %a0, <4 x i32> <i32 9, i32 10, i32 11, i32 12>)
+ ret <4 x i32> %res
+}
+
define i32 @fshr_i32_undef1(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-LABEL: fshr_i32_undef1:
; X86-SSE2: # %bb.0:
@@ -609,12 +680,12 @@ define i32 @fshr_i32_undef1(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: shrdl %cl, %eax, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i32_undef1:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %esi, %ecx
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shrdl %cl, %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i32_undef1:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %esi, %ecx
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shrdl %cl, %edi, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshr.i32(i32 %a0, i32 undef, i32 %a1)
ret i32 %res
}
@@ -629,13 +700,13 @@ define i32 @fshr_i32_undef1_msk(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: shrdl %cl, %eax, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i32_undef1_msk:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %esi, %ecx
-; X64-AVX2-NEXT: andl $7, %ecx
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shrdl %cl, %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i32_undef1_msk:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %esi, %ecx
+; X64-AVX-NEXT: andl $7, %ecx
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shrdl %cl, %edi, %eax
+; X64-AVX-NEXT: retq
%m = and i32 %a1, 7
%res = call i32 @llvm.fshr.i32(i32 %a0, i32 undef, i32 %m)
ret i32 %res
@@ -648,15 +719,39 @@ define i32 @fshr_i32_undef1_cst(i32 %a0) nounwind {
; X86-SSE2-NEXT: shll $23, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i32_undef1_cst:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: shll $23, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i32_undef1_cst:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: shll $23, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshr.i32(i32 %a0, i32 undef, i32 9)
ret i32 %res
}
+define <4 x i32> @fshr_v4i32_undef1_cst(<4 x i32> %a0) nounwind {
+; X86-SSE2-LABEL: fshr_v4i32_undef1_cst:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-SSE2-NEXT: retl
+;
+; X64-AVX2-LABEL: fshr_v4i32_undef1_cst:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
+;
+; X64-VBMI2-LABEL: fshr_v4i32_undef1_cst:
+; X64-VBMI2: # %bb.0:
+; X64-VBMI2-NEXT: vpshrdvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-VBMI2-NEXT: retq
+ %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 9, i32 10, i32 11, i32 12>)
+ ret <4 x i32> %res
+}
+
define i32 @fshr_i32_undef2(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-LABEL: fshr_i32_undef2:
; X86-SSE2: # %bb.0:
@@ -665,11 +760,11 @@ define i32 @fshr_i32_undef2(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: shrdl %cl, %ecx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i32_undef2:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %esi, %eax
-; X64-AVX2-NEXT: shrdl %cl, %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i32_undef2:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %esi, %eax
+; X64-AVX-NEXT: shrdl %cl, %edi, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshr.i32(i32 %a0, i32 %a1, i32 undef)
ret i32 %res
}
@@ -685,13 +780,13 @@ define i32 @fshl_i32_zero0(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: shldl %cl, %edx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i32_zero0:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %esi, %ecx
-; X64-AVX2-NEXT: xorl %eax, %eax
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shldl %cl, %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i32_zero0:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %esi, %ecx
+; X64-AVX-NEXT: xorl %eax, %eax
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shldl %cl, %edi, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshl.i32(i32 0, i32 %a0, i32 %a1)
ret i32 %res
}
@@ -703,11 +798,11 @@ define i32 @fshl_i32_zero0_cst(i32 %a0) nounwind {
; X86-SSE2-NEXT: shrl $23, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i32_zero0_cst:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: shrl $23, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i32_zero0_cst:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: shrl $23, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshl.i32(i32 0, i32 %a0, i32 9)
ret i32 %res
}
@@ -721,14 +816,14 @@ define i32 @fshl_i32_zero1(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: shldl %cl, %edx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i32_zero1:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %esi, %ecx
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: xorl %edx, %edx
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shldl %cl, %edx, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i32_zero1:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %esi, %ecx
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: xorl %edx, %edx
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shldl %cl, %edx, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshl.i32(i32 %a0, i32 0, i32 %a1)
ret i32 %res
}
@@ -740,11 +835,11 @@ define i32 @fshl_i32_zero1_cst(i32 %a0) nounwind {
; X86-SSE2-NEXT: shll $9, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i32_zero1_cst:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: shll $9, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i32_zero1_cst:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: shll $9, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshl.i32(i32 %a0, i32 0, i32 9)
ret i32 %res
}
@@ -758,14 +853,14 @@ define i32 @fshr_i32_zero0(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: shrdl %cl, %edx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i32_zero0:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %esi, %ecx
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: xorl %edx, %edx
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shrdl %cl, %edx, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i32_zero0:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %esi, %ecx
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: xorl %edx, %edx
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shrdl %cl, %edx, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshr.i32(i32 0, i32 %a0, i32 %a1)
ret i32 %res
}
@@ -777,11 +872,11 @@ define i32 @fshr_i32_zero0_cst(i32 %a0) nounwind {
; X86-SSE2-NEXT: shrl $9, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i32_zero0_cst:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: shrl $9, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i32_zero0_cst:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: shrl $9, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshr.i32(i32 0, i32 %a0, i32 9)
ret i32 %res
}
@@ -795,13 +890,13 @@ define i32 @fshr_i32_zero1(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: shrdl %cl, %edx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i32_zero1:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %esi, %ecx
-; X64-AVX2-NEXT: xorl %eax, %eax
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shrdl %cl, %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i32_zero1:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %esi, %ecx
+; X64-AVX-NEXT: xorl %eax, %eax
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shrdl %cl, %edi, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshr.i32(i32 %a0, i32 0, i32 %a1)
ret i32 %res
}
@@ -813,11 +908,11 @@ define i32 @fshr_i32_zero1_cst(i32 %a0) nounwind {
; X86-SSE2-NEXT: shll $23, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i32_zero1_cst:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: shll $23, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i32_zero1_cst:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: shll $23, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshr.i32(i32 %a0, i32 0, i32 9)
ret i32 %res
}
@@ -830,10 +925,10 @@ define i32 @fshl_i32_zero2(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i32_zero2:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i32_zero2:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshl.i32(i32 %a0, i32 %a1, i32 0)
ret i32 %res
}
@@ -844,10 +939,10 @@ define i32 @fshr_i32_zero2(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i32_zero2:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %esi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i32_zero2:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %esi, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshr.i32(i32 %a0, i32 %a1, i32 0)
ret i32 %res
}
@@ -862,11 +957,11 @@ define i32 @fshr_i32_const_shift(i32 %x, i32 %y) nounwind {
; X86-SSE2-NEXT: shrdl $9, %ecx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i32_const_shift:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: shldl $23, %esi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i32_const_shift:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: shldl $23, %esi, %eax
+; X64-AVX-NEXT: retq
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 9)
ret i32 %f
}
@@ -881,11 +976,11 @@ define i32 @fshr_i32_const_overshift(i32 %x, i32 %y) nounwind {
; X86-SSE2-NEXT: shrdl $9, %ecx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i32_const_overshift:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: shldl $23, %esi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i32_const_overshift:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: shldl $23, %esi, %eax
+; X64-AVX-NEXT: retq
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 41)
ret i32 %f
}
@@ -902,11 +997,11 @@ define i64 @fshr_i64_const_overshift(i64 %x, i64 %y) nounwind {
; X86-SSE2-NEXT: shldl $23, %ecx, %edx
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i64_const_overshift:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movq %rdi, %rax
-; X64-AVX2-NEXT: shldq $23, %rsi, %rax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i64_const_overshift:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movq %rdi, %rax
+; X64-AVX-NEXT: shldq $23, %rsi, %rax
+; X64-AVX-NEXT: retq
%f = call i64 @llvm.fshr.i64(i64 %x, i64 %y, i64 105)
ret i64 %f
}
@@ -928,10 +1023,10 @@ define i32 @fshl_i32_shift_by_bitwidth(i32 %x, i32 %y) nounwind {
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i32_shift_by_bitwidth:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i32_shift_by_bitwidth:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: retq
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 32)
ret i32 %f
}
@@ -942,10 +1037,10 @@ define i32 @fshr_i32_shift_by_bitwidth(i32 %x, i32 %y) nounwind {
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i32_shift_by_bitwidth:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %esi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i32_shift_by_bitwidth:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %esi, %eax
+; X64-AVX-NEXT: retq
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 32)
ret i32 %f
}
@@ -964,10 +1059,10 @@ define <4 x i32> @fshr_v4i32_shift_by_bitwidth(<4 x i32> %x, <4 x i32> %y) nounw
; X86-SSE2-NEXT: movaps %xmm1, %xmm0
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_v4i32_shift_by_bitwidth:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vmovaps %xmm1, %xmm0
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_v4i32_shift_by_bitwidth:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovaps %xmm1, %xmm0
+; X64-AVX-NEXT: retq
%f = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 32, i32 32, i32 32, i32 32>)
ret <4 x i32> %f
}
@@ -996,30 +1091,30 @@ define void @PR45265(i32 %0, ptr nocapture readonly %1) nounwind {
; X86-SSE2-NEXT: shldl $24, %edx, %ecx
; X86-SSE2-NEXT: xorl %eax, %ecx
; X86-SSE2-NEXT: orl %ecx, %edi
-; X86-SSE2-NEXT: jne .LBB46_1
+; X86-SSE2-NEXT: jne .LBB50_1
; X86-SSE2-NEXT: # %bb.2:
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: popl %edi
; X86-SSE2-NEXT: jmp _Z3foov # TAILCALL
-; X86-SSE2-NEXT: .LBB46_1:
+; X86-SSE2-NEXT: .LBB50_1:
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: popl %edi
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: PR45265:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movslq %edi, %rax
-; X64-AVX2-NEXT: leaq (%rax,%rax,2), %rcx
-; X64-AVX2-NEXT: movsbq 10(%rsi,%rcx,4), %rdx
-; X64-AVX2-NEXT: shlq $16, %rdx
-; X64-AVX2-NEXT: movzwl 8(%rsi,%rcx,4), %edi
-; X64-AVX2-NEXT: orq %rdx, %rdi
-; X64-AVX2-NEXT: movq (%rsi,%rcx,4), %rcx
-; X64-AVX2-NEXT: shrdq $40, %rdi, %rcx
-; X64-AVX2-NEXT: cmpq %rax, %rcx
-; X64-AVX2-NEXT: je _Z3foov # TAILCALL
-; X64-AVX2-NEXT: # %bb.1:
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: PR45265:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movslq %edi, %rax
+; X64-AVX-NEXT: leaq (%rax,%rax,2), %rcx
+; X64-AVX-NEXT: movsbq 10(%rsi,%rcx,4), %rdx
+; X64-AVX-NEXT: shlq $16, %rdx
+; X64-AVX-NEXT: movzwl 8(%rsi,%rcx,4), %edi
+; X64-AVX-NEXT: orq %rdx, %rdi
+; X64-AVX-NEXT: movq (%rsi,%rcx,4), %rcx
+; X64-AVX-NEXT: shrdq $40, %rdi, %rcx
+; X64-AVX-NEXT: cmpq %rax, %rcx
+; X64-AVX-NEXT: je _Z3foov # TAILCALL
+; X64-AVX-NEXT: # %bb.1:
+; X64-AVX-NEXT: retq
%3 = sext i32 %0 to i64
%4 = getelementptr inbounds %struct.S, ptr %1, i64 %3
%5 = bitcast ptr %4 to ptr
@@ -1052,15 +1147,15 @@ define i32 @or_shl_fshl(i32 %x, i32 %y, i32 %s) nounwind {
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: or_shl_fshl:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edx, %ecx
-; X64-AVX2-NEXT: movl %esi, %eax
-; X64-AVX2-NEXT: shll %cl, %eax
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shldl %cl, %esi, %edi
-; X64-AVX2-NEXT: orl %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: or_shl_fshl:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edx, %ecx
+; X64-AVX-NEXT: movl %esi, %eax
+; X64-AVX-NEXT: shll %cl, %eax
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shldl %cl, %esi, %edi
+; X64-AVX-NEXT: orl %edi, %eax
+; X64-AVX-NEXT: retq
%shy = shl i32 %y, %s
%fun = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %s)
%or = or i32 %fun, %shy
@@ -1078,15 +1173,15 @@ define i32 @or_shl_rotl(i32 %x, i32 %y, i32 %s) nounwind {
; X86-SSE2-NEXT: orl %edx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: or_shl_rotl:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edx, %ecx
-; X64-AVX2-NEXT: movl %esi, %eax
-; X64-AVX2-NEXT: shll %cl, %edi
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: roll %cl, %eax
-; X64-AVX2-NEXT: orl %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: or_shl_rotl:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edx, %ecx
+; X64-AVX-NEXT: movl %esi, %eax
+; X64-AVX-NEXT: shll %cl, %edi
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: roll %cl, %eax
+; X64-AVX-NEXT: orl %edi, %eax
+; X64-AVX-NEXT: retq
%shx = shl i32 %x, %s
%rot = call i32 @llvm.fshl.i32(i32 %y, i32 %y, i32 %s)
%or = or i32 %rot, %shx
@@ -1107,15 +1202,15 @@ define i32 @or_shl_fshl_commute(i32 %x, i32 %y, i32 %s) nounwind {
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: or_shl_fshl_commute:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edx, %ecx
-; X64-AVX2-NEXT: movl %esi, %eax
-; X64-AVX2-NEXT: shll %cl, %eax
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shldl %cl, %esi, %edi
-; X64-AVX2-NEXT: orl %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: or_shl_fshl_commute:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edx, %ecx
+; X64-AVX-NEXT: movl %esi, %eax
+; X64-AVX-NEXT: shll %cl, %eax
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shldl %cl, %esi, %edi
+; X64-AVX-NEXT: orl %edi, %eax
+; X64-AVX-NEXT: retq
%shy = shl i32 %y, %s
%fun = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %s)
%or = or i32 %shy, %fun
@@ -1133,15 +1228,15 @@ define i32 @or_shl_rotl_commute(i32 %x, i32 %y, i32 %s) nounwind {
; X86-SSE2-NEXT: orl %edx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: or_shl_rotl_commute:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edx, %ecx
-; X64-AVX2-NEXT: movl %esi, %eax
-; X64-AVX2-NEXT: shll %cl, %edi
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: roll %cl, %eax
-; X64-AVX2-NEXT: orl %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: or_shl_rotl_commute:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edx, %ecx
+; X64-AVX-NEXT: movl %esi, %eax
+; X64-AVX-NEXT: shll %cl, %edi
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: roll %cl, %eax
+; X64-AVX-NEXT: orl %edi, %eax
+; X64-AVX-NEXT: retq
%shx = shl i32 %x, %s
%rot = call i32 @llvm.fshl.i32(i32 %y, i32 %y, i32 %s)
%or = or i32 %shx, %rot
@@ -1162,15 +1257,15 @@ define i32 @or_lshr_fshr(i32 %x, i32 %y, i32 %s) nounwind {
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: or_lshr_fshr:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edx, %ecx
-; X64-AVX2-NEXT: movl %esi, %eax
-; X64-AVX2-NEXT: shrl %cl, %eax
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shrdl %cl, %esi, %edi
-; X64-AVX2-NEXT: orl %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: or_lshr_fshr:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edx, %ecx
+; X64-AVX-NEXT: movl %esi, %eax
+; X64-AVX-NEXT: shrl %cl, %eax
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shrdl %cl, %esi, %edi
+; X64-AVX-NEXT: orl %edi, %eax
+; X64-AVX-NEXT: retq
%shy = lshr i32 %y, %s
%fun = call i32 @llvm.fshr.i32(i32 %y, i32 %x, i32 %s)
%or = or i32 %fun, %shy
@@ -1188,15 +1283,15 @@ define i32 @or_lshr_rotr(i32 %x, i32 %y, i32 %s) nounwind {
; X86-SSE2-NEXT: orl %edx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: or_lshr_rotr:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edx, %ecx
-; X64-AVX2-NEXT: movl %esi, %eax
-; X64-AVX2-NEXT: shrl %cl, %edi
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: rorl %cl, %eax
-; X64-AVX2-NEXT: orl %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: or_lshr_rotr:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edx, %ecx
+; X64-AVX-NEXT: movl %esi, %eax
+; X64-AVX-NEXT: shrl %cl, %edi
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: rorl %cl, %eax
+; X64-AVX-NEXT: orl %edi, %eax
+; X64-AVX-NEXT: retq
%shx = lshr i32 %x, %s
%rot = call i32 @llvm.fshr.i32(i32 %y, i32 %y, i32 %s)
%or = or i32 %rot, %shx
@@ -1217,15 +1312,15 @@ define i32 @or_lshr_fshr_commute(i32 %x, i32 %y, i32 %s) nounwind {
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: or_lshr_fshr_commute:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edx, %ecx
-; X64-AVX2-NEXT: movl %esi, %eax
-; X64-AVX2-NEXT: shrl %cl, %eax
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shrdl %cl, %esi, %edi
-; X64-AVX2-NEXT: orl %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: or_lshr_fshr_commute:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edx, %ecx
+; X64-AVX-NEXT: movl %esi, %eax
+; X64-AVX-NEXT: shrl %cl, %eax
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shrdl %cl, %esi, %edi
+; X64-AVX-NEXT: orl %edi, %eax
+; X64-AVX-NEXT: retq
%shy = lshr i32 %y, %s
%fun = call i32 @llvm.fshr.i32(i32 %y, i32 %x, i32 %s)
%or = or i32 %shy, %fun
@@ -1243,15 +1338,15 @@ define i32 @or_lshr_rotr_commute(i32 %x, i32 %y, i32 %s) nounwind {
; X86-SSE2-NEXT: orl %edx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: or_lshr_rotr_commute:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edx, %ecx
-; X64-AVX2-NEXT: movl %esi, %eax
-; X64-AVX2-NEXT: shrl %cl, %edi
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: rorl %cl, %eax
-; X64-AVX2-NEXT: orl %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: or_lshr_rotr_commute:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edx, %ecx
+; X64-AVX-NEXT: movl %esi, %eax
+; X64-AVX-NEXT: shrl %cl, %edi
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: rorl %cl, %eax
+; X64-AVX-NEXT: orl %edi, %eax
+; X64-AVX-NEXT: retq
%shx = lshr i32 %x, %s
%rot = call i32 @llvm.fshr.i32(i32 %y, i32 %y, i32 %s)
%or = or i32 %shx, %rot
@@ -1267,13 +1362,13 @@ define i32 @or_shl_fshl_simplify(i32 %x, i32 %y, i32 %s) nounwind {
; X86-SSE2-NEXT: shldl %cl, %edx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: or_shl_fshl_simplify:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edx, %ecx
-; X64-AVX2-NEXT: movl %esi, %eax
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shldl %cl, %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: or_shl_fshl_simplify:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edx, %ecx
+; X64-AVX-NEXT: movl %esi, %eax
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shldl %cl, %edi, %eax
+; X64-AVX-NEXT: retq
%shy = shl i32 %y, %s
%fun = call i32 @llvm.fshl.i32(i32 %y, i32 %x, i32 %s)
%or = or i32 %fun, %shy
@@ -1289,13 +1384,13 @@ define i32 @or_lshr_fshr_simplify(i32 %x, i32 %y, i32 %s) nounwind {
; X86-SSE2-NEXT: shrdl %cl, %edx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: or_lshr_fshr_simplify:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edx, %ecx
-; X64-AVX2-NEXT: movl %esi, %eax
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shrdl %cl, %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: or_lshr_fshr_simplify:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edx, %ecx
+; X64-AVX-NEXT: movl %esi, %eax
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shrdl %cl, %edi, %eax
+; X64-AVX-NEXT: retq
%shy = lshr i32 %y, %s
%fun = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %s)
%or = or i32 %shy, %fun
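
The hunks above merge the scalar X64-AVX2 checks into a shared X64-AVX prefix, since the new AVX512VBMI2 run line produces identical scalar output, and add vector constant-shift tests where the two configurations diverge (vpsrlvd/vpsllvd under AVX2 versus vpshldvd/vpshrdvd under VBMI2). As a reminder of what the file exercises, the sketch below is a hypothetical, stand-alone funnel-shift snippet, not part of the patch: with identical value operands llvm.fshl is a rotate, which the x86 backend lowers to the rol/shld-style sequences checked above.

; Hypothetical example (not from the patch) of the intrinsics under test.
declare i32 @llvm.fshl.i32(i32, i32, i32)
declare i32 @llvm.fshr.i32(i32, i32, i32)

define i32 @rotl_i32_by_5(i32 %x) {
  ; fshl with both value operands equal is a rotate-left:
  ; shift the concatenation x:x left by 5 and keep the high 32 bits.
  %r = call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 5)
  ret i32 %r
}

define i32 @funnel_shr_i32(i32 %hi, i32 %lo, i32 %amt) {
  ; generic funnel shift right: low 32 bits of (hi:lo) >> (amt mod 32).
  %f = call i32 @llvm.fshr.i32(i32 %hi, i32 %lo, i32 %amt)
  ret i32 %f
}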
diff --git a/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.ll b/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.ll
index 2f5a368..f8e2502 100644
--- a/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.ll
+++ b/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.ll
@@ -8,17 +8,13 @@ target triple = "x86_64-unknown-linux-gnu"
define dso_local i32 @main() local_unnamed_addr #0 !dbg !7 {
entry:
tail call void @llvm.prefetch(ptr inttoptr (i64 291 to ptr), i32 0, i32 0, i32 1), !dbg !9
- tail call void @llvm.x86.avx512.gatherpf.dpd.512(i8 97, <8 x i32> undef, ptr null, i32 1, i32 2), !dbg !10
ret i32 291, !dbg !11
}
; Function Attrs: inaccessiblemem_or_argmemonly nounwind
declare void @llvm.prefetch(ptr nocapture readonly, i32, i32, i32) #1
-; Function Attrs: argmemonly nounwind
-declare void @llvm.x86.avx512.gatherpf.dpd.512(i8, <8 x i32>, ptr, i32, i32) #2
-
-attributes #0 = {"target-cpu"="x86-64" "target-features"="+avx512pf,+sse4.2,+ssse3"}
+attributes #0 = {"target-cpu"="x86-64" "target-features"="+sse4.2,+ssse3"}
attributes #1 = { inaccessiblemem_or_argmemonly nounwind }
attributes #2 = { argmemonly nounwind }
@@ -43,4 +39,3 @@ attributes #2 = { argmemonly nounwind }
;CHECK: # %bb.0:
;CHECK: prefetchnta 291
;CHECK-NOT: prefetchnta 42(%rax,%ymm0)
-;CHECK: vgatherpf1dpd (%rax,%ymm0) {%k1}
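
These hunks retire the AVX512PF gather-prefetch coverage: the call to and declaration of llvm.x86.avx512.gatherpf.dpd.512 are removed, +avx512pf is dropped from the target features, and the vgatherpf1dpd check goes away, leaving only the generic prefetch intrinsic in the test. For reference, a minimal, hypothetical use of that remaining intrinsic (not taken from the patch) looks like:

; Hypothetical example: prefetch one cache line for reading with high
; temporal locality (rw=0 read, locality=3, cache type=1 data).
declare void @llvm.prefetch(ptr nocapture readonly, i32, i32, i32)

define void @prefetch_for_read(ptr %p) {
  call void @llvm.prefetch(ptr %p, i32 0, i32 3, i32 1)
  ret void
}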
diff --git a/llvm/test/CodeGen/X86/issue76416.ll b/llvm/test/CodeGen/X86/issue76416.ll
new file mode 100644
index 0000000..d0f7fe6
--- /dev/null
+++ b/llvm/test/CodeGen/X86/issue76416.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=x86_64-unknown-freebsd15.0 < %s | FileCheck %s
+
+%struct.anon.5.28.78.99.149.119 = type { [4 x i8] }
+
+@vga_load_state_p = external dso_local global ptr, align 8
+@vga_load_state_data = external dso_local global i8, align 1
+
+define dso_local void @vga_load_state() #0 {
+; CHECK-LABEL: vga_load_state:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl $0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: cmpl $3, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: jg .LBB0_3
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB0_2: # %for.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: incl -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: cmpl $3, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: jle .LBB0_2
+; CHECK-NEXT: .LBB0_3: # %for.end
+; CHECK-NEXT: movl $0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB0_4: # %for.cond1
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movq vga_load_state_p(%rip), %rax
+; CHECK-NEXT: movslq -{{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT: movzbl (%rax,%rcx), %eax
+; CHECK-NEXT: movb %al, vga_load_state_data(%rip)
+; CHECK-NEXT: leal 1(%rcx), %eax
+; CHECK-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: jmp .LBB0_4
+entry:
+ %i = alloca i32, align 4
+ store i32 0, ptr %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %i1 = load i32, ptr %i, align 4
+ %cmp = icmp slt i32 %i1, 4
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ call void asm sideeffect "", "{ax},~{dirflag},~{fpsr},~{flags}"(i8 0) #1
+ %i2 = load i32, ptr %i, align 4
+ %inc = add nsw i32 %i2, 1
+ store i32 %inc, ptr %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ store i32 0, ptr %i, align 4
+ br label %for.cond1
+
+for.cond1: ; preds = %for.cond1, %for.end
+ call void asm sideeffect "", "N{dx},~{dirflag},~{fpsr},~{flags}"(i32 poison) #1
+ %i3 = load ptr, ptr @vga_load_state_p, align 8
+ %regs = getelementptr inbounds %struct.anon.5.28.78.99.149.119, ptr %i3, i32 0, i32 0
+ %i4 = load i32, ptr %i, align 4
+ %idxprom = sext i32 %i4 to i64
+ %arrayidx = getelementptr inbounds [4 x i8], ptr %regs, i64 0, i64 %idxprom
+ %i5 = load i8, ptr %arrayidx, align 1
+ store i8 %i5, ptr @vga_load_state_data, align 1
+ %i6 = load i32, ptr %i, align 4
+ %inc5 = add nsw i32 %i6, 1
+ store i32 %inc5, ptr %i, align 4
+ br label %for.cond1, !llvm.loop !0
+}
+
+attributes #0 = { "tune-cpu"="generic" }
+attributes #1 = { nounwind }
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.mustprogress"}
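
The new issue76416.ll test above keeps llc honest when empty inline-asm statements pin their operands to explicit registers ({ax}, and N{dx} fed a poison value) inside loops. Stripped of the loop structure, the constraint pattern it relies on reduces to something like the hypothetical snippet below (not part of the patch):

; Hypothetical reduced form: an empty asm statement whose sole input is
; pinned to the AX register class, so the zero must be materialized in
; al/ax/eax before the (empty) asm executes.
define void @pin_zero_to_ax() {
entry:
  call void asm sideeffect "", "{ax},~{dirflag},~{fpsr},~{flags}"(i8 0)
  ret void
}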
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
index c6e8b75..3b5ff12 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
@@ -31,10 +31,8 @@ define <4 x i32> @vec128_i32_signed_reg_reg(<4 x i32> %a1, <4 x i32> %a2) nounwi
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psubd %xmm1, %xmm4
-; SSE2-NEXT: psubd %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm4
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: pxor %xmm2, %xmm4
+; SSE2-NEXT: psubd %xmm4, %xmm2
; SSE2-NEXT: psrld $1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm2
@@ -179,25 +177,22 @@ define <4 x i32> @vec128_i32_unsigned_reg_reg(<4 x i32> %a1, <4 x i32> %a2) noun
; SSE2-LABEL: vec128_i32_unsigned_reg_reg:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psubd %xmm1, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: psubd %xmm1, %xmm4
-; SSE2-NEXT: psubd %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm4
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1]
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: psubd %xmm3, %xmm2
; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: retq
@@ -349,10 +344,8 @@ define <4 x i32> @vec128_i32_signed_mem_reg(ptr %a1_addr, <4 x i32> %a2) nounwin
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psubd %xmm0, %xmm4
-; SSE2-NEXT: psubd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: pxor %xmm2, %xmm4
+; SSE2-NEXT: psubd %xmm4, %xmm2
; SSE2-NEXT: psrld $1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm2
@@ -511,10 +504,8 @@ define <4 x i32> @vec128_i32_signed_reg_mem(<4 x i32> %a1, ptr %a2_addr) nounwin
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psubd %xmm1, %xmm4
-; SSE2-NEXT: psubd %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm4
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: pxor %xmm2, %xmm4
+; SSE2-NEXT: psubd %xmm4, %xmm2
; SSE2-NEXT: psrld $1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm2
@@ -674,10 +665,8 @@ define <4 x i32> @vec128_i32_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psubd %xmm0, %xmm4
-; SSE2-NEXT: psubd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: pxor %xmm2, %xmm4
+; SSE2-NEXT: psubd %xmm4, %xmm2
; SSE2-NEXT: psrld $1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm2
@@ -844,74 +833,66 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi
; SSE2-LABEL: vec128_i64_signed_reg_reg:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psubq %xmm1, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1]
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: pandn %xmm1, %xmm5
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: por %xmm5, %xmm3
-; SSE2-NEXT: psubq %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: psrlq $1, %xmm1
-; SSE2-NEXT: psrlq $33, %xmm3
-; SSE2-NEXT: pmuludq %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1]
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: psubq %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: psrlq $1, %xmm3
+; SSE2-NEXT: psrlq $33, %xmm2
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psrlq $32, %xmm4
-; SSE2-NEXT: pmuludq %xmm1, %xmm4
-; SSE2-NEXT: paddq %xmm3, %xmm4
+; SSE2-NEXT: pmuludq %xmm3, %xmm4
+; SSE2-NEXT: paddq %xmm2, %xmm4
; SSE2-NEXT: psllq $32, %xmm4
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: paddq %xmm1, %xmm0
+; SSE2-NEXT: pmuludq %xmm1, %xmm3
+; SSE2-NEXT: paddq %xmm3, %xmm0
; SSE2-NEXT: paddq %xmm4, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: vec128_i64_signed_reg_reg:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: pxor %xmm0, %xmm3
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: pmovsxbq {{.*#+}} xmm3 = [1,1]
-; SSE41-NEXT: por %xmm0, %xmm3
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psubq %xmm1, %xmm3
+; SSE41-NEXT: pxor %xmm2, %xmm1
+; SSE41-NEXT: pxor %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: psubq %xmm1, %xmm4
-; SSE41-NEXT: psubq %xmm2, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm0
-; SSE41-NEXT: psrlq $1, %xmm0
-; SSE41-NEXT: psrlq $33, %xmm1
-; SSE41-NEXT: pmuludq %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE41-NEXT: por %xmm1, %xmm2
+; SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [1,1]
+; SSE41-NEXT: por %xmm2, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm3
+; SSE41-NEXT: psubq %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlq $1, %xmm3
+; SSE41-NEXT: psrlq $33, %xmm2
+; SSE41-NEXT: pmuludq %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: psrlq $32, %xmm4
-; SSE41-NEXT: pmuludq %xmm0, %xmm4
-; SSE41-NEXT: paddq %xmm1, %xmm4
+; SSE41-NEXT: pmuludq %xmm3, %xmm4
+; SSE41-NEXT: paddq %xmm2, %xmm4
; SSE41-NEXT: psllq $32, %xmm4
-; SSE41-NEXT: pmuludq %xmm3, %xmm0
-; SSE41-NEXT: paddq %xmm2, %xmm0
+; SSE41-NEXT: pmuludq %xmm1, %xmm3
+; SSE41-NEXT: paddq %xmm3, %xmm0
; SSE41-NEXT: paddq %xmm4, %xmm0
; SSE41-NEXT: retq
;
@@ -919,9 +900,9 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi
; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
-; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1
+; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpsrlq $1, %xmm1, %xmm2
; AVX-NEXT: vpsrlq $33, %xmm1, %xmm1
; AVX-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
@@ -938,9 +919,9 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi
; XOP: # %bb.0:
; XOP-NEXT: vpcomgtq %xmm1, %xmm0, %xmm2
; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
-; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm1
-; XOP-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1
+; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; XOP-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; XOP-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpsrlq $1, %xmm1, %xmm2
; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1
; XOP-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
@@ -1027,74 +1008,66 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
; SSE2-LABEL: vec128_i64_unsigned_reg_reg:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psubq %xmm1, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1]
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: pandn %xmm1, %xmm5
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: por %xmm5, %xmm3
-; SSE2-NEXT: psubq %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: psrlq $1, %xmm1
-; SSE2-NEXT: psrlq $33, %xmm3
-; SSE2-NEXT: pmuludq %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1]
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: psubq %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: psrlq $1, %xmm3
+; SSE2-NEXT: psrlq $33, %xmm2
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psrlq $32, %xmm4
-; SSE2-NEXT: pmuludq %xmm1, %xmm4
-; SSE2-NEXT: paddq %xmm3, %xmm4
+; SSE2-NEXT: pmuludq %xmm3, %xmm4
+; SSE2-NEXT: paddq %xmm2, %xmm4
; SSE2-NEXT: psllq $32, %xmm4
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: paddq %xmm1, %xmm0
+; SSE2-NEXT: pmuludq %xmm1, %xmm3
+; SSE2-NEXT: paddq %xmm3, %xmm0
; SSE2-NEXT: paddq %xmm4, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: vec128_i64_unsigned_reg_reg:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: pxor %xmm0, %xmm3
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: pmovsxbq {{.*#+}} xmm3 = [1,1]
-; SSE41-NEXT: por %xmm0, %xmm3
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psubq %xmm1, %xmm3
+; SSE41-NEXT: pxor %xmm2, %xmm1
+; SSE41-NEXT: pxor %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: psubq %xmm1, %xmm4
-; SSE41-NEXT: psubq %xmm2, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm0
-; SSE41-NEXT: psrlq $1, %xmm0
-; SSE41-NEXT: psrlq $33, %xmm1
-; SSE41-NEXT: pmuludq %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE41-NEXT: por %xmm1, %xmm2
+; SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [1,1]
+; SSE41-NEXT: por %xmm2, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm3
+; SSE41-NEXT: psubq %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlq $1, %xmm3
+; SSE41-NEXT: psrlq $33, %xmm2
+; SSE41-NEXT: pmuludq %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: psrlq $32, %xmm4
-; SSE41-NEXT: pmuludq %xmm0, %xmm4
-; SSE41-NEXT: paddq %xmm1, %xmm4
+; SSE41-NEXT: pmuludq %xmm3, %xmm4
+; SSE41-NEXT: paddq %xmm2, %xmm4
; SSE41-NEXT: psllq $32, %xmm4
-; SSE41-NEXT: pmuludq %xmm3, %xmm0
-; SSE41-NEXT: paddq %xmm2, %xmm0
+; SSE41-NEXT: pmuludq %xmm1, %xmm3
+; SSE41-NEXT: paddq %xmm3, %xmm0
; SSE41-NEXT: paddq %xmm4, %xmm0
; SSE41-NEXT: retq
;
@@ -1106,9 +1079,9 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm2
; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1
; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
@@ -1128,9 +1101,9 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm1
-; AVX2-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpsrlq $1, %xmm1, %xmm2
; AVX2-NEXT: vpsrlq $33, %xmm1, %xmm1
; AVX2-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
@@ -1147,9 +1120,9 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
; XOP: # %bb.0:
; XOP-NEXT: vpcomgtuq %xmm1, %xmm0, %xmm2
; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
-; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm1
-; XOP-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1
+; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; XOP-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; XOP-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpsrlq $1, %xmm1, %xmm2
; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1
; XOP-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
@@ -1239,76 +1212,67 @@ define <2 x i64> @vec128_i64_signed_mem_reg(ptr %a1_addr, <2 x i64> %a2) nounwin
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psubq %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1]
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: pandn %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: por %xmm5, %xmm3
-; SSE2-NEXT: psubq %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1]
+; SSE2-NEXT: por %xmm2, %xmm4
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: psubq %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: psrlq $1, %xmm0
-; SSE2-NEXT: psrlq $33, %xmm3
-; SSE2-NEXT: pmuludq %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: psrlq $32, %xmm4
-; SSE2-NEXT: pmuludq %xmm0, %xmm4
-; SSE2-NEXT: paddq %xmm3, %xmm4
-; SSE2-NEXT: psllq $32, %xmm4
-; SSE2-NEXT: pmuludq %xmm2, %xmm0
+; SSE2-NEXT: psrlq $33, %xmm2
+; SSE2-NEXT: pmuludq %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm4, %xmm3
+; SSE2-NEXT: psrlq $32, %xmm3
+; SSE2-NEXT: pmuludq %xmm0, %xmm3
+; SSE2-NEXT: paddq %xmm2, %xmm3
+; SSE2-NEXT: psllq $32, %xmm3
+; SSE2-NEXT: pmuludq %xmm4, %xmm0
; SSE2-NEXT: paddq %xmm1, %xmm0
-; SSE2-NEXT: paddq %xmm4, %xmm0
+; SSE2-NEXT: paddq %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: vec128_i64_signed_mem_reg:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa (%rdi), %xmm2
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa (%rdi), %xmm1
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: pxor %xmm0, %xmm3
+; SSE41-NEXT: psubq %xmm0, %xmm3
; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE41-NEXT: pxor %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm6
-; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psubq %xmm1, %xmm3
-; SSE41-NEXT: psubq %xmm2, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE41-NEXT: por %xmm0, %xmm2
+; SSE41-NEXT: pmovsxbq {{.*#+}} xmm4 = [1,1]
+; SSE41-NEXT: por %xmm2, %xmm4
+; SSE41-NEXT: pxor %xmm2, %xmm3
+; SSE41-NEXT: psubq %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: psrlq $1, %xmm0
-; SSE41-NEXT: psrlq $33, %xmm1
-; SSE41-NEXT: pmuludq %xmm6, %xmm1
-; SSE41-NEXT: movdqa %xmm6, %xmm3
+; SSE41-NEXT: psrlq $33, %xmm2
+; SSE41-NEXT: pmuludq %xmm4, %xmm2
+; SSE41-NEXT: movdqa %xmm4, %xmm3
; SSE41-NEXT: psrlq $32, %xmm3
; SSE41-NEXT: pmuludq %xmm0, %xmm3
-; SSE41-NEXT: paddq %xmm1, %xmm3
+; SSE41-NEXT: paddq %xmm2, %xmm3
; SSE41-NEXT: psllq $32, %xmm3
-; SSE41-NEXT: pmuludq %xmm6, %xmm0
-; SSE41-NEXT: paddq %xmm2, %xmm0
+; SSE41-NEXT: pmuludq %xmm4, %xmm0
+; SSE41-NEXT: paddq %xmm1, %xmm0
; SSE41-NEXT: paddq %xmm3, %xmm0
; SSE41-NEXT: retq
;
@@ -1317,9 +1281,9 @@ define <2 x i64> @vec128_i64_signed_mem_reg(ptr %a1_addr, <2 x i64> %a2) nounwin
; AVX-NEXT: vmovdqa (%rdi), %xmm1
; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
-; AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm4
-; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vblendvpd %xmm2, %xmm4, %xmm0, %xmm0
+; AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX-NEXT: vpsrlq $1, %xmm0, %xmm2
; AVX-NEXT: vpsrlq $33, %xmm0, %xmm0
; AVX-NEXT: vpmuludq %xmm3, %xmm0, %xmm0
@@ -1337,9 +1301,9 @@ define <2 x i64> @vec128_i64_signed_mem_reg(ptr %a1_addr, <2 x i64> %a2) nounwin
; XOP-NEXT: vmovdqa (%rdi), %xmm1
; XOP-NEXT: vpcomgtq %xmm0, %xmm1, %xmm2
; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
-; XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm4
-; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; XOP-NEXT: vblendvpd %xmm2, %xmm4, %xmm0, %xmm0
+; XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm0
+; XOP-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; XOP-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; XOP-NEXT: vpsrlq $1, %xmm0, %xmm2
; XOP-NEXT: vpsrlq $33, %xmm0, %xmm0
; XOP-NEXT: vpmuludq %xmm3, %xmm0, %xmm0
@@ -1442,15 +1406,10 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwin
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1]
; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: pandn %xmm1, %xmm5
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: por %xmm5, %xmm3
-; SSE2-NEXT: psubq %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psubq %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm3, %xmm4
+; SSE2-NEXT: psubq %xmm4, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: psrlq $1, %xmm1
; SSE2-NEXT: psrlq $33, %xmm3
@@ -1467,39 +1426,37 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwin
;
; SSE41-LABEL: vec128_i64_signed_reg_mem:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa (%rdi), %xmm2
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: pxor %xmm0, %xmm3
-; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa (%rdi), %xmm1
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: pxor %xmm2, %xmm3
+; SSE41-NEXT: pxor %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm2, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm6
-; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psubq %xmm2, %xmm3
-; SSE41-NEXT: psubq %xmm1, %xmm2
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movapd %xmm2, %xmm0
-; SSE41-NEXT: psrlq $1, %xmm0
-; SSE41-NEXT: psrlq $33, %xmm2
-; SSE41-NEXT: pmuludq %xmm6, %xmm2
-; SSE41-NEXT: movdqa %xmm6, %xmm3
-; SSE41-NEXT: psrlq $32, %xmm3
-; SSE41-NEXT: pmuludq %xmm0, %xmm3
-; SSE41-NEXT: paddq %xmm2, %xmm3
-; SSE41-NEXT: psllq $32, %xmm3
-; SSE41-NEXT: pmuludq %xmm6, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm3, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE41-NEXT: por %xmm2, %xmm3
+; SSE41-NEXT: pmovsxbq {{.*#+}} xmm2 = [1,1]
+; SSE41-NEXT: por %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: psubq %xmm1, %xmm4
+; SSE41-NEXT: pxor %xmm3, %xmm4
+; SSE41-NEXT: psubq %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm1
+; SSE41-NEXT: psrlq $1, %xmm1
+; SSE41-NEXT: psrlq $33, %xmm3
+; SSE41-NEXT: pmuludq %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psrlq $32, %xmm4
+; SSE41-NEXT: pmuludq %xmm1, %xmm4
+; SSE41-NEXT: paddq %xmm3, %xmm4
+; SSE41-NEXT: psllq $32, %xmm4
+; SSE41-NEXT: pmuludq %xmm2, %xmm1
; SSE41-NEXT: paddq %xmm1, %xmm0
-; SSE41-NEXT: paddq %xmm3, %xmm0
+; SSE41-NEXT: paddq %xmm4, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: vec128_i64_signed_reg_mem:
@@ -1507,9 +1464,9 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwin
; AVX-NEXT: vmovdqa (%rdi), %xmm1
; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
-; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1
+; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpsrlq $1, %xmm1, %xmm2
; AVX-NEXT: vpsrlq $33, %xmm1, %xmm1
; AVX-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
@@ -1527,9 +1484,9 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwin
; XOP-NEXT: vmovdqa (%rdi), %xmm1
; XOP-NEXT: vpcomgtq %xmm1, %xmm0, %xmm2
; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
-; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm1
-; XOP-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1
+; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; XOP-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; XOP-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpsrlq $1, %xmm1, %xmm2
; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1
; XOP-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
@@ -1620,75 +1577,67 @@ define <2 x i64> @vec128_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; SSE2-NEXT: movdqa (%rdi), %xmm1
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psubq %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1]
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: pandn %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: por %xmm5, %xmm3
-; SSE2-NEXT: psubq %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1]
+; SSE2-NEXT: por %xmm2, %xmm4
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: psubq %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: psrlq $1, %xmm0
-; SSE2-NEXT: psrlq $33, %xmm3
-; SSE2-NEXT: pmuludq %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: psrlq $32, %xmm4
-; SSE2-NEXT: pmuludq %xmm0, %xmm4
-; SSE2-NEXT: paddq %xmm3, %xmm4
-; SSE2-NEXT: psllq $32, %xmm4
-; SSE2-NEXT: pmuludq %xmm2, %xmm0
+; SSE2-NEXT: psrlq $33, %xmm2
+; SSE2-NEXT: pmuludq %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm4, %xmm3
+; SSE2-NEXT: psrlq $32, %xmm3
+; SSE2-NEXT: pmuludq %xmm0, %xmm3
+; SSE2-NEXT: paddq %xmm2, %xmm3
+; SSE2-NEXT: psllq $32, %xmm3
+; SSE2-NEXT: pmuludq %xmm4, %xmm0
; SSE2-NEXT: paddq %xmm1, %xmm0
-; SSE2-NEXT: paddq %xmm4, %xmm0
+; SSE2-NEXT: paddq %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: vec128_i64_signed_mem_mem:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa (%rdi), %xmm1
-; SSE41-NEXT: movdqa (%rsi), %xmm2
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: pxor %xmm0, %xmm3
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE41-NEXT: movdqa (%rsi), %xmm0
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: psubq %xmm0, %xmm3
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm6
-; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psubq %xmm2, %xmm3
-; SSE41-NEXT: psubq %xmm1, %xmm2
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movapd %xmm2, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE41-NEXT: por %xmm0, %xmm2
+; SSE41-NEXT: pmovsxbq {{.*#+}} xmm4 = [1,1]
+; SSE41-NEXT: por %xmm2, %xmm4
+; SSE41-NEXT: pxor %xmm2, %xmm3
+; SSE41-NEXT: psubq %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: psrlq $1, %xmm0
; SSE41-NEXT: psrlq $33, %xmm2
-; SSE41-NEXT: pmuludq %xmm6, %xmm2
-; SSE41-NEXT: movdqa %xmm6, %xmm3
+; SSE41-NEXT: pmuludq %xmm4, %xmm2
+; SSE41-NEXT: movdqa %xmm4, %xmm3
; SSE41-NEXT: psrlq $32, %xmm3
; SSE41-NEXT: pmuludq %xmm0, %xmm3
; SSE41-NEXT: paddq %xmm2, %xmm3
; SSE41-NEXT: psllq $32, %xmm3
-; SSE41-NEXT: pmuludq %xmm6, %xmm0
+; SSE41-NEXT: pmuludq %xmm4, %xmm0
; SSE41-NEXT: paddq %xmm1, %xmm0
; SSE41-NEXT: paddq %xmm3, %xmm0
; SSE41-NEXT: retq
@@ -1699,9 +1648,9 @@ define <2 x i64> @vec128_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX-NEXT: vmovdqa (%rsi), %xmm1
; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
-; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1
+; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpsrlq $1, %xmm1, %xmm2
; AVX-NEXT: vpsrlq $33, %xmm1, %xmm1
; AVX-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
@@ -1720,9 +1669,9 @@ define <2 x i64> @vec128_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; XOP-NEXT: vmovdqa (%rsi), %xmm1
; XOP-NEXT: vpcomgtq %xmm1, %xmm0, %xmm2
; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
-; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm1
-; XOP-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1
+; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; XOP-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; XOP-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpsrlq $1, %xmm1, %xmm2
; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1
; XOP-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
@@ -2389,10 +2338,8 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psubb %xmm1, %xmm4
-; SSE2-NEXT: psubb %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm1, %xmm3
-; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: pxor %xmm3, %xmm4
+; SSE2-NEXT: psubb %xmm4, %xmm3
; SSE2-NEXT: psrlw $1, %xmm3
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm1
@@ -2852,10 +2799,8 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psubb %xmm1, %xmm4
-; SSE2-NEXT: psubb %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm1, %xmm3
-; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: pxor %xmm3, %xmm4
+; SSE2-NEXT: psubb %xmm4, %xmm3
; SSE2-NEXT: psrlw $1, %xmm3
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm1
@@ -3083,30 +3028,28 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind
define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind {
; SSE2-LABEL: vec128_i8_signed_reg_mem:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm3, %xmm2
+; SSE2-NEXT: movdqa (%rdi), %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: psubb %xmm3, %xmm4
-; SSE2-NEXT: psubb %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm2, %xmm4
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: psrlw $1, %xmm2
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psubb %xmm2, %xmm4
+; SSE2-NEXT: pxor %xmm3, %xmm4
+; SSE2-NEXT: psubb %xmm4, %xmm3
+; SSE2-NEXT: psrlw $1, %xmm3
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: pmullw %xmm3, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pmullw %xmm2, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm2, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pmullw %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pmullw %xmm3, %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: packuswb %xmm4, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: retq
@@ -3321,30 +3264,28 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; SSE2-LABEL: vec128_i8_signed_mem_mem:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm1
-; SSE2-NEXT: movdqa (%rsi), %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm3, %xmm2
+; SSE2-NEXT: movdqa (%rsi), %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: psubb %xmm3, %xmm4
-; SSE2-NEXT: psubb %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm2, %xmm4
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: psrlw $1, %xmm2
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psubb %xmm2, %xmm4
+; SSE2-NEXT: pxor %xmm3, %xmm4
+; SSE2-NEXT: psubb %xmm4, %xmm3
+; SSE2-NEXT: psrlw $1, %xmm3
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: pmullw %xmm3, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pmullw %xmm2, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm2, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pmullw %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pmullw %xmm3, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: packuswb %xmm4, %xmm0
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
index cc08396..92060ae 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
@@ -390,12 +390,12 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm6
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vblendvpd %xmm5, %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm6
-; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm6
; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm7
; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1
@@ -427,9 +427,9 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3
-; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm4
-; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm1
-; AVX2-NEXT: vblendvpd %ymm2, %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm2
; AVX2-NEXT: vpsrlq $33, %ymm1, %ymm1
; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
@@ -448,12 +448,12 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOP-NEXT: vpcomgtq %xmm2, %xmm3, %xmm4
; XOP-NEXT: vpcomgtq %xmm1, %xmm0, %xmm5
-; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm6
-; XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm1
-; XOP-NEXT: vblendvpd %xmm5, %xmm6, %xmm1, %xmm1
-; XOP-NEXT: vpsubq %xmm2, %xmm3, %xmm6
-; XOP-NEXT: vpsubq %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vblendvpd %xmm4, %xmm6, %xmm2, %xmm2
+; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; XOP-NEXT: vpxor %xmm5, %xmm1, %xmm1
+; XOP-NEXT: vpsubq %xmm1, %xmm5, %xmm1
+; XOP-NEXT: vpsubq %xmm2, %xmm3, %xmm2
+; XOP-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; XOP-NEXT: vpsubq %xmm2, %xmm4, %xmm2
; XOP-NEXT: vpsrlq $1, %xmm2, %xmm6
; XOP-NEXT: vpsrlq $1, %xmm1, %xmm7
; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1
@@ -561,25 +561,25 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun
; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm6
; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm6
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm6
-; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm3
-; AVX1-NEXT: vblendvpd %xmm5, %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm3
+; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpsubq %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpsrlq $1, %xmm3, %xmm6
; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm7
+; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1
; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1]
; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm4
-; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1
; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm9
; AVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm9
; AVX1-NEXT: vpaddq %xmm1, %xmm9, %xmm1
; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX1-NEXT: vpmuludq %xmm4, %xmm7, %xmm4
-; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5
; AVX1-NEXT: vpsrlq $33, %xmm3, %xmm3
+; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5
; AVX1-NEXT: vpmuludq %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm7
; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm7
@@ -601,9 +601,9 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3
-; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm4
-; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm1
-; AVX2-NEXT: vblendvpd %ymm2, %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm2
; AVX2-NEXT: vpsrlq $33, %ymm1, %ymm1
; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
@@ -622,12 +622,12 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOP-NEXT: vpcomgtuq %xmm2, %xmm3, %xmm4
; XOP-NEXT: vpcomgtuq %xmm1, %xmm0, %xmm5
-; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm6
-; XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm1
-; XOP-NEXT: vblendvpd %xmm5, %xmm6, %xmm1, %xmm1
-; XOP-NEXT: vpsubq %xmm2, %xmm3, %xmm6
-; XOP-NEXT: vpsubq %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vblendvpd %xmm4, %xmm6, %xmm2, %xmm2
+; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; XOP-NEXT: vpxor %xmm5, %xmm1, %xmm1
+; XOP-NEXT: vpsubq %xmm1, %xmm5, %xmm1
+; XOP-NEXT: vpsubq %xmm2, %xmm3, %xmm2
+; XOP-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; XOP-NEXT: vpsubq %xmm2, %xmm4, %xmm2
; XOP-NEXT: vpsrlq $1, %xmm2, %xmm6
; XOP-NEXT: vpsrlq $1, %xmm1, %xmm7
; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1
@@ -732,12 +732,12 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
-; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm6
-; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vblendvpd %xmm5, %xmm6, %xmm0, %xmm0
-; AVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm6
-; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm6
; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm7
; AVX1-NEXT: vpsrlq $33, %xmm0, %xmm0
@@ -770,9 +770,9 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3
-; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm4
-; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vblendvpd %ymm2, %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpsrlq $1, %ymm0, %ymm2
; AVX2-NEXT: vpsrlq $33, %ymm0, %ymm0
; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm0
@@ -792,12 +792,12 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin
; XOP-NEXT: vmovdqa 16(%rdi), %xmm3
; XOP-NEXT: vpcomgtq %xmm1, %xmm3, %xmm4
; XOP-NEXT: vpcomgtq %xmm0, %xmm2, %xmm5
-; XOP-NEXT: vpsubq %xmm0, %xmm2, %xmm6
-; XOP-NEXT: vpsubq %xmm2, %xmm0, %xmm0
-; XOP-NEXT: vblendvpd %xmm5, %xmm6, %xmm0, %xmm0
-; XOP-NEXT: vpsubq %xmm1, %xmm3, %xmm6
-; XOP-NEXT: vpsubq %xmm3, %xmm1, %xmm1
-; XOP-NEXT: vblendvpd %xmm4, %xmm6, %xmm1, %xmm1
+; XOP-NEXT: vpsubq %xmm0, %xmm2, %xmm0
+; XOP-NEXT: vpxor %xmm5, %xmm0, %xmm0
+; XOP-NEXT: vpsubq %xmm0, %xmm5, %xmm0
+; XOP-NEXT: vpsubq %xmm1, %xmm3, %xmm1
+; XOP-NEXT: vpxor %xmm4, %xmm1, %xmm1
+; XOP-NEXT: vpsubq %xmm1, %xmm4, %xmm1
; XOP-NEXT: vpsrlq $1, %xmm1, %xmm6
; XOP-NEXT: vpsrlq $1, %xmm0, %xmm7
; XOP-NEXT: vpsrlq $33, %xmm0, %xmm0
@@ -902,12 +902,12 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm6
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vblendvpd %xmm5, %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm6
-; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm6
; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm7
; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1
@@ -940,9 +940,9 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3
-; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm4
-; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm1
-; AVX2-NEXT: vblendvpd %ymm2, %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm2
; AVX2-NEXT: vpsrlq $33, %ymm1, %ymm1
; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
@@ -962,12 +962,12 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOP-NEXT: vpcomgtq %xmm2, %xmm3, %xmm4
; XOP-NEXT: vpcomgtq %xmm1, %xmm0, %xmm5
-; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm6
-; XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm1
-; XOP-NEXT: vblendvpd %xmm5, %xmm6, %xmm1, %xmm1
-; XOP-NEXT: vpsubq %xmm2, %xmm3, %xmm6
-; XOP-NEXT: vpsubq %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vblendvpd %xmm4, %xmm6, %xmm2, %xmm2
+; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; XOP-NEXT: vpxor %xmm5, %xmm1, %xmm1
+; XOP-NEXT: vpsubq %xmm1, %xmm5, %xmm1
+; XOP-NEXT: vpsubq %xmm2, %xmm3, %xmm2
+; XOP-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; XOP-NEXT: vpsubq %xmm2, %xmm4, %xmm2
; XOP-NEXT: vpsrlq $1, %xmm2, %xmm6
; XOP-NEXT: vpsrlq $1, %xmm1, %xmm7
; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1
@@ -1073,12 +1073,12 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
-; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm6
-; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vblendvpd %xmm5, %xmm6, %xmm0, %xmm0
-; AVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm6
-; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm6
; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm7
; AVX1-NEXT: vpsrlq $33, %xmm0, %xmm0
@@ -1112,9 +1112,9 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3
-; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm4
-; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm1
-; AVX2-NEXT: vblendvpd %ymm2, %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm2
; AVX2-NEXT: vpsrlq $33, %ymm1, %ymm1
; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
@@ -1135,12 +1135,12 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; XOP-NEXT: vmovdqa 16(%rdi), %xmm3
; XOP-NEXT: vpcomgtq %xmm1, %xmm3, %xmm4
; XOP-NEXT: vpcomgtq %xmm0, %xmm2, %xmm5
-; XOP-NEXT: vpsubq %xmm0, %xmm2, %xmm6
-; XOP-NEXT: vpsubq %xmm2, %xmm0, %xmm0
-; XOP-NEXT: vblendvpd %xmm5, %xmm6, %xmm0, %xmm0
-; XOP-NEXT: vpsubq %xmm1, %xmm3, %xmm6
-; XOP-NEXT: vpsubq %xmm3, %xmm1, %xmm1
-; XOP-NEXT: vblendvpd %xmm4, %xmm6, %xmm1, %xmm1
+; XOP-NEXT: vpsubq %xmm0, %xmm2, %xmm0
+; XOP-NEXT: vpxor %xmm5, %xmm0, %xmm0
+; XOP-NEXT: vpsubq %xmm0, %xmm5, %xmm0
+; XOP-NEXT: vpsubq %xmm1, %xmm3, %xmm1
+; XOP-NEXT: vpxor %xmm4, %xmm1, %xmm1
+; XOP-NEXT: vpsubq %xmm1, %xmm4, %xmm1
; XOP-NEXT: vpsrlq $1, %xmm1, %xmm6
; XOP-NEXT: vpsrlq $1, %xmm0, %xmm7
; XOP-NEXT: vpsrlq $33, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/misched-critical-path.ll b/llvm/test/CodeGen/X86/misched-critical-path.ll
new file mode 100644
index 0000000..2a95aaa
--- /dev/null
+++ b/llvm/test/CodeGen/X86/misched-critical-path.ll
@@ -0,0 +1,35 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin8 -misched-print-dags -o - 2>&1 > /dev/null | FileCheck %s
+; REQUIRES: asserts
+
+@sc = common global i8 0
+@uc = common global i8 0
+@ui = common global i32 0
+
+; Regression Test for PR92368.
+;
+; CHECK: SU(8): CMP8rr %4:gr8, %3:gr8, implicit-def $eflags
+; CHECK: Predecessors:
+; CHECK-NEXT: SU(6): Data Latency=0 Reg=%4
+; CHECK-NEXT: SU(7): Out Latency=0
+; CHECK-NEXT: SU(5): Out Latency=0
+; CHECK-NEXT: SU(3): Data Latency=4 Reg=%3
+define void @misched_bug() nounwind {
+entry:
+ %v0 = load i8, ptr @sc, align 1
+ %v1 = zext i8 %v0 to i32
+ %v2 = load i8, ptr @uc, align 1
+ %v3 = zext i8 %v2 to i32
+ %v4 = trunc i32 %v3 to i8
+ %v5 = trunc i32 %v1 to i8
+ %pair74 = cmpxchg ptr @sc, i8 %v4, i8 %v5 monotonic monotonic
+ %v6 = extractvalue { i8, i1 } %pair74, 0
+ %v7 = icmp eq i8 %v6, %v4
+ %v8 = zext i1 %v7 to i8
+ %v9 = zext i8 %v8 to i32
+ store i32 %v9, ptr @ui, align 4
+ br label %return
+
+return:                                           ; preds = %entry
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
index 43589dc..3f57a03 100644
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -197,8 +197,6 @@
; CHECK-NEXT: BreakFalseDeps
; CHECK-NEXT: X86 Indirect Branch Tracking
; CHECK-NEXT: X86 vzeroupper inserter
-; CHECK-NEXT: MachineDominator Tree Construction
-; CHECK-NEXT: Machine Natural Loop Construction
; CHECK-NEXT: Lazy Machine Block Frequency Analysis
; CHECK-NEXT: X86 Byte/Word Instruction Fixup
; CHECK-NEXT: Lazy Machine Block Frequency Analysis
diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll
index dcded7a..1f82c4a 100644
--- a/llvm/test/CodeGen/X86/pmul.ll
+++ b/llvm/test/CodeGen/X86/pmul.ll
@@ -1173,13 +1173,14 @@ define <4 x i32> @mul_v4i64_zero_lower(<4 x i32> %val1, <4 x i64> %val2) {
;
; SSE41-LABEL: mul_v4i64_zero_lower:
; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
; SSE41-NEXT: psrlq $32, %xmm2
-; SSE41-NEXT: pmuludq %xmm3, %xmm2
+; SSE41-NEXT: pmuludq %xmm0, %xmm2
; SSE41-NEXT: psrlq $32, %xmm1
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; SSE41-NEXT: pmuludq %xmm1, %xmm0
-; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; SSE41-NEXT: pmuludq %xmm1, %xmm3
+; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2]
+; SSE41-NEXT: movaps %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v4i64_zero_lower:
diff --git a/llvm/test/CodeGen/X86/pr59305.ll b/llvm/test/CodeGen/X86/pr59305.ll
index 4d59192..46c9da5 100644
--- a/llvm/test/CodeGen/X86/pr59305.ll
+++ b/llvm/test/CodeGen/X86/pr59305.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-pc-linux < %s | FileCheck %s --check-prefix=X64
-; RUN: llc -mtriple=i686-pc-linux < %s | FileCheck %s --check-prefix=X86
+; RUN: sed -e "s/SETROUND/ldmxcsr/g" %s | llc -mtriple=x86_64-pc-linux - | FileCheck %s --check-prefix=X64
+; RUN: sed -e "s/SETROUND/fldcw/g" %s | llc -mtriple=i686-pc-linux - | FileCheck %s --check-prefix=X86
define double @foo(double %0) #0 {
; X64-LABEL: foo:
@@ -74,6 +74,71 @@ define double @foo(double %0) #0 {
ret double %8
}
+define double @bar(double %0) #0 {
+; X64-LABEL: bar:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: #APP
+; X64-NEXT: ldmxcsr 0
+; X64-NEXT: #NO_APP
+; X64-NEXT: wait
+; X64-NEXT: movsd {{.*#+}} xmm2 = [1.0E+0,0.0E+0]
+; X64-NEXT: movapd %xmm2, %xmm3
+; X64-NEXT: divsd %xmm0, %xmm3
+; X64-NEXT: #APP
+; X64-NEXT: ldmxcsr 0
+; X64-NEXT: #NO_APP
+; X64-NEXT: wait
+; X64-NEXT: movapd %xmm2, %xmm1
+; X64-NEXT: divsd %xmm0, %xmm1
+; X64-NEXT: #APP
+; X64-NEXT: ldmxcsr 0
+; X64-NEXT: #NO_APP
+; X64-NEXT: wait
+; X64-NEXT: divsd %xmm0, %xmm2
+; X64-NEXT: movapd %xmm3, %xmm0
+; X64-NEXT: callq fma@PLT
+; X64-NEXT: popq %rax
+; X64-NEXT: retq
+;
+; X86-LABEL: bar:
+; X86: # %bb.0:
+; X86-NEXT: subl $28, %esp
+; X86-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-NEXT: #APP
+; X86-NEXT: fldcw 0
+; X86-NEXT: #NO_APP
+; X86-NEXT: fld1
+; X86-NEXT: fld %st(0)
+; X86-NEXT: fdiv %st(2), %st
+; X86-NEXT: #APP
+; X86-NEXT: fldcw 0
+; X86-NEXT: #NO_APP
+; X86-NEXT: fld %st(1)
+; X86-NEXT: fdiv %st(3), %st
+; X86-NEXT: #APP
+; X86-NEXT: fldcw 0
+; X86-NEXT: #NO_APP
+; X86-NEXT: fxch %st(2)
+; X86-NEXT: fdivp %st, %st(3)
+; X86-NEXT: fxch %st(2)
+; X86-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-NEXT: fstpl (%esp)
+; X86-NEXT: wait
+; X86-NEXT: calll fma
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: retl
+ call void asm sideeffect "SETROUND $0", "*m,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) null)
+ %2 = call double @llvm.experimental.constrained.fdiv.f64(double 1.000000e+00, double %0, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
+ call void asm sideeffect "SETROUND $0", "*m,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) null)
+ %3 = call double @llvm.experimental.constrained.fdiv.f64(double 1.000000e+00, double %0, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
+ call void asm sideeffect "SETROUND $0", "*m,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) null)
+ %4 = call double @llvm.experimental.constrained.fdiv.f64(double 1.000000e+00, double %0, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
+ %5 = call double @llvm.experimental.constrained.fma.f64(double %2, double %3, double %4, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
+ ret double %5
+}
+
declare i32 @fesetround(i32) #0
declare double @llvm.experimental.constrained.fdiv.f64(double, double, metadata, metadata) #0
declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata) #0
diff --git a/llvm/test/CodeGen/X86/pr90703.ll b/llvm/test/CodeGen/X86/pr90703.ll
new file mode 100644
index 0000000..c02342f
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr90703.ll
@@ -0,0 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+bmi | FileCheck %s
+
+define i64 @pr90730(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: pr90730:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movabsq $33181731808, %rax # imm = 0x7B9C90BE0
+; CHECK-NEXT: andnq %rax, %rdi, %rax
+; CHECK-NEXT: movq $0, (%rdx)
+; CHECK-NEXT: retq
+entry:
+ %ext = and i64 %y, 1
+ %xor1 = xor i64 %ext, 33181731817
+ %and1 = and i64 %xor1, %x
+ store i64 %and1, ptr %p, align 4
+ %v = load i64, ptr %p, align 4
+ %and2 = and i64 %v, 33181731808
+ %xor2 = xor i64 %and2, 33181731808
+ store i64 0, ptr %p, align 4
+ ret i64 %xor2
+}
diff --git a/llvm/test/CodeGen/X86/pr90844.ll b/llvm/test/CodeGen/X86/pr90844.ll
index 6feece7..b250c3f 100644
--- a/llvm/test/CodeGen/X86/pr90844.ll
+++ b/llvm/test/CodeGen/X86/pr90844.ll
@@ -17,3 +17,20 @@ entry:
store <2 x i64> %5, ptr poison, align 16
ret void
}
+
+define void @foo(ptr %0) {
+; CHECK-LABEL: foo:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; CHECK-NEXT: vpxor 32(%rdi), %ymm0, %ymm1
+; CHECK-NEXT: vpxor (%rdi), %ymm0, %ymm0
+; CHECK-NEXT: vmovdqa %ymm0, (%rdi)
+; CHECK-NEXT: vmovdqa %ymm1, 32(%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %1 = load <32 x half>, ptr %0
+ %2 = fneg <32 x half> %1
+ store <32 x half> %2, ptr %0
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/pr92720.ll b/llvm/test/CodeGen/X86/pr92720.ll
new file mode 100644
index 0000000..b2543c0
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr92720.ll
@@ -0,0 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s
+
+; Make sure we don't crash when shrinking the shift amount before legalization.
+define i64 @pr92720(i64 %x) {
+; CHECK-LABEL: pr92720:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movabsq $8589934592, %rax # imm = 0x200000000
+; CHECK-NEXT: retq
+ %or = or i64 %x, 255
+ %sub = sub i64 0, %or
+ %shl = shl i64 1, %sub
+ %sext = shl i64 %shl, 32
+ ret i64 %sext
+}
diff --git a/llvm/test/CodeGen/X86/pr93000.ll b/llvm/test/CodeGen/X86/pr93000.ll
new file mode 100644
index 0000000..0bd5da4
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr93000.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc < %s -mtriple=x86_64- -mcpu=x86-64-v4 | FileCheck %s
+
+define void @PR93000(ptr %a0, ptr %a1, ptr %a2, <32 x i16> %a3) {
+; CHECK-LABEL: PR93000:
+; CHECK: # %bb.0: # %Entry
+; CHECK-NEXT: movl (%rdi), %eax
+; CHECK-NEXT: addq $4, %rdi
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB0_1: # %Loop
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: kmovd %eax, %k1
+; CHECK-NEXT: knotd %k1, %k2
+; CHECK-NEXT: vpblendmw (%rsi), %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqu16 (%rdx), %zmm1 {%k2}
+; CHECK-NEXT: vmovdqu64 %zmm1, (%rsi)
+; CHECK-NEXT: movl (%rdi), %eax
+; CHECK-NEXT: addq $4, %rdi
+; CHECK-NEXT: testl %eax, %eax
+; CHECK-NEXT: jne .LBB0_1
+; CHECK-NEXT: # %bb.2: # %Then
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+Entry:
+ %pre = load i32, ptr %a0, align 4
+ br label %Loop
+
+Loop: ; preds = %Loop, %Entry
+ %p = phi i32 [ %limit, %Loop ], [ %pre, %Entry ]
+ %lsr.iv.pn = phi ptr [ %lsr.iv, %Loop ], [ %a0, %Entry ]
+ %lsr.iv = getelementptr i8, ptr %lsr.iv.pn, i64 4
+ %pn = xor i32 %p, -1
+ %m = bitcast i32 %p to <32 x i1>
+ %mn = bitcast i32 %pn to <32 x i1>
+ %mload0 = tail call <32 x i16> @llvm.masked.load.v32i16.p0(ptr %a1, i32 2, <32 x i1> %m, <32 x i16> %a3)
+ %mload1 = tail call <32 x i16> @llvm.masked.load.v32i16.p0(ptr %a2, i32 2, <32 x i1> %mn, <32 x i16> %mload0)
+ store <32 x i16> %mload1, ptr %a1, align 2
+ %limit = load i32, ptr %lsr.iv, align 4
+ %icmp = icmp eq i32 %limit, 0
+ br i1 %icmp, label %Then, label %Loop
+
+Then: ; preds = %Loop
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/prefetch.ll b/llvm/test/CodeGen/X86/prefetch.ll
index 404d49b..c10e052 100644
--- a/llvm/test/CodeGen/X86/prefetch.ll
+++ b/llvm/test/CodeGen/X86/prefetch.ll
@@ -6,9 +6,6 @@
; RUN: llc < %s -mtriple=i686-- -mcpu=slm | FileCheck %s -check-prefix=X86-PRFCHWSSE
; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 | FileCheck %s -check-prefix=X86-PRFCHWSSE
; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 -mattr=-prfchw | FileCheck %s -check-prefix=X86-SSE
-; RUN: llc < %s -mtriple=i686-- -mattr=+sse,+prefetchwt1 | FileCheck %s -check-prefix=X86-PREFETCHWT1
-; RUN: llc < %s -mtriple=i686-- -mattr=-sse,+prefetchwt1 | FileCheck %s -check-prefix=X86-PREFETCHWT1
-; RUN: llc < %s -mtriple=i686-- -mattr=-sse,+3dnow,+prefetchwt1 | FileCheck %s -check-prefix=X86-PREFETCHWT1
; RUN: llc < %s -mtriple=i686-- -mattr=+3dnow | FileCheck %s -check-prefix=X86-3DNOW
; RUN: llc < %s -mtriple=i686-- -mattr=+3dnow,+prfchw | FileCheck %s -check-prefix=X86-3DNOW
@@ -16,7 +13,6 @@
; 3dnow by itself get you just the single prefetch instruction with no hints
; sse provides prefetch0/1/2/nta
; supporting prefetchw, but not 3dnow implicitly provides prefetcht0/1/2/nta regardless of sse setting as we need something to fall back to for the non-write hint.
-; supporting prefetchwt1 implies prefetcht0/1/2/nta and prefetchw regardless of other settings. this allows levels for non-write and gives us an instruction for write+T0
; 3dnow prefetch instruction will only get used if you have no other prefetch instructions enabled
; rdar://10538297
@@ -48,19 +44,6 @@ define void @t(ptr %ptr) nounwind {
; X86-PRFCHWSSE-NEXT: prefetchw (%eax)
; X86-PRFCHWSSE-NEXT: retl
;
-; X86-PREFETCHWT1-LABEL: t:
-; X86-PREFETCHWT1: # %bb.0: # %entry
-; X86-PREFETCHWT1-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-PREFETCHWT1-NEXT: prefetcht2 (%eax)
-; X86-PREFETCHWT1-NEXT: prefetcht1 (%eax)
-; X86-PREFETCHWT1-NEXT: prefetcht0 (%eax)
-; X86-PREFETCHWT1-NEXT: prefetchnta (%eax)
-; X86-PREFETCHWT1-NEXT: prefetchwt1 (%eax)
-; X86-PREFETCHWT1-NEXT: prefetchwt1 (%eax)
-; X86-PREFETCHWT1-NEXT: prefetchw (%eax)
-; X86-PREFETCHWT1-NEXT: prefetchwt1 (%eax)
-; X86-PREFETCHWT1-NEXT: retl
-;
; X86-3DNOW-LABEL: t:
; X86-3DNOW: # %bb.0: # %entry
; X86-3DNOW-NEXT: movl {{[0-9]+}}(%esp), %eax
diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll
index 62051d1..f3f7f05 100644
--- a/llvm/test/CodeGen/X86/shrink_vmul.ll
+++ b/llvm/test/CodeGen/X86/shrink_vmul.ll
@@ -1863,7 +1863,7 @@ define void @mul_2xi16_varconst3(ptr nocapture readonly %a, i64 %index) {
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl c, %edx
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: psrld $16, %xmm0
+; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: psllq $32, %xmm0
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
@@ -1884,7 +1884,7 @@ define void @mul_2xi16_varconst3(ptr nocapture readonly %a, i64 %index) {
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq c(%rip), %rax
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SSE-NEXT: psrld $16, %xmm0
+; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: psllq $32, %xmm0
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
diff --git a/llvm/test/CodeGen/X86/speculative-load-hardening-gather.ll b/llvm/test/CodeGen/X86/speculative-load-hardening-gather.ll
index 6e89445..7b36674 100644
--- a/llvm/test/CodeGen/X86/speculative-load-hardening-gather.ll
+++ b/llvm/test/CodeGen/X86/speculative-load-hardening-gather.ll
@@ -558,28 +558,6 @@ entry:
ret <8 x i64> %v
}
-declare void @llvm.x86.avx512.gatherpf.qps.512(i8, <8 x i64>, ptr, i32, i32);
-
-define void @test_llvm_x86_avx512_gatherpf_qps_512(<8 x i64> %iv, ptr %b) #1 {
-; CHECK-LABEL: test_llvm_x86_avx512_gatherpf_qps_512:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movq %rsp, %rax
-; CHECK-NEXT: movq $-1, %rcx
-; CHECK-NEXT: sarq $63, %rax
-; CHECK-NEXT: kxnorw %k0, %k0, %k1
-; CHECK-NEXT: orq %rax, %rdi
-; CHECK-NEXT: vpbroadcastq %rax, %zmm1
-; CHECK-NEXT: vporq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vgatherpf0qps (%rdi,%zmm0,4) {%k1}
-; CHECK-NEXT: shlq $47, %rax
-; CHECK-NEXT: orq %rax, %rsp
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
-entry:
- call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %iv, ptr %b, i32 4, i32 3)
- ret void
-}
-
declare <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float>, ptr, <4 x i32>, i8, i32)
define <4 x float> @test_llvm_x86_avx512_gather3siv4_sf(ptr %b, <4 x i32> %iv) #2 {
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll
index e4eca6b..ed7109c 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll
@@ -265,7 +265,6 @@ define i32 @stack_fold_fpclassph_mask(<32 x half> %a0, ptr %p) {
}
define i8 @stack_fold_fpclasssh(<8 x half> %a0) {
- ;CHECK-LABEl: stack_fold_fpclasssh:
; CHECK-LABEL: stack_fold_fpclasssh:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
diff --git a/llvm/test/CodeGen/X86/stack-frame-layout-remarks.ll b/llvm/test/CodeGen/X86/stack-frame-layout-remarks.ll
index d32a37e..cd5edcf 100644
--- a/llvm/test/CodeGen/X86/stack-frame-layout-remarks.ll
+++ b/llvm/test/CodeGen/X86/stack-frame-layout-remarks.ll
@@ -35,7 +35,7 @@ entry:
declare void @llvm.dbg.declare(metadata, metadata, metadata) #0
; BOTH: Function: cleanup_array
-; BOTH-Next: Offset: [SP+4], Type: Protector, Align: 16, Size: 4
+; BOTH-NEXT: Offset: [SP+4], Type: Protector, Align: 16, Size: 4
; DEBUG: a @ dot.c:13
; STRIPPED-NOT: a @ dot.c:13
; BOTH: Offset: [SP-4], Type: Spill, Align: 8, Size: 4
diff --git a/llvm/test/CodeGen/X86/unfoldMemoryOperand.mir b/llvm/test/CodeGen/X86/unfoldMemoryOperand.mir
index 4c715b8..af57d97 100644
--- a/llvm/test/CodeGen/X86/unfoldMemoryOperand.mir
+++ b/llvm/test/CodeGen/X86/unfoldMemoryOperand.mir
@@ -23,7 +23,7 @@
br i1 %6, label %4, label %5, !llvm.loop !9
}
- attributes #0 = { nofree norecurse nosync nounwind uwtable writeonly mustprogress "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sse,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="generic" }
+ attributes #0 = { nofree norecurse nosync nounwind uwtable writeonly mustprogress "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sse,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="generic" }
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
diff --git a/llvm/test/DebugInfo/X86/debug-names-types.ll b/llvm/test/DebugInfo/X86/debug-names-types.ll
index ff0d4d5..81016e3 100644
--- a/llvm/test/DebugInfo/X86/debug-names-types.ll
+++ b/llvm/test/DebugInfo/X86/debug-names-types.ll
@@ -48,11 +48,6 @@
; CHECK-NEXT: DW_IDX_die_offset: DW_FORM_ref4
; CHECK-NEXT: DW_IDX_parent: DW_FORM_flag_present
; CHECK-NEXT: }
-; CHECK-NEXT: Abbreviation [[ABBREV1:0x[0-9a-f]*]] {
-; CHECK-NEXT: Tag: DW_TAG_structure_type
-; CHECK-NEXT: DW_IDX_die_offset: DW_FORM_ref4
-; CHECK-NEXT: DW_IDX_parent: DW_FORM_flag_present
-; CHECK-NEXT: }
; CHECK-NEXT: Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
; CHECK-NEXT: Tag: DW_TAG_subprogram
; CHECK-NEXT: DW_IDX_die_offset: DW_FORM_ref4
@@ -88,12 +83,6 @@
; CHECK-NEXT: DW_IDX_die_offset: 0x00000023
; CHECK-NEXT: DW_IDX_parent: <parent not indexed>
; CHECK-NEXT: }
-; CHECK-NEXT: Entry @ {{.+}} {
-; CHECK-NEXT: Abbrev: [[ABBREV1]]
-; CHECK-NEXT: Tag: DW_TAG_structure_type
-; CHECK-NEXT: DW_IDX_die_offset: 0x00000042
-; CHECK-NEXT: DW_IDX_parent: <parent not indexed>
-; CHECK-NEXT: }
; CHECK-NEXT: }
; CHECK-NEXT: ]
; CHECK-NEXT: Bucket 2 [
@@ -130,7 +119,7 @@
; CHECK-SPLIT: Foreign TU count: 1
; CHECK-SPLIT-NEXT: Bucket count: 4
; CHECK-SPLIT-NEXT: Name count: 4
-; CHECK-SPLIT-NEXT: Abbreviations table size: 0x2D
+; CHECK-SPLIT-NEXT: Abbreviations table size: 0x25
; CHECK-SPLIT-NEXT: Augmentation: 'LLVM0700'
; CHECK-SPLIT-NEXT: }
; CHECK-SPLIT-NEXT: Compilation Unit offsets [
@@ -151,11 +140,6 @@
; CHECK-SPLIT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
; CHECK-SPLIT-NEXT: DW_IDX_parent: DW_FORM_flag_present
; CHECK-SPLIT-NEXT: }
-; CHECK-SPLIT-NEXT: Abbreviation [[ABBREV:0x[0-9a-f]*]] {
-; CHECK-SPLIT-NEXT: Tag: DW_TAG_structure_type
-; CHECK-SPLIT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
-; CHECK-SPLIT-NEXT: DW_IDX_parent: DW_FORM_flag_present
-; CHECK-SPLIT-NEXT: }
; CHECK-SPLIT-NEXT: Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
; CHECK-SPLIT-NEXT: Tag: DW_TAG_subprogram
; CHECK-SPLIT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
@@ -191,12 +175,6 @@
; CHECK-SPLIT-NEXT: DW_IDX_die_offset: 0x00000021
; CHECK-SPLIT-NEXT: DW_IDX_parent: <parent not indexed>
; CHECK-SPLIT-NEXT: }
-; CHECK-SPLIT-NEXT: Entry @ {{.*}} {
-; CHECK-SPLIT-NEXT: Abbrev: [[ABBREV]]
-; CHECK-SPLIT-NEXT: Tag: DW_TAG_structure_type
-; CHECK-SPLIT-NEXT: DW_IDX_die_offset: 0x00000039
-; CHECK-SPLIT-NEXT: DW_IDX_parent: <parent not indexed>
-; CHECK-SPLIT-NEXT: }
; CHECK-SPLIT-NEXT: }
; CHECK-SPLIT-NEXT: ]
; CHECK-SPLIT-NEXT: Bucket 2 [
diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll b/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll
new file mode 100644
index 0000000..c0e370f2
--- /dev/null
+++ b/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll
@@ -0,0 +1,15 @@
+; Test that HWASan removes writeonly and memory(*) attributes from instrumented functions.
+; RUN: opt -S -passes=hwasan %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64-unknown-linux-android30"
+
+; CHECK: define dso_local void @test_writeonly(ptr nocapture noundef %p) local_unnamed_addr #0
+define dso_local void @test_writeonly(ptr nocapture noundef writeonly %p) local_unnamed_addr #0 {
+entry:
+ store i32 42, ptr %p, align 4
+ ret void
+}
+
+; CHECK: attributes #0 = { sanitize_hwaddress uwtable }
+attributes #0 = { sanitize_hwaddress memory(argmem: write) uwtable }
diff --git a/llvm/test/Linker/darwin-target-variant.ll b/llvm/test/Linker/darwin-target-variant.ll
new file mode 100644
index 0000000..7d46b2d
--- /dev/null
+++ b/llvm/test/Linker/darwin-target-variant.ll
@@ -0,0 +1,42 @@
+; RUN: rm -rf %t && split-file %s %t
+; RUN: llvm-link %t/1.ll %t/2.ll -S -o - | FileCheck %s
+; CHECK: {i32 2, !"darwin.target_variant.triple", !"x86_64-apple-ios13.1-macabi"}
+
+; RUN: llvm-link %t/1.ll %t/old.ll -S -o - | FileCheck %s -check-prefix OLD
+; OLD: {i32 4, !"darwin.target_variant.triple", !"x86_64-apple-ios14.0-macabi"}
+
+;--- 1.ll
+target triple = "x86_64-apple-macos10.15";
+!llvm.module.flags = !{!0, !1, !2};
+!0 = !{i32 2, !"SDK Version", [3 x i32] [ i32 10, i32 15, i32 1 ] };
+!1 = !{i32 2, !"darwin.target_variant.triple", !"x86_64-apple-ios13.1-macabi"};
+!2 = !{i32 2, !"darwin.target_variant.SDK Version", [2 x i32] [ i32 13, i32 2 ] };
+
+define void @foo() {
+entry:
+ ret void
+}
+
+;--- 2.ll
+target triple = "x86_64-apple-macos10.15";
+!llvm.module.flags = !{!0, !1, !2};
+!0 = !{i32 2, !"SDK Version", [3 x i32] [ i32 10, i32 15, i32 1 ] };
+!1 = !{i32 2, !"darwin.target_variant.triple", !"x86_64-apple-ios14.0-macabi"};
+!2 = !{i32 2, !"darwin.target_variant.SDK Version", [2 x i32] [ i32 13, i32 2 ] };
+
+define void @bar() {
+entry:
+ ret void
+}
+
+;--- old.ll
+target triple = "x86_64-apple-macos10.15";
+!llvm.module.flags = !{!0, !1, !2};
+!0 = !{i32 2, !"SDK Version", [3 x i32] [ i32 10, i32 15, i32 1 ] };
+!1 = !{i32 4, !"darwin.target_variant.triple", !"x86_64-apple-ios14.0-macabi"};
+!2 = !{i32 2, !"darwin.target_variant.SDK Version", [2 x i32] [ i32 13, i32 2 ] };
+
+define void @old() {
+entry:
+ ret void
+}
diff --git a/llvm/test/MC/AArch64/FP8/system-regs.s b/llvm/test/MC/AArch64/FP8/system-regs.s
index 4a396d4..8959a77 100644
--- a/llvm/test/MC/AArch64/FP8/system-regs.s
+++ b/llvm/test/MC/AArch64/FP8/system-regs.s
@@ -1,11 +1,9 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+fpmr < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding < %s \
// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
-// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
-// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+fpmr < %s \
-// RUN: | llvm-objdump -d --mattr=+fpmr - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+fpmr < %s \
-// RUN: | llvm-objdump --mattr=-fpmr -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// RUN: llvm-mc -triple=aarch64 -filetype=obj < %s \
+// RUN: | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj < %s \
+// RUN: | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN
// --------------------------------------------------------------------------//
// read
@@ -13,14 +11,13 @@
mrs x3, FPMR
// CHECK-INST: mrs x3, FPMR
// CHECK-ENCODING: [0x43,0x44,0x3b,0xd5]
-// CHECK-ERROR: expected readable system register
-// CHECK-UNKNOWN: d53b4443 mrs x3, S3_3_C4_C4_2
+// CHECK-UNKNOWN: d53b4443 mrs x3, FPMR
+
mrs x3, ID_AA64FPFR0_EL1
// CHECK-INST: mrs x3, ID_AA64FPFR0_EL1
// CHECK-ENCODING: [0xe3,0x04,0x38,0xd5]
-// CHECK-ERROR: expected readable system register
-// CHECK-UNKNOWN: d53804e3 mrs x3, S3_0_C0_C4_7
+// CHECK-UNKNOWN: d53804e3 mrs x3, ID_AA64FPFR0_EL1
// --------------------------------------------------------------------------//
// write
@@ -28,5 +25,4 @@ mrs x3, ID_AA64FPFR0_EL1
msr FPMR, x3
// CHECK-INST: msr FPMR, x3
// CHECK-ENCODING: [0x43,0x44,0x1b,0xd5]
-// CHECK-ERROR: expected writable system register or pstate
-// CHECK-UNKNOWN: d51b4443 msr S3_3_C4_C4_2, x3
+// CHECK-UNKNOWN: d51b4443 msr FPMR, x3 \ No newline at end of file
diff --git a/llvm/test/MC/AArch64/SVE/condtion-codes.s b/llvm/test/MC/AArch64/SVE/condition-codes.s
index c1d8e2a..c1d8e2a 100644
--- a/llvm/test/MC/AArch64/SVE/condtion-codes.s
+++ b/llvm/test/MC/AArch64/SVE/condition-codes.s
diff --git a/llvm/test/MC/AArch64/SVE/sqdecd-diagnostics.s b/llvm/test/MC/AArch64/SVE/sqdecd-diagnostics.s
index 658af84..96b14b9 100644
--- a/llvm/test/MC/AArch64/SVE/sqdecd-diagnostics.s
+++ b/llvm/test/MC/AArch64/SVE/sqdecd-diagnostics.s
@@ -18,9 +18,9 @@ sqdecd sp
// CHECK-NEXT: sqdecd sp
// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
-uqdecd z0.s
+sqdecd z0.s
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
-// CHECK-NEXT: uqdecd z0.s
+// CHECK-NEXT: sqdecd z0.s
// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SVE/sqincp-diagnostics.s b/llvm/test/MC/AArch64/SVE/sqincp-diagnostics.s
index 2dfd495..862af7c 100644
--- a/llvm/test/MC/AArch64/SVE/sqincp-diagnostics.s
+++ b/llvm/test/MC/AArch64/SVE/sqincp-diagnostics.s
@@ -3,48 +3,48 @@
// ------------------------------------------------------------------------- //
// Invalid result register
-uqdecp sp, p0
+sqincp sp, p0
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand
-// CHECK-NEXT: uqdecp sp, p0
+// CHECK-NEXT: sqincp sp, p0
// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
-uqdecp z0.b, p0
+sqincp z0.b, p0
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
-// CHECK-NEXT: uqdecp z0.b, p0
+// CHECK-NEXT: sqincp z0.b, p0
// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
-uqdecp x0, p0.b, w0
+sqincp w0, p0.b, w0
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand
-// CHECK-NEXT: uqdecp x0, p0.b, w0
+// CHECK-NEXT: sqincp w0, p0.b, w0
// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
-uqdecp x0, p0.b, x1
+sqincp x0, p0.b, x1
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand
-// CHECK-NEXT: uqdecp x0, p0.b, x1
+// CHECK-NEXT: sqincp x0, p0.b, x1
// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
// ------------------------------------------------------------------------- //
// Invalid predicate operand
-uqdecp x0, p0
+sqincp x0, p0
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid predicate register
-// CHECK-NEXT: uqdecp x0, p0
+// CHECK-NEXT: sqincp x0, p0
// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
-uqdecp x0, p0/z
+sqincp x0, p0/z
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid predicate register
-// CHECK-NEXT: uqdecp x0, p0/z
+// CHECK-NEXT: sqincp x0, p0/z
// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
-uqdecp x0, p0/m
+sqincp x0, p0/m
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid predicate register
-// CHECK-NEXT: uqdecp x0, p0/m
+// CHECK-NEXT: sqincp x0, p0/m
// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
-uqdecp x0, p0.q
+sqincp x0, p0.q
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid predicate register
-// CHECK-NEXT: uqdecp x0, p0.q
+// CHECK-NEXT: sqincp x0, p0.q
// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
sqincp z0.d, p0.b
diff --git a/llvm/test/MC/AMDGPU/amd_kernel_code_t.s b/llvm/test/MC/AMDGPU/amd_kernel_code_t.s
new file mode 100644
index 0000000..052ec0b
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/amd_kernel_code_t.s
@@ -0,0 +1,171 @@
+; RUN: llvm-mc -triple=amdgcn-mesa-mesa3d -mcpu=gfx900 -filetype=asm < %s | FileCheck --check-prefix=ASM %s
+; RUN: llvm-mc -triple=amdgcn-mesa-mesa3d -mcpu=gfx900 -filetype=obj < %s > %t
+; RUN: llvm-objdump -s %t | FileCheck --check-prefix=OBJDUMP %s
+
+; OBJDUMP: Contents of section .known_is_dynamic_callstack:
+; OBJDUMP: 0030 00000000 00000000 00001000 00000000
+
+; OBJDUMP: Contents of section .known_wavefront_sgpr_count:
+; OBJDUMP: 0050 00000000 01000000 00000000 00000000
+
+; OBJDUMP: Contents of section .known_workitem_vgpr_count:
+; OBJDUMP: 0050 00000000 00000100 00000000 00000000
+
+; OBJDUMP: Contents of section .known_workitem_private_segment_byte_size:
+; OBJDUMP: 0030 00000000 00000000 00000000 01000000
+
+; OBJDUMP: Contents of section .known_granulated_workitem_vgpr_count:
+; OBJDUMP: 0030 01000000 00000000 00000000 00000000
+
+; OBJDUMP: Contents of section .known_enable_sgpr_workgroup_id_x:
+; OBJDUMP: 0030 00000000 80000000 00000000 00000000
+
+; OBJDUMP: Contents of section .unknown_is_dynamic_callstack:
+; OBJDUMP: 0030 00000000 00000000 00001000 00000000
+
+; OBJDUMP: Contents of section .unknown_wavefront_sgpr_count:
+; OBJDUMP: 0050 00000000 01000000 00000000 00000000
+
+; OBJDUMP: Contents of section .unknown_workitem_vgpr_count:
+; OBJDUMP: 0050 00000000 00000100 00000000 00000000
+
+; OBJDUMP: Contents of section .unknown_workitem_private_segment_byte_size:
+; OBJDUMP: 0030 00000000 00000000 00000000 01000000
+
+; OBJDUMP: Contents of section .unknown_granulated_workitem_vgpr_count:
+; OBJDUMP: 0030 01000000 00000000 00000000 00000000
+
+; OBJDUMP: Contents of section .unknown_enable_sgpr_workgroup_id_x:
+; OBJDUMP: 0030 00000000 80000000 00000000 00000000
+
+.set known, 1
+
+; ASM-LABEL: known_is_dynamic_callstack:
+; ASM: is_dynamic_callstack = 1
+.section .known_is_dynamic_callstack
+known_is_dynamic_callstack:
+ .amd_kernel_code_t
+ is_dynamic_callstack = known
+ .end_amd_kernel_code_t
+ s_endpgm
+
+; ASM-LABEL: known_wavefront_sgpr_count:
+; ASM: wavefront_sgpr_count = 1
+.section .known_wavefront_sgpr_count
+known_wavefront_sgpr_count:
+ .amd_kernel_code_t
+ wavefront_sgpr_count = known
+ .end_amd_kernel_code_t
+ s_endpgm
+
+; ASM-LABEL: known_workitem_vgpr_count:
+; ASM: workitem_vgpr_count = 1
+.section .known_workitem_vgpr_count
+known_workitem_vgpr_count:
+ .amd_kernel_code_t
+ workitem_vgpr_count = known
+ .end_amd_kernel_code_t
+ s_endpgm
+
+; ASM-LABEL: known_workitem_private_segment_byte_size:
+; ASM: workitem_private_segment_byte_size = 1
+.section .known_workitem_private_segment_byte_size
+known_workitem_private_segment_byte_size:
+ .amd_kernel_code_t
+ workitem_private_segment_byte_size = known
+ .end_amd_kernel_code_t
+ s_endpgm
+
+; ASM-LABEL: known_granulated_workitem_vgpr_count:
+; ASM: granulated_workitem_vgpr_count = 1
+.section .known_granulated_workitem_vgpr_count
+known_granulated_workitem_vgpr_count:
+ .amd_kernel_code_t
+ granulated_workitem_vgpr_count = known
+ .end_amd_kernel_code_t
+ s_endpgm
+
+; ASM-LABEL: known_enable_sgpr_workgroup_id_x:
+; ASM: enable_sgpr_workgroup_id_x = 1
+.section .known_enable_sgpr_workgroup_id_x
+known_enable_sgpr_workgroup_id_x:
+ .amd_kernel_code_t
+ enable_sgpr_workgroup_id_x = known
+ .end_amd_kernel_code_t
+ s_endpgm
+
+; ASM-LABEL: unknown_is_dynamic_callstack:
+; ASM: is_dynamic_callstack = unknown
+.section .unknown_is_dynamic_callstack
+unknown_is_dynamic_callstack:
+ .amd_kernel_code_t
+ is_dynamic_callstack = unknown
+ .end_amd_kernel_code_t
+ s_endpgm
+
+; ASM-LABEL: unknown_wavefront_sgpr_count:
+; ASM: wavefront_sgpr_count = unknown
+.section .unknown_wavefront_sgpr_count
+unknown_wavefront_sgpr_count:
+ .amd_kernel_code_t
+ wavefront_sgpr_count = unknown
+ .end_amd_kernel_code_t
+ s_endpgm
+
+; ASM-LABEL: unknown_workitem_vgpr_count:
+; ASM: workitem_vgpr_count = unknown
+.section .unknown_workitem_vgpr_count
+unknown_workitem_vgpr_count:
+ .amd_kernel_code_t
+ workitem_vgpr_count = unknown
+ .end_amd_kernel_code_t
+ s_endpgm
+
+; ASM-LABEL: unknown_workitem_private_segment_byte_size:
+; ASM: workitem_private_segment_byte_size = unknown
+.section .unknown_workitem_private_segment_byte_size
+unknown_workitem_private_segment_byte_size:
+ .amd_kernel_code_t
+ workitem_private_segment_byte_size = unknown
+ .end_amd_kernel_code_t
+ s_endpgm
+
+; ASM-LABEL: unknown_granulated_workitem_vgpr_count:
+; ASM: granulated_workitem_vgpr_count = ((0&4294967232)|(unknown&63))&63
+; ASM: granulated_wavefront_sgpr_count = (((0&4294967232)|(unknown&63))>>6)&15
+; ASM: priority = (((0&4294967232)|(unknown&63))>>10)&3
+; ASM: float_mode = (((0&4294967232)|(unknown&63))>>12)&255
+; ASM: priv = (((0&4294967232)|(unknown&63))>>20)&1
+; ASM: enable_dx10_clamp = (((0&4294967232)|(unknown&63))>>21)&1
+; ASM: debug_mode = (((0&4294967232)|(unknown&63))>>22)&1
+; ASM: enable_ieee_mode = (((0&4294967232)|(unknown&63))>>23)&1
+; ASM: enable_wgp_mode = (((0&4294967232)|(unknown&63))>>29)&1
+; ASM: enable_mem_ordered = (((0&4294967232)|(unknown&63))>>30)&1
+; ASM: enable_fwd_progress = (((0&4294967232)|(unknown&63))>>31)&1
+.section .unknown_granulated_workitem_vgpr_count
+unknown_granulated_workitem_vgpr_count:
+ .amd_kernel_code_t
+ granulated_workitem_vgpr_count = unknown
+ .end_amd_kernel_code_t
+ s_endpgm
+
+; ASM-LABEL: unknown_enable_sgpr_workgroup_id_x:
+; ASM: enable_sgpr_private_segment_wave_byte_offset = ((0&4294967167)|((unknown&1)<<7))&1
+; ASM: user_sgpr_count = (((0&4294967167)|((unknown&1)<<7))>>1)&31
+; ASM: enable_trap_handler = (((0&4294967167)|((unknown&1)<<7))>>6)&1
+; ASM: enable_sgpr_workgroup_id_x = (((0&4294967167)|((unknown&1)<<7))>>7)&1
+; ASM: enable_sgpr_workgroup_id_y = (((0&4294967167)|((unknown&1)<<7))>>8)&1
+; ASM: enable_sgpr_workgroup_id_z = (((0&4294967167)|((unknown&1)<<7))>>9)&1
+; ASM: enable_sgpr_workgroup_info = (((0&4294967167)|((unknown&1)<<7))>>10)&1
+; ASM: enable_vgpr_workitem_id = (((0&4294967167)|((unknown&1)<<7))>>11)&3
+; ASM: enable_exception_msb = (((0&4294967167)|((unknown&1)<<7))>>13)&3
+; ASM: granulated_lds_size = (((0&4294967167)|((unknown&1)<<7))>>15)&511
+; ASM: enable_exception = (((0&4294967167)|((unknown&1)<<7))>>24)&127
+.section .unknown_enable_sgpr_workgroup_id_x
+unknown_enable_sgpr_workgroup_id_x:
+ .amd_kernel_code_t
+ enable_sgpr_workgroup_id_x = unknown
+ .end_amd_kernel_code_t
+ s_endpgm
+
+.set unknown, 1
diff --git a/llvm/test/MC/MachO/darwin-target-variant-reverse.ll b/llvm/test/MC/MachO/darwin-target-variant-reverse.ll
index 6d51cd8..fd527b2 100644
--- a/llvm/test/MC/MachO/darwin-target-variant-reverse.ll
+++ b/llvm/test/MC/MachO/darwin-target-variant-reverse.ll
@@ -3,7 +3,7 @@
target triple = "x86_64-apple-ios13.1-macabi";
!llvm.module.flags = !{!0, !1, !2};
!0 = !{i32 2, !"SDK Version", [2 x i32] [ i32 13, i32 1 ] };
-!1 = !{i32 1, !"darwin.target_variant.triple", !"x86_64-apple-macos10.15"};
+!1 = !{i32 2, !"darwin.target_variant.triple", !"x86_64-apple-macos10.15"};
!2 = !{i32 2, !"darwin.target_variant.SDK Version", [2 x i32] [ i32 10, i32 15 ] };
define void @foo() {
diff --git a/llvm/test/MC/MachO/darwin-target-variant.ll b/llvm/test/MC/MachO/darwin-target-variant.ll
index d506ed9..78bd1e9 100644
--- a/llvm/test/MC/MachO/darwin-target-variant.ll
+++ b/llvm/test/MC/MachO/darwin-target-variant.ll
@@ -4,7 +4,7 @@
target triple = "x86_64-apple-macos10.15";
!llvm.module.flags = !{!0, !1, !2};
!0 = !{i32 2, !"SDK Version", [3 x i32] [ i32 10, i32 15, i32 1 ] };
-!1 = !{i32 1, !"darwin.target_variant.triple", !"x86_64-apple-ios13.1-macabi"};
+!1 = !{i32 2, !"darwin.target_variant.triple", !"x86_64-apple-ios13.1-macabi"};
!2 = !{i32 2, !"darwin.target_variant.SDK Version", [2 x i32] [ i32 13, i32 2 ] };
define void @foo() {
diff --git a/llvm/test/MC/RISCV/attribute-arch.s b/llvm/test/MC/RISCV/attribute-arch.s
index a028d40..0e5eddd 100644
--- a/llvm/test/MC/RISCV/attribute-arch.s
+++ b/llvm/test/MC/RISCV/attribute-arch.s
@@ -397,7 +397,7 @@
# CHECK: attribute 5, "rv32i2p1_xcvbi1p0"
.attribute arch, "rv32i_zicfilp0p4"
-# CHECK: attribute 5, "rv32i2p1_zicfilp0p4"
+# CHECK: attribute 5, "rv32i2p1_zicfilp0p4_zicsr2p0"
.attribute arch, "rv32i_zicfiss0p4"
# CHECK: .attribute 5, "rv32i2p1_zicfiss0p4_zicsr2p0_zimop1p0"
diff --git a/llvm/test/MC/WebAssembly/simd-encodings.s b/llvm/test/MC/WebAssembly/simd-encodings.s
index 57fa71e..d397188 100644
--- a/llvm/test/MC/WebAssembly/simd-encodings.s
+++ b/llvm/test/MC/WebAssembly/simd-encodings.s
@@ -845,4 +845,10 @@ main:
# CHECK: f32.store_f16 32 # encoding: [0xfc,0x31,0x01,0x20]
f32.store_f16 32
+ # CHECK: f16x8.splat # encoding: [0xfd,0xa0,0x02]
+ f16x8.splat
+
+ # CHECK: f16x8.extract_lane 1 # encoding: [0xfd,0xa1,0x02,0x01]
+ f16x8.extract_lane 1
+
end_function
diff --git a/llvm/test/ThinLTO/X86/import_callee_declaration.ll b/llvm/test/ThinLTO/X86/import_callee_declaration.ll
index 43214e3..246920e 100644
--- a/llvm/test/ThinLTO/X86/import_callee_declaration.ll
+++ b/llvm/test/ThinLTO/X86/import_callee_declaration.ll
@@ -15,16 +15,20 @@
; and the other one is larger. Both callees of 'small_func' are defined in lib.ll.
; - Given the import limit, in main's combined summary, the import type of 'small_func'
; and 'small_indirect_callee' will be 'definition', and the import type of
-; 'large_func' and 'large_indirect_callee' will be 'declaration'.
+; large* functions and their aliasees will be 'declaration'.
;
; The test will disassemble combined summaries and check the import type is
; correct. Right now postlink optimizer pipeline doesn't do anything (e.g.,
; import the declaration or de-serialize summary attributes yet) so there is
; nothing to test more than the summary content.
;
+; TODO: Extend this test case to test IR once postlink optimizer makes use of
+; the import type for declarations.
+;
; RUN: llvm-lto2 run \
; RUN: -debug-only=function-import \
; RUN: -import-instr-limit=7 \
+; RUN: -import-instr-evolution-factor=1.0 \
; RUN: -import-declaration \
; RUN: -thinlto-distributed-indexes \
; RUN: -r=main.bc,main,px \
@@ -32,36 +36,45 @@
; RUN: -r=main.bc,large_func, \
; RUN: -r=lib.bc,callee,pl \
; RUN: -r=lib.bc,large_indirect_callee,px \
+; RUN: -r=lib.bc,large_indirect_bar,px \
; RUN: -r=lib.bc,small_func,px \
; RUN: -r=lib.bc,large_func,px \
; RUN: -r=lib.bc,large_indirect_callee_alias,px \
-; RUN: -r=lib.bc,calleeAddrs,px -o summary main.bc lib.bc 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN: -r=lib.bc,large_indirect_bar_alias,px \
+; RUN: -r=lib.bc,calleeAddrs,px -r=lib.bc,calleeAddrs2,px -o summary main.bc lib.bc 2>&1 | FileCheck %s --check-prefix=DUMP
;
-; RUN: llvm-lto -thinlto-action=thinlink -import-declaration -import-instr-limit=7 -o combined.index.bc main.bc lib.bc
-; RUN: llvm-lto -thinlto-action=distributedindexes -debug-only=function-import -import-declaration -import-instr-limit=7 -thinlto-index combined.index.bc main.bc lib.bc 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN: llvm-lto -thinlto-action=thinlink -import-declaration -import-instr-limit=7 -import-instr-evolution-factor=1.0 -o combined.index.bc main.bc lib.bc
+; RUN: llvm-lto -thinlto-action=distributedindexes -debug-only=function-import -import-declaration -import-instr-limit=7 -import-instr-evolution-factor=1.0 -thinlto-index combined.index.bc main.bc lib.bc 2>&1 | FileCheck %s --check-prefix=DUMP
-; DUMP: - 2 function definitions and 3 function declarations imported from lib.bc
+; DUMP: - 2 function definitions and 4 function declarations imported from lib.bc
; First disassemble per-module summary and find out the GUID for {large_func, large_indirect_callee}.
;
; RUN: llvm-dis lib.bc -o - | FileCheck %s --check-prefix=LIB-DIS
-; LIB-DIS: [[LARGEFUNC:\^[0-9]+]] = gv: (name: "large_func", summaries: {{.*}}) ; guid = 2418497564662708935
+; LIB-DIS: module: (path: "lib.bc", hash: (0, 0, 0, 0, 0))
+; LIB-DIS: gv: (name: "large_func", summaries: {{.*}}) ; guid = 2418497564662708935
+; LIB-DIS: gv: (name: "large_indirect_bar_alias", summaries: {{.*}}, aliasee: [[LARGEINDIRECT_BAR:\^[0-9]+]]{{.*}}guid = 13590951773474913315
+; LIB-DIS: [[LARGEINDIRECT_BAR]] = gv: (name: "large_indirect_bar", summaries: {{.*}}) ; guid = 13770917885399536773
; LIB-DIS: [[LARGEINDIRECT:\^[0-9]+]] = gv: (name: "large_indirect_callee", summaries: {{.*}}) ; guid = 14343440786664691134
-; LIB-DIS: [[LARGEINDIRECTALIAS:\^[0-9]+]] = gv: (name: "large_indirect_callee_alias", summaries: {{.*}}, aliasee: [[LARGEINDIRECT]]
+; LIB-DIS: gv: (name: "large_indirect_callee_alias", summaries: {{.*}}, aliasee: [[LARGEINDIRECT]]{{.*}}guid = 16730173943625350469
;
-; Secondly disassemble main's combined summary and test that large callees are
-; not imported as declarations yet.
+; Secondly, disassemble main's combined summary and verify that the import type
+; of these GUIDs is 'declaration'.
;
; RUN: llvm-dis main.bc.thinlto.bc -o - | FileCheck %s --check-prefix=MAIN-DIS
;
; MAIN-DIS: [[LIBMOD:\^[0-9]+]] = module: (path: "lib.bc", hash: (0, 0, 0, 0, 0))
-; MAIN-DIS-NOT: [[LARGEFUNC:\^[0-9]+]] = gv: (guid: 2418497564662708935, summaries: (function: (module: [[LIBMOD]], flags: ({{.*}} importType: declaration), insts: 8, {{.*}})))
-; MAIN-DIS-NOT: [[LARGEINDIRECT:\^[0-9]+]] = gv: (guid: 14343440786664691134, summaries: (function: (module: [[LIBMOD]], flags: ({{.*}} importType: declaration), insts: 8, {{.*}})))
-; MAIN-DIS-NOT: [[LARGEINDIRECTALIAS:\^[0-9]+]] = gv: (guid: 16730173943625350469, summaries: (alias: (module: [[LIBMOD]], flags: ({{.*}} importType: declaration)
+; MAIN-DIS: gv: (guid: 2418497564662708935, summaries: (function: (module: [[LIBMOD]], flags: ({{.*}} importType: declaration), insts: 8, {{.*}})))
+; When an alias is imported as a copy of the aliasee, but the aliasee is not
+; itself being imported, the aliasee should be null.
+; MAIN-DIS: gv: (guid: 13590951773474913315, summaries: (alias: (module: [[LIBMOD]], flags: ({{.*}} importType: declaration), aliasee: null)))
+; MAIN-DIS: [[LARGEINDIRECT:\^[0-9]+]] = gv: (guid: 14343440786664691134, summaries: (function: (module: [[LIBMOD]], flags: ({{.*}} importType: declaration), insts: 8, {{.*}})))
+; MAIN-DIS: gv: (guid: 16730173943625350469, summaries: (alias: (module: [[LIBMOD]], flags: ({{.*}} importType: declaration), aliasee: [[LARGEINDIRECT]])))
; Run in-process ThinLTO and tests that
; 1. `callee` remains internalized even if the symbols of its callers
-; (large_func and large_indirect_callee) are exported as declarations and visible to main module.
+; (large_func, large_indirect_callee, large_indirect_bar) are exported as
+; declarations and visible to main module.
; 2. the debugging logs from `function-import` pass are expected.
; RUN: llvm-lto2 run \
@@ -69,20 +82,21 @@
; RUN: -save-temps \
; RUN: -thinlto-threads=1 \
; RUN: -import-instr-limit=7 \
+; RUN: -import-instr-evolution-factor=1.0 \
; RUN: -import-declaration \
; RUN: -r=main.bc,main,px \
; RUN: -r=main.bc,small_func, \
; RUN: -r=main.bc,large_func, \
; RUN: -r=lib.bc,callee,pl \
; RUN: -r=lib.bc,large_indirect_callee,px \
+; RUN: -r=lib.bc,large_indirect_bar,px \
; RUN: -r=lib.bc,small_func,px \
; RUN: -r=lib.bc,large_func,px \
; RUN: -r=lib.bc,large_indirect_callee_alias,px \
-; RUN: -r=lib.bc,calleeAddrs,px -o in-process main.bc lib.bc 2>&1 | FileCheck %s --check-prefix=IMPORTDUMP
+; RUN: -r=lib.bc,large_indirect_bar_alias,px \
+; RUN: -r=lib.bc,calleeAddrs,px -r=lib.bc,calleeAddrs2,px -o in-process main.bc lib.bc 2>&1 | FileCheck %s --check-prefix=IMPORTDUMP
-; Test import status from debugging logs.
-; TODO: Serialize declaration bit and test declaration bits are correctly set,
-; and extend this test case to test IR once postlink optimizer makes use of
+; TODO: Extend this test case to test IR once postlink optimizer makes use of
; the import type for declarations.
; IMPORTDUMP-DAG: Not importing function 11825436545918268459 callee from lib.cc
; IMPORTDUMP-DAG: Is importing function declaration 14343440786664691134 large_indirect_callee from lib.cc
@@ -91,6 +105,8 @@
; IMPORTDUMP-DAG: Is importing function declaration 2418497564662708935 large_func from lib.cc
; IMPORTDUMP-DAG: Not importing global 7680325410415171624 calleeAddrs from lib.cc
; IMPORTDUMP-DAG: Is importing alias declaration 16730173943625350469 large_indirect_callee_alias from lib.cc
+; IMPORTDUMP-DAG: Is importing alias declaration 13590951773474913315 large_indirect_bar_alias from lib.cc
+; IMPORTDUMP-DAG: Not importing function 13770917885399536773 large_indirect_bar
; RUN: llvm-dis in-process.1.3.import.bc -o - | FileCheck %s --check-prefix=IMPORT
@@ -101,6 +117,8 @@
; IMPORT-DAG: declare void @large_func
; IMPORT-NOT: large_indirect_callee
; IMPORT-NOT: large_indirect_callee_alias
+; IMPORT-NOT: large_indirect_bar
+; IMPORT-NOT: large_indirect_bar_alias
; INTERNALIZE: define internal void @callee()
@@ -124,8 +142,13 @@ source_filename = "lib.cc"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
+; Both large_indirect_callee and large_indirect_callee_alias are referenced
+; and visible to main.ll.
@calleeAddrs = global [3 x ptr] [ptr @large_indirect_callee, ptr @small_indirect_callee, ptr @large_indirect_callee_alias]
+; large_indirect_bar_alias is visible to main.ll but its aliasee isn't.
+@calleeAddrs2 = global [1 x ptr] [ptr @large_indirect_bar_alias]
+
define void @callee() #1 {
ret void
}
@@ -141,12 +164,28 @@ define void @large_indirect_callee()#2 {
ret void
}
+define void @large_indirect_bar()#2 {
+ call void @callee()
+ call void @callee()
+ call void @callee()
+ call void @callee()
+ call void @callee()
+ call void @callee()
+ call void @callee()
+ ret void
+}
+
define internal void @small_indirect_callee() #0 {
+entry:
+ %0 = load ptr, ptr @calleeAddrs2
+ call void %0(), !prof !3
ret void
}
@large_indirect_callee_alias = alias void(), ptr @large_indirect_callee
+@large_indirect_bar_alias = alias void(), ptr @large_indirect_bar
+
define void @small_func() {
entry:
%0 = load ptr, ptr @calleeAddrs
@@ -179,3 +218,4 @@ attributes #2 = { norecurse }
!0 = !{!"VP", i32 0, i64 1, i64 14343440786664691134, i64 1}
!1 = !{!"VP", i32 0, i64 1, i64 13568239288960714650, i64 1}
!2 = !{!"VP", i32 0, i64 1, i64 16730173943625350469, i64 1}
+!3 = !{!"VP", i32 0, i64 1, i64 13590951773474913315, i64 1}
diff --git a/llvm/test/ThinLTO/X86/memprof-tailcall-nonunique.ll b/llvm/test/ThinLTO/X86/memprof-tailcall-nonunique.ll
index d7cfafe..49c22bf 100644
--- a/llvm/test/ThinLTO/X86/memprof-tailcall-nonunique.ll
+++ b/llvm/test/ThinLTO/X86/memprof-tailcall-nonunique.ll
@@ -14,10 +14,11 @@
; RUN: -r=%t.o,_Z4baz1v,plx \
; RUN: -r=%t.o,_Z4baz2v,plx \
; RUN: -r=%t.o,_Z3foob,plx \
+; RUN: -r=%t.o,xyz,plx \
; RUN: -r=%t.o,main,plx \
; RUN: -r=%t.o,_Znam, \
; RUN: -stats -debug -save-temps \
-; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=STATS
+; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=STATS --check-prefix=DEBUG
; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR
@@ -31,22 +32,20 @@
; RUN: -r=%t.o,_Z4baz1v,plx \
; RUN: -r=%t.o,_Z4baz2v,plx \
; RUN: -r=%t.o,_Z3foob,plx \
+; RUN: -r=%t.o,xyz,plx \
; RUN: -r=%t.o,main,plx \
; RUN: -r=%t.o,_Znam, \
; RUN: -stats -debug \
-; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=STATS
+; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=STATS --check-prefix=DEBUG
;; Run ThinLTO backend
; RUN: opt -passes=memprof-context-disambiguation \
; RUN: -memprof-import-summary=%t.o.thinlto.bc \
; RUN: -stats %t.o -S 2>&1 | FileCheck %s --check-prefix=IR
-; DEBUG: Not found through unique tail call chain: _Z3barv from main that actually called _Z3foob (found multiple possible chains)
-; DEBUG: Not found through unique tail call chain: _Z3barv from main that actually called _Z3foob (found multiple possible chains)
-; DEBUG: Not found through unique tail call chain: _Z3barv from main that actually called _Z3foob (found multiple possible chains)
-; DEBUG: Not found through unique tail call chain: _Z3barv from main that actually called _Z3foob (found multiple possible chains)
+; DEBUG: Not found through unique tail call chain: 17377440600225628772 (_Z3barv) from 15822663052811949562 (main) that actually called 8716735811002003409 (xyz) (found multiple possible chains)
-; STATS: 4 memprof-context-disambiguation - Number of profiled callees found via multiple tail call chains
+; STATS: 1 memprof-context-disambiguation - Number of profiled callees found via multiple tail call chains
;; Check that all calls in the IR are to the original functions, leading to a
;; non-cold operator new call.
@@ -125,17 +124,24 @@ return: ; preds = %if.else, %if.then
}
; Function Attrs: noinline
-; IR-LABEL: @main()
-define dso_local i32 @main() local_unnamed_addr #0 {
+; IR-LABEL: @xyz()
+define dso_local i32 @xyz() local_unnamed_addr #0 {
delete.end13:
; IR: call ptr @_Z3foob(i1 true)
- %call = tail call ptr @_Z3foob(i1 true), !callsite !10
+ %call = tail call ptr @_Z3foob(i1 true)
; IR: call ptr @_Z3foob(i1 true)
- %call1 = tail call ptr @_Z3foob(i1 true), !callsite !11
+ %call1 = tail call ptr @_Z3foob(i1 true)
; IR: call ptr @_Z3foob(i1 false)
- %call2 = tail call ptr @_Z3foob(i1 false), !callsite !12
+ %call2 = tail call ptr @_Z3foob(i1 false)
; IR: call ptr @_Z3foob(i1 false)
- %call3 = tail call ptr @_Z3foob(i1 false), !callsite !13
+ %call3 = tail call ptr @_Z3foob(i1 false)
+ ret i32 0
+}
+
+define dso_local i32 @main() local_unnamed_addr #0 {
+delete.end13:
+ ; IR: call i32 @xyz()
+ %call1 = tail call i32 @xyz(), !callsite !11
ret i32 0
}
@@ -145,17 +151,10 @@ attributes #0 = { noinline }
attributes #1 = { nobuiltin allocsize(0) }
attributes #2 = { builtin allocsize(0) }
-!0 = !{!1, !3, !5, !7}
-!1 = !{!2, !"notcold"}
-!2 = !{i64 3186456655321080972, i64 6307901912192269588}
-!3 = !{!4, !"cold"}
-!4 = !{i64 3186456655321080972, i64 6792096022461663180}
+!0 = !{!5, !7}
!5 = !{!6, !"notcold"}
!6 = !{i64 3186456655321080972, i64 8632435727821051414}
!7 = !{!8, !"cold"}
!8 = !{i64 3186456655321080972, i64 -3421689549917153178}
!9 = !{i64 3186456655321080972}
-!10 = !{i64 8632435727821051414}
!11 = !{i64 -3421689549917153178}
-!12 = !{i64 6307901912192269588}
-!13 = !{i64 6792096022461663180}
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll
index ce8524c..0acb8f8 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll
@@ -176,9 +176,9 @@ define i16 @test_atomicrmw_and_i16_global_agent_align4(ptr addrspace(1) %ptr, i1
ret i16 %res
}
-; Preserve unknown metadata
-define i16 @test_atomicrmw_and_i16_global_agent_preserve_md(ptr addrspace(1) %ptr, i16 %value) {
-; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_preserve_md(
+; Drop unknown metadata and noundef
+define i16 @test_atomicrmw_and_i16_global_agent_drop_md(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_drop_md(
; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
@@ -198,9 +198,9 @@ define i16 @test_atomicrmw_and_i16_global_agent_preserve_md(ptr addrspace(1) %pt
ret i16 %res
}
-; Preserve unknown metadata
-define i16 @test_atomicrmw_and_i16_global_agent_align4_preserve_md(ptr addrspace(1) %ptr, i16 %value) {
-; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_align4_preserve_md(
+; Drop unknown metadata
+define i16 @test_atomicrmw_and_i16_global_agent_align4_drop_md(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_align4_drop_md(
; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32
; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536
; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4
@@ -211,6 +211,89 @@ define i16 @test_atomicrmw_and_i16_global_agent_align4_preserve_md(ptr addrspace
ret i16 %res
}
+; Drop noundef, preserve mmra
+define i16 @test_atomicrmw_and_i16_global_agent_preserve_mmra(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_preserve_mmra(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32
+; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]]
+; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[VALOPERAND_SHIFTED]], [[INV_MASK]]
+; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !mmra [[META0:![0-9]+]]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED]]
+;
+ %res = atomicrmw and ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, !noundef !0, !mmra !1
+ ret i16 %res
+}
+
+; Drop noundef, preserve mmra
+define i16 @test_atomicrmw_and_i16_global_agent_align4_preserve_mmra(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_align4_preserve_mmra(
+; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32
+; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536
+; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !mmra [[META0]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED]]
+;
+ %res = atomicrmw and ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, align 4, !noundef !0, !mmra !1
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_and_i16_global_agent_align4_preserve_alias_scope(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_align4_preserve_alias_scope(
+; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32
+; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536
+; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !alias.scope [[META1:![0-9]+]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED]]
+;
+ %res = atomicrmw and ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, align 4, !alias.scope !2
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_and_i16_global_agent_align4_preserve_noalias(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_align4_preserve_noalias(
+; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32
+; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536
+; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !noalias [[META1]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED]]
+;
+ %res = atomicrmw and ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, align 4, !noalias !2
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_and_i16_global_agent_align4_preserve_tbaa_struct(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_align4_preserve_tbaa_struct(
+; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32
+; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536
+; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !tbaa.struct [[TBAA_STRUCT4:![0-9]+]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED]]
+;
+ %res = atomicrmw and ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, align 4, !tbaa.struct !5
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_and_i16_global_agent_align4_preserve_tbaa(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_align4_preserve_tbaa(
+; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32
+; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536
+; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !tbaa [[TBAA5:![0-9]+]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED]]
+;
+ %res = atomicrmw and ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, align 4, !tbaa !6
+ ret i16 %res
+}
+
define i16 @test_atomicrmw_and_i16_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i16 %value) {
; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent__amdgpu_no_remote_memory(
; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
@@ -223,7 +306,7 @@ define i16 @test_atomicrmw_and_i16_global_agent__amdgpu_no_remote_memory(ptr add
; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32
; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]]
; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[VALOPERAND_SHIFTED]], [[INV_MASK]]
-; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4
+; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META8:![0-9]+]]
; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]]
; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
; CHECK-NEXT: ret i16 [[EXTRACTED]]
@@ -236,7 +319,7 @@ define i16 @test_atomicrmw_and_i16_global_agent_align4__amdgpu_no_remote_memory(
; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_align4__amdgpu_no_remote_memory(
; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32
; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536
-; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4
+; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META8]]
; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16
; CHECK-NEXT: ret i16 [[EXTRACTED]]
;
@@ -256,7 +339,7 @@ define i16 @test_atomicrmw_and_i16_global_agent__amdgpu_no_fine_grained_memory(p
; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32
; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]]
; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[VALOPERAND_SHIFTED]], [[INV_MASK]]
-; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4
+; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META8]]
; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]]
; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
; CHECK-NEXT: ret i16 [[EXTRACTED]]
@@ -269,7 +352,7 @@ define i16 @test_atomicrmw_and_i16_global_agent_align4__amdgpu_no_fine_grained_m
; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_align4__amdgpu_no_fine_grained_memory(
; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32
; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536
-; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4
+; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META8]]
; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16
; CHECK-NEXT: ret i16 [[EXTRACTED]]
;
@@ -1180,6 +1263,15 @@ define bfloat @test_atomicrmw_xchg_bf16_global_agent_align4(ptr addrspace(1) %pt
}
!0 = !{}
+!1 = !{!"foo", !"bar"}
+!2 = !{!3}
+!3 = distinct !{!3, !4}
+!4 = distinct !{!4}
+!5 = !{i64 0, i64 4, !1, i64 8, i64 4}
+!6 = !{!7, !7, i64 0}
+!7 = !{!"omnipotent char", !8, i64 0}
+!8 = !{!"Simple C/C++ TBAA"}
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; BASE: {{.*}}
; GCN: {{.*}}
diff --git a/llvm/test/Transforms/Attributor/issue87856.ll b/llvm/test/Transforms/Attributor/issue87856.ll
new file mode 100644
index 0000000..4da29cc
--- /dev/null
+++ b/llvm/test/Transforms/Attributor/issue87856.ll
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4
+; RUN: opt -S -passes=attributor < %s | FileCheck %s
+
+define void @null_ptr_is_valid_call_with_null() #0 {
+; CHECK-LABEL: define void @null_ptr_is_valid_call_with_null(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: call void @store_as0(ptr nofree noundef writeonly align 4294967296 null) #[[ATTR4:[0-9]+]]
+; CHECK-NEXT: ret void
+;
+ call void @store_as0(ptr null)
+ ret void
+}
+
+define void @null_ptr_is_valid_call_with_undef() #0 {
+; CHECK-LABEL: define void @null_ptr_is_valid_call_with_undef(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: call void @store_as0(ptr undef) #[[ATTR4]]
+; CHECK-NEXT: ret void
+;
+ call void @store_as0(ptr undef)
+ ret void
+}
+
+define void @store_as0(ptr %0) {
+; CHECK-LABEL: define void @store_as0(
+; CHECK-SAME: ptr nocapture nofree noundef nonnull writeonly align 2 dereferenceable(2) [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT: store i16 0, ptr [[TMP0]], align 2
+; CHECK-NEXT: ret void
+;
+ store i16 0, ptr %0, align 2
+ ret void
+}
+
+define void @call_store_as1() {
+; CHECK-LABEL: define void @call_store_as1(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
+; CHECK-NEXT: call void @store_as1(ptr addrspace(1) nocapture nofree noundef writeonly align 4294967296 null) #[[ATTR4]]
+; CHECK-NEXT: ret void
+;
+ call void @store_as1(ptr addrspace(1) null)
+ ret void
+}
+
+define void @store_as1(ptr addrspace(1) %arg) {
+; CHECK-LABEL: define void @store_as1(
+; CHECK-SAME: ptr addrspace(1) nocapture nofree noundef writeonly align 2 dereferenceable_or_null(2) [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: store i16 0, ptr addrspace(1) [[ARG]], align 2
+; CHECK-NEXT: ret void
+;
+ store i16 0, ptr addrspace(1) %arg, align 2
+ ret void
+}
+
+attributes #0 = { null_pointer_is_valid }
+;.
+; CHECK: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(write) }
+; CHECK: attributes #[[ATTR1]] = { mustprogress nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(none) }
+; CHECK: attributes #[[ATTR2]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) }
+; CHECK: attributes #[[ATTR3]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR4]] = { nofree nosync nounwind willreturn memory(write) }
+;.
diff --git a/llvm/test/Transforms/ConstraintElimination/sext-unsigned-predicates.ll b/llvm/test/Transforms/ConstraintElimination/sext-unsigned-predicates.ll
index ac3e577..00dc48e 100644
--- a/llvm/test/Transforms/ConstraintElimination/sext-unsigned-predicates.ll
+++ b/llvm/test/Transforms/ConstraintElimination/sext-unsigned-predicates.ll
@@ -13,7 +13,8 @@ define void @uge_sext(i16 %x, i32 %y) {
; CHECK-NEXT: [[AND:%.*]] = and i1 [[C_1]], [[C_2]]
; CHECK-NEXT: br i1 [[AND]], label [[BB1:%.*]], label [[BB2:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: call void @use(i1 true)
+; CHECK-NEXT: [[T_1:%.*]] = icmp uge i32 [[X_EXT]], [[Y]]
+; CHECK-NEXT: call void @use(i1 [[T_1]])
; CHECK-NEXT: [[C_3:%.*]] = icmp uge i16 [[X]], -10
; CHECK-NEXT: call void @use(i1 [[C_3]])
; CHECK-NEXT: [[C_4:%.*]] = icmp uge i32 [[X_EXT]], -9
@@ -65,8 +66,7 @@ define void @uge_sext_known_positive(i16 %x, i32 %y) {
; CHECK-NEXT: br i1 [[AND]], label [[BB1:%.*]], label [[BB2:%.*]]
; CHECK: bb1:
; CHECK-NEXT: call void @use(i1 true)
-; CHECK-NEXT: [[T_2:%.*]] = icmp uge i16 [[X]], 10
-; CHECK-NEXT: call void @use(i1 [[T_2]])
+; CHECK-NEXT: call void @use(i1 true)
; CHECK-NEXT: [[C_3:%.*]] = icmp uge i32 [[X_EXT]], 11
; CHECK-NEXT: call void @use(i1 [[C_3]])
; CHECK-NEXT: [[C_4:%.*]] = icmp uge i32 [[X_EXT]], 11
diff --git a/llvm/test/Transforms/ConstraintElimination/transfer-signed-facts-to-unsigned.ll b/llvm/test/Transforms/ConstraintElimination/transfer-signed-facts-to-unsigned.ll
index 2fe9262..68e48c7 100644
--- a/llvm/test/Transforms/ConstraintElimination/transfer-signed-facts-to-unsigned.ll
+++ b/llvm/test/Transforms/ConstraintElimination/transfer-signed-facts-to-unsigned.ll
@@ -503,11 +503,9 @@ define i32 @sge_2_gep(i32 %idx, ptr %src, i32 %idx.2) {
; CHECK-NEXT: [[CMP:%.*]] = icmp sge i32 [[IDX]], 2
; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IDX_2:%.*]]
-; CHECK-NEXT: [[T_1:%.*]] = icmp ult ptr [[SRC]], [[ADD_PTR]]
; CHECK-NEXT: [[C_1:%.*]] = icmp ult ptr [[SRC]], [[ADD_PTR_2]]
-; CHECK-NEXT: [[X_1:%.*]] = xor i1 [[T_1]], [[C_1]]
-; CHECK-NEXT: [[F_1:%.*]] = icmp uge ptr [[SRC]], [[ADD_PTR]]
-; CHECK-NEXT: [[X_2:%.*]] = xor i1 [[X_1]], [[F_1]]
+; CHECK-NEXT: [[X_1:%.*]] = xor i1 true, [[C_1]]
+; CHECK-NEXT: [[X_2:%.*]] = xor i1 [[X_1]], false
; CHECK-NEXT: br i1 [[X_2]], label [[THEN:%.*]], label [[ELSE:%.*]]
; CHECK: then:
; CHECK-NEXT: ret i32 0
diff --git a/llvm/test/Transforms/Coroutines/coro-await-suspend-handle-in-ramp.ll b/llvm/test/Transforms/Coroutines/coro-await-suspend-handle-in-ramp.ll
new file mode 100644
index 0000000..ee64ce6
--- /dev/null
+++ b/llvm/test/Transforms/Coroutines/coro-await-suspend-handle-in-ramp.ll
@@ -0,0 +1,59 @@
+; Tests lowerings of different versions of coro.await.suspend
+; RUN: opt < %s -passes='module(coro-early),cgscc(coro-split),simplifycfg' -S | FileCheck %s
+
+%Awaiter = type {}
+
+define void @f() presplitcoroutine {
+entry:
+ %awaiter = alloca %Awaiter
+ %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null)
+ %size = call i32 @llvm.coro.size.i32()
+ %alloc = call ptr @malloc(i32 %size)
+ %hdl = call ptr @llvm.coro.begin(token %id, ptr %alloc)
+ call void @llvm.coro.await.suspend.handle(ptr %awaiter, ptr %hdl, ptr @await_suspend_wrapper_handle)
+ %suspend.init = call i8 @llvm.coro.suspend(token none, i1 false)
+ switch i8 %suspend.init, label %ret [
+ i8 0, label %step
+ i8 1, label %cleanup
+ ]
+
+; Check that the calling convention for the resuming function is fastcc
+; CHECK: define {{[^@]*}} @f()
+; CHECK: entry:
+; CHECK: %[[NEXT_HDL:.+]] = call ptr @await_suspend_wrapper_handle(
+; CHECK-NEXT: %[[CONT:.+]] = call ptr @llvm.coro.subfn.addr(ptr %[[NEXT_HDL]], i8 0)
+; CHECK-NEXT: call fastcc void %[[CONT]](ptr %[[NEXT_HDL]])
+step:
+ br label %cleanup
+
+cleanup:
+ %mem = call ptr @llvm.coro.free(token %id, ptr %hdl)
+ call void @free(ptr %mem)
+ br label %ret
+
+ret:
+ call i1 @llvm.coro.end(ptr %hdl, i1 0, token none)
+ ret void
+}
+
+; Check that we haven't accidentally gone outside the body of @f
+; CHECK-LABEL: @f.resume(
+; CHECK-LABEL: @f.destroy(
+; CHECK-LABEL: @f.cleanup(
+
+declare ptr @await_suspend_wrapper_handle(ptr, ptr)
+
+declare ptr @llvm.coro.free(token, ptr)
+declare i32 @llvm.coro.size.i32()
+declare i8 @llvm.coro.suspend(token, i1)
+declare void @llvm.coro.resume(ptr)
+declare void @llvm.coro.destroy(ptr)
+
+declare token @llvm.coro.id(i32, ptr, ptr, ptr)
+declare i1 @llvm.coro.alloc(token)
+declare ptr @llvm.coro.begin(token, ptr)
+declare void @llvm.coro.await.suspend.handle(ptr, ptr, ptr)
+declare i1 @llvm.coro.end(ptr, i1, token)
+
+declare noalias ptr @malloc(i32)
+declare void @free(ptr)
diff --git a/llvm/test/Transforms/Coroutines/coro-debug-frame-variable-O1.ll b/llvm/test/Transforms/Coroutines/coro-debug-frame-variable-inlined.ll
index acd6a08..ff070d9 100644
--- a/llvm/test/Transforms/Coroutines/coro-debug-frame-variable-O1.ll
+++ b/llvm/test/Transforms/Coroutines/coro-debug-frame-variable-inlined.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -passes='module(coro-early),cgscc(inline,coro-split<reuse-storage>)' -S | FileCheck %s
-; RUN: opt --try-experimental-debuginfo-iterators < %s -passes='module(coro-early),cgscc(inline,coro-split<reuse-storage>)' -S | FileCheck %s
+; RUN: opt < %s -passes='module(coro-early),cgscc(inline,coro-split)' -S | FileCheck %s
+; RUN: opt --try-experimental-debuginfo-iterators < %s -passes='module(coro-early),cgscc(inline,coro-split)' -S | FileCheck %s
; Simplified version from pr#75104.
; Make sure we do not update the debug location for hoisted dbg.declare intrinsics when optimizing the coro frame.
diff --git a/llvm/test/Transforms/Coroutines/coro-lifetime-end.ll b/llvm/test/Transforms/Coroutines/coro-lifetime-end.ll
new file mode 100644
index 0000000..330c613
--- /dev/null
+++ b/llvm/test/Transforms/Coroutines/coro-lifetime-end.ll
@@ -0,0 +1,142 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
+
+declare ptr @malloc(i64)
+
+%i8.array = type { [100 x i8] }
+declare void @consume.i8.array(ptr)
+
+@testbool = external local_unnamed_addr global i8, align 1
+
+; testval does not contain an explicit lifetime end. We must assume that it may
+; live across suspension.
+define void @HasNoLifetimeEnd() presplitcoroutine {
+; CHECK-LABEL: define void @HasNoLifetimeEnd() {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ID:%.*]] = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr @HasNoLifetimeEnd.resumers)
+; CHECK-NEXT: [[ALLOC:%.*]] = call ptr @malloc(i64 16)
+; CHECK-NEXT: [[VFRAME:%.*]] = call noalias nonnull ptr @llvm.coro.begin(token [[ID]], ptr [[ALLOC]])
+; CHECK-NEXT: store ptr @HasNoLifetimeEnd.resume, ptr [[VFRAME]], align 8
+; CHECK-NEXT: [[DESTROY_ADDR:%.*]] = getelementptr inbounds [[HASNOLIFETIMEEND_FRAME:%.*]], ptr [[VFRAME]], i32 0, i32 1
+; CHECK-NEXT: store ptr @HasNoLifetimeEnd.destroy, ptr [[DESTROY_ADDR]], align 8
+; CHECK-NEXT: [[INDEX_ADDR1:%.*]] = getelementptr inbounds [[HASNOLIFETIMEEND_FRAME]], ptr [[VFRAME]], i32 0, i32 2
+; CHECK-NEXT: call void @consume.i8.array(ptr [[INDEX_ADDR1]])
+; CHECK-NEXT: [[INDEX_ADDR2:%.*]] = getelementptr inbounds [[HASNOLIFETIMEEND_FRAME]], ptr [[VFRAME]], i32 0, i32 3
+; CHECK-NEXT: store i1 false, ptr [[INDEX_ADDR2]], align 1
+; CHECK-NEXT: ret void
+;
+entry:
+ %testval = alloca %i8.array
+ %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null)
+ %alloc = call ptr @malloc(i64 16) #3
+ %vFrame = call noalias nonnull ptr @llvm.coro.begin(token %id, ptr %alloc)
+
+ call void @llvm.lifetime.start.p0(i64 100, ptr %testval)
+ call void @consume.i8.array(ptr %testval)
+
+ %save = call token @llvm.coro.save(ptr null)
+ %suspend = call i8 @llvm.coro.suspend(token %save, i1 false)
+ switch i8 %suspend, label %exit [
+ i8 0, label %await.ready
+ i8 1, label %exit
+ ]
+await.ready:
+ br label %exit
+exit:
+ call i1 @llvm.coro.end(ptr null, i1 false, token none)
+ ret void
+}
+
+define void @LifetimeEndAfterCoroEnd() presplitcoroutine {
+; CHECK-LABEL: define void @LifetimeEndAfterCoroEnd() {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ID:%.*]] = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr @LifetimeEndAfterCoroEnd.resumers)
+; CHECK-NEXT: [[ALLOC:%.*]] = call ptr @malloc(i64 16)
+; CHECK-NEXT: [[VFRAME:%.*]] = call noalias nonnull ptr @llvm.coro.begin(token [[ID]], ptr [[ALLOC]])
+; CHECK-NEXT: store ptr @LifetimeEndAfterCoroEnd.resume, ptr [[VFRAME]], align 8
+; CHECK-NEXT: [[DESTROY_ADDR:%.*]] = getelementptr inbounds [[LIFETIMEENDAFTERCOROEND_FRAME:%.*]], ptr [[VFRAME]], i32 0, i32 1
+; CHECK-NEXT: store ptr @LifetimeEndAfterCoroEnd.destroy, ptr [[DESTROY_ADDR]], align 8
+; CHECK-NEXT: [[INDEX_ADDR1:%.*]] = getelementptr inbounds [[LIFETIMEENDAFTERCOROEND_FRAME]], ptr [[VFRAME]], i32 0, i32 2
+; CHECK-NEXT: call void @consume.i8.array(ptr [[INDEX_ADDR1]])
+; CHECK-NEXT: [[INDEX_ADDR2:%.*]] = getelementptr inbounds [[LIFETIMEENDAFTERCOROEND_FRAME]], ptr [[VFRAME]], i32 0, i32 3
+; CHECK-NEXT: store i1 false, ptr [[INDEX_ADDR2]], align 1
+; CHECK-NEXT: ret void
+;
+entry:
+ %testval = alloca %i8.array
+ %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null)
+ %alloc = call ptr @malloc(i64 16) #3
+ %vFrame = call noalias nonnull ptr @llvm.coro.begin(token %id, ptr %alloc)
+
+ call void @llvm.lifetime.start.p0(i64 100, ptr %testval)
+ call void @consume.i8.array(ptr %testval)
+
+ %save = call token @llvm.coro.save(ptr null)
+ %suspend = call i8 @llvm.coro.suspend(token %save, i1 false)
+ switch i8 %suspend, label %exit [
+ i8 0, label %await.ready
+ i8 1, label %exit
+ ]
+await.ready:
+ br label %exit
+exit:
+ call i1 @llvm.coro.end(ptr null, i1 false, token none)
+ call void @llvm.lifetime.end.p0(i64 100, ptr %testval)
+ ret void
+}
+
+define void @BranchWithoutLifetimeEnd() presplitcoroutine {
+; CHECK-LABEL: define void @BranchWithoutLifetimeEnd() {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ID:%.*]] = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr @BranchWithoutLifetimeEnd.resumers)
+; CHECK-NEXT: [[ALLOC:%.*]] = call ptr @malloc(i64 16)
+; CHECK-NEXT: [[VFRAME:%.*]] = call noalias nonnull ptr @llvm.coro.begin(token [[ID]], ptr [[ALLOC]])
+; CHECK-NEXT: store ptr @BranchWithoutLifetimeEnd.resume, ptr [[VFRAME]], align 8
+; CHECK-NEXT: [[DESTROY_ADDR:%.*]] = getelementptr inbounds [[BRANCHWITHOUTLIFETIMEEND_FRAME:%.*]], ptr [[VFRAME]], i32 0, i32 1
+; CHECK-NEXT: store ptr @BranchWithoutLifetimeEnd.destroy, ptr [[DESTROY_ADDR]], align 8
+; CHECK-NEXT: [[TESTVAL:%.*]] = getelementptr inbounds [[BRANCHWITHOUTLIFETIMEEND_FRAME]], ptr [[VFRAME]], i32 0, i32 2
+; CHECK-NEXT: call void @consume.i8.array(ptr [[TESTVAL]])
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr @testbool, align 1
+; CHECK-NEXT: [[INDEX_ADDR1:%.*]] = getelementptr inbounds [[BRANCHWITHOUTLIFETIMEEND_FRAME]], ptr [[VFRAME]], i32 0, i32 3
+; CHECK-NEXT: store i1 false, ptr [[INDEX_ADDR1]], align 1
+; CHECK-NEXT: ret void
+;
+entry:
+ %testval = alloca %i8.array
+ %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null)
+ %alloc = call ptr @malloc(i64 16) #3
+ %vFrame = call noalias nonnull ptr @llvm.coro.begin(token %id, ptr %alloc)
+
+ call void @llvm.lifetime.start.p0(i64 100, ptr %testval)
+ call void @consume.i8.array(ptr %testval)
+
+ %0 = load i8, ptr @testbool, align 1
+ %tobool = trunc nuw i8 %0 to i1
+ br i1 %tobool, label %if.then, label %if.end
+
+if.then:
+ call void @llvm.lifetime.end.p0(i64 100, ptr %testval)
+ br label %if.end
+
+if.end:
+ %save = call token @llvm.coro.save(ptr null)
+ %suspend = call i8 @llvm.coro.suspend(token %save, i1 false)
+ switch i8 %suspend, label %exit [
+ i8 0, label %await.ready
+ i8 1, label %exit
+ ]
+await.ready:
+ br label %exit
+exit:
+ call i1 @llvm.coro.end(ptr null, i1 false, token none)
+ ret void
+}
+
+
+declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr)
+declare ptr @llvm.coro.begin(token, ptr writeonly) #3
+declare ptr @llvm.coro.frame() #5
+declare i8 @llvm.coro.suspend(token, i1) #3
+declare i1 @llvm.coro.end(ptr, i1, token) #3
+declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #4
+declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #4
diff --git a/llvm/test/Transforms/Coroutines/no-suspend.ll b/llvm/test/Transforms/Coroutines/no-suspend.ll
index 53eb98f..fd8c5ac9 100644
--- a/llvm/test/Transforms/Coroutines/no-suspend.ll
+++ b/llvm/test/Transforms/Coroutines/no-suspend.ll
@@ -325,7 +325,7 @@ body:
%save = call token @llvm.coro.save(ptr %hdl)
%subfn = call ptr @llvm.coro.subfn.addr(ptr %hdl, i8 1)
call fastcc void %subfn(ptr %hdl)
- ; memcpy separates destory from suspend, therefore cannot simplify.
+ ; memcpy separates destroy from suspend, therefore cannot simplify.
call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 1, i1 false)
%0 = call i8 @llvm.coro.suspend(token %save, i1 false)
switch i8 %0, label %suspend [i8 0, label %resume
diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/mul.ll b/llvm/test/Transforms/CorrelatedValuePropagation/mul.ll
index b28107e..086043d 100644
--- a/llvm/test/Transforms/CorrelatedValuePropagation/mul.ll
+++ b/llvm/test/Transforms/CorrelatedValuePropagation/mul.ll
@@ -179,8 +179,7 @@ define i1 @nuw_range1(i8 %b) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[C:%.*]] = add nuw nsw i8 [[B:%.*]], 1
; CHECK-NEXT: [[MUL:%.*]] = mul nuw i8 [[C]], 4
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[MUL]], 0
-; CHECK-NEXT: ret i1 [[CMP]]
+; CHECK-NEXT: ret i1 false
;
entry:
%c = add nuw nsw i8 %b, 1
@@ -194,8 +193,7 @@ define i1 @nuw_range2(i8 %b) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[C:%.*]] = add nuw nsw i8 [[B:%.*]], 3
; CHECK-NEXT: [[MUL:%.*]] = mul nuw i8 [[C]], 4
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[MUL]], 2
-; CHECK-NEXT: ret i1 [[CMP]]
+; CHECK-NEXT: ret i1 false
;
entry:
%c = add nuw nsw i8 %b, 3
diff --git a/llvm/test/Transforms/EntryExitInstrumenter/mcount-aix.ll b/llvm/test/Transforms/EntryExitInstrumenter/mcount-aix.ll
new file mode 100644
index 0000000..82551f01
--- /dev/null
+++ b/llvm/test/Transforms/EntryExitInstrumenter/mcount-aix.ll
@@ -0,0 +1,12 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="function(ee-instrument),cgscc(inline),function(ee-instrument<post-inline>)" -S < %s | FileCheck %s
+
+target triple = "powerpc-ibm-aix7.2.0.0"
+
+define void @f1() "instrument-function-entry-inlined"="__mcount" {
+; CHECK-LABEL: define void @f1() {
+; CHECK-NEXT: call void @__mcount(ptr @[[GLOB0:[0-9]+]])
+; CHECK-NEXT: ret void
+;
+ ret void
+}
diff --git a/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll b/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll
index c444b06..bd5f4c2 100644
--- a/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll
+++ b/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -passes="function(ee-instrument),cgscc(inline),function(ee-instrument<post-inline>)" -S < %s | FileCheck %s
; Running the passes twice should not result in more instrumentation.
@@ -7,104 +8,126 @@ target datalayout = "E-m:e-i64:64-n32:64"
target triple = "powerpc64le-unknown-linux"
define void @leaf_function() #0 {
-entry:
+; CHECK-LABEL: define void @leaf_function() {
+; CHECK-NEXT: call void @mcount()
+; CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: call void @__cyg_profile_func_enter(ptr @leaf_function, ptr [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: call void @__cyg_profile_func_exit(ptr @leaf_function, ptr [[TMP2]])
+; CHECK-NEXT: ret void
+;
ret void
-
-; CHECK-LABEL: define void @leaf_function()
-; CHECK: entry:
-; CHECK-NEXT: call void @mcount()
-; CHECK-NEXT: %0 = call ptr @llvm.returnaddress(i32 0)
-; CHECK-NEXT: call void @__cyg_profile_func_enter(ptr @leaf_function, ptr %0)
-; CHECK-NEXT: %1 = call ptr @llvm.returnaddress(i32 0)
-; CHECK-NEXT: call void @__cyg_profile_func_exit(ptr @leaf_function, ptr %1)
-; CHECK-NEXT: ret void
}
define void @root_function() #0 {
-entry:
+; CHECK-LABEL: define void @root_function() {
+; CHECK-NEXT: call void @mcount()
+; CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: call void @__cyg_profile_func_enter(ptr @root_function, ptr [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: call void @__cyg_profile_func_enter(ptr @leaf_function, ptr [[TMP2]])
+; CHECK-NEXT: [[TMP3:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: call void @__cyg_profile_func_exit(ptr @leaf_function, ptr [[TMP3]])
+; CHECK-NEXT: [[TMP4:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: call void @__cyg_profile_func_exit(ptr @root_function, ptr [[TMP4]])
+; CHECK-NEXT: ret void
+;
call void @leaf_function()
ret void
-
-; CHECK-LABEL: define void @root_function()
-; CHECK: entry:
-; CHECK-NEXT: call void @mcount()
-
-; CHECK-NEXT: %0 = call ptr @llvm.returnaddress(i32 0)
-; CHECK-NEXT: call void @__cyg_profile_func_enter(ptr @root_function, ptr %0)
-
-; Entry and exit calls, inlined from @leaf_function()
-; CHECK-NEXT: %1 = call ptr @llvm.returnaddress(i32 0)
-; CHECK-NEXT: call void @__cyg_profile_func_enter(ptr @leaf_function, ptr %1)
-; CHECK-NEXT: %2 = call ptr @llvm.returnaddress(i32 0)
-; CHECK-NEXT: call void @__cyg_profile_func_exit(ptr @leaf_function, ptr %2)
-; CHECK-NEXT: %3 = call ptr @llvm.returnaddress(i32 0)
-
-; CHECK-NEXT: call void @__cyg_profile_func_exit(ptr @root_function, ptr %3)
-; CHECK-NEXT: ret void
}
-
-
; The mcount function has many different names.
-define void @f1() #1 { entry: ret void }
-; CHECK-LABEL: define void @f1
-; CHECK: call void @.mcount
-
-define void @f2() #2 { entry: ret void }
-; CHECK-LABEL: define void @f2
-; CHECK: call void @llvm.arm.gnu.eabi.mcount
+define void @f1() #1 {
+; CHECK-LABEL: define void @f1() {
+; CHECK-NEXT: call void @.mcount()
+; CHECK-NEXT: ret void
+;
+ ret void
+}
-define void @f3() #3 { entry: ret void }
-; CHECK-LABEL: define void @f3
-; CHECK: call void @"\01_mcount"
+define void @f2() #2 {
+; CHECK-LABEL: define void @f2() {
+; CHECK-NEXT: call void @llvm.arm.gnu.eabi.mcount()
+; CHECK-NEXT: ret void
+;
+ ret void
+}
-define void @f4() #4 { entry: ret void }
-; CHECK-LABEL: define void @f4
-; CHECK: call void @"\01mcount"
+define void @f3() #3 {
+; CHECK-LABEL: define void @f3() {
+; CHECK-NEXT: call void @"\01_mcount"()
+; CHECK-NEXT: ret void
+;
+ ret void
+}
-define void @f5() #5 { entry: ret void }
-; CHECK-LABEL: define void @f5
-; CHECK: call void @__mcount
+define void @f4() #4 {
+; CHECK-LABEL: define void @f4() {
+; CHECK-NEXT: call void @"\01mcount"()
+; CHECK-NEXT: ret void
+;
+ ret void
+}
-define void @f6() #6 { entry: ret void }
-; CHECK-LABEL: define void @f6
-; CHECK: call void @_mcount
+define void @f5() #5 {
+; CHECK-LABEL: define void @f5() {
+; CHECK-NEXT: call void @__mcount()
+; CHECK-NEXT: ret void
+;
+ ret void
+}
-define void @f7() #7 { entry: ret void }
-; CHECK-LABEL: define void @f7
-; CHECK: call void @__cyg_profile_func_enter_bare
+define void @f6() #6 {
+; CHECK-LABEL: define void @f6() {
+; CHECK-NEXT: call void @_mcount()
+; CHECK-NEXT: ret void
+;
+ ret void
+}
+define void @f7() #7 {
+; CHECK-LABEL: define void @f7() {
+; CHECK-NEXT: call void @__cyg_profile_func_enter_bare()
+; CHECK-NEXT: ret void
+;
+ ret void
+}
; Treat musttail calls as terminators; inserting between the musttail call and
; ret is not allowed.
declare ptr @tailcallee()
define ptr @tailcaller() #8 {
+; CHECK-LABEL: define ptr @tailcaller() {
+; CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: call void @__cyg_profile_func_exit(ptr @tailcaller, ptr [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = musttail call ptr @tailcallee()
+; CHECK-NEXT: ret ptr [[TMP2]]
+;
%1 = musttail call ptr @tailcallee()
ret ptr %1
-; CHECK-LABEL: define ptr @tailcaller
-; CHECK: call void @__cyg_profile_func_exit
-; CHECK: musttail call ptr @tailcallee
-; CHECK: ret
}
define ptr @tailcaller2() #8 {
+; CHECK-LABEL: define ptr @tailcaller2() {
+; CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: call void @__cyg_profile_func_exit(ptr @tailcaller2, ptr [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = musttail call ptr @tailcallee()
+; CHECK-NEXT: ret ptr [[TMP2]]
+;
%1 = musttail call ptr @tailcallee()
- %2 = bitcast ptr %1 to ptr
- ret ptr %2
-; CHECK-LABEL: define ptr @tailcaller2
-; CHECK: call void @__cyg_profile_func_exit
-; CHECK: musttail call ptr @tailcallee
-; CHECK: bitcast
-; CHECK: ret
+ ret ptr %1
}
;; naked functions are not instrumented, otherwise the argument registers
;; and the return address register (if present) would be clobbered.
-define void @naked() naked { entry: ret void }
-; CHECK-LABEL: define void @naked(
-; CHECK-LABEL-NEXT: entry:
-; CHECK-LABEL-NEXT: ret void
+define void @naked() naked {
+; CHECK-LABEL: define void @naked(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: ret void
+;
+ ret void
+}
; The attributes are "consumed" when the instrumentation is inserted.
; CHECK: attributes
diff --git a/llvm/test/Transforms/FunctionAttrs/nocapture.ll b/llvm/test/Transforms/FunctionAttrs/nocapture.ll
index 8d6f6a7..7df6132 100644
--- a/llvm/test/Transforms/FunctionAttrs/nocapture.ll
+++ b/llvm/test/Transforms/FunctionAttrs/nocapture.ll
@@ -163,24 +163,24 @@ define i1 @c6(ptr %q, i8 %bit) personality ptr @__gxx_personality_v0 {
; FNATTRS-LABEL: define noundef i1 @c6
; FNATTRS-SAME: (ptr readonly [[Q:%.*]], i8 [[BIT:%.*]]) #[[ATTR5:[0-9]+]] personality ptr @__gxx_personality_v0 {
; FNATTRS-NEXT: invoke void @throw_if_bit_set(ptr [[Q]], i8 [[BIT]])
-; FNATTRS-NEXT: to label [[RET0:%.*]] unwind label [[RET1:%.*]]
+; FNATTRS-NEXT: to label [[RET0:%.*]] unwind label [[RET1:%.*]]
; FNATTRS: ret0:
; FNATTRS-NEXT: ret i1 false
; FNATTRS: ret1:
; FNATTRS-NEXT: [[EXN:%.*]] = landingpad { ptr, i32 }
-; FNATTRS-NEXT: cleanup
+; FNATTRS-NEXT: cleanup
; FNATTRS-NEXT: ret i1 true
;
; ATTRIBUTOR: Function Attrs: nosync memory(read)
; ATTRIBUTOR-LABEL: define i1 @c6
; ATTRIBUTOR-SAME: (ptr readonly [[Q:%.*]], i8 [[BIT:%.*]]) #[[ATTR4:[0-9]+]] personality ptr @__gxx_personality_v0 {
; ATTRIBUTOR-NEXT: invoke void @throw_if_bit_set(ptr [[Q]], i8 [[BIT]]) #[[ATTR4]]
-; ATTRIBUTOR-NEXT: to label [[RET0:%.*]] unwind label [[RET1:%.*]]
+; ATTRIBUTOR-NEXT: to label [[RET0:%.*]] unwind label [[RET1:%.*]]
; ATTRIBUTOR: ret0:
; ATTRIBUTOR-NEXT: ret i1 false
; ATTRIBUTOR: ret1:
; ATTRIBUTOR-NEXT: [[EXN:%.*]] = landingpad { ptr, i32 }
-; ATTRIBUTOR-NEXT: cleanup
+; ATTRIBUTOR-NEXT: cleanup
; ATTRIBUTOR-NEXT: ret i1 true
;
invoke void @throw_if_bit_set(ptr %q, i8 %bit)
diff --git a/llvm/test/Transforms/FunctionAttrs/nonnull.ll b/llvm/test/Transforms/FunctionAttrs/nonnull.ll
index ec5545b..4432c4f 100644
--- a/llvm/test/Transforms/FunctionAttrs/nonnull.ll
+++ b/llvm/test/Transforms/FunctionAttrs/nonnull.ll
@@ -246,7 +246,7 @@ define ptr @test10(ptr %a, i64 %n) {
; ATTRIBUTOR-LABEL: define ptr @test10(
; ATTRIBUTOR-SAME: ptr nofree readnone [[A:%.*]], i64 [[N:%.*]]) #[[ATTR3:[0-9]+]] {
; ATTRIBUTOR-NEXT: [[CMP:%.*]] = icmp ne i64 [[N]], 0
-; ATTRIBUTOR-NEXT: call void @llvm.assume(i1 [[CMP]]) #[[ATTR14:[0-9]+]]
+; ATTRIBUTOR-NEXT: call void @llvm.assume(i1 [[CMP]]) #[[ATTR13:[0-9]+]]
; ATTRIBUTOR-NEXT: [[B:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[N]]
; ATTRIBUTOR-NEXT: ret ptr [[B]]
;
@@ -338,7 +338,7 @@ define internal void @test13(ptr %a, ptr %b, ptr %c) {
; FNATTRS-NEXT: ret void
;
; ATTRIBUTOR-LABEL: define internal void @test13(
-; ATTRIBUTOR-SAME: ptr nocapture nofree readnone [[A:%.*]], ptr nocapture nofree readnone [[B:%.*]], ptr nocapture nofree readnone [[C:%.*]]) #[[ATTR4:[0-9]+]] {
+; ATTRIBUTOR-SAME: ptr nocapture nofree nonnull readnone [[A:%.*]], ptr nocapture nofree readnone [[B:%.*]], ptr nocapture nofree readnone [[C:%.*]]) #[[ATTR0]] {
; ATTRIBUTOR-NEXT: ret void
;
ret void
@@ -382,7 +382,7 @@ define internal ptr @f1(ptr %arg) {
; FNATTRS-NEXT: ret ptr [[TMP10]]
;
; ATTRIBUTOR-LABEL: define internal ptr @f1(
-; ATTRIBUTOR-SAME: ptr nofree readonly [[ARG:%.*]]) #[[ATTR5:[0-9]+]] {
+; ATTRIBUTOR-SAME: ptr nofree readonly [[ARG:%.*]]) #[[ATTR4:[0-9]+]] {
; ATTRIBUTOR-NEXT: bb:
; ATTRIBUTOR-NEXT: [[TMP:%.*]] = icmp eq ptr [[ARG]], null
; ATTRIBUTOR-NEXT: br i1 [[TMP]], label [[BB9:%.*]], label [[BB1:%.*]]
@@ -392,11 +392,11 @@ define internal ptr @f1(ptr %arg) {
; ATTRIBUTOR-NEXT: br i1 [[TMP3]], label [[BB6:%.*]], label [[BB4:%.*]]
; ATTRIBUTOR: bb4:
; ATTRIBUTOR-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[ARG]], i64 1
-; ATTRIBUTOR-NEXT: [[TMP5B:%.*]] = tail call ptr @f3(ptr readonly [[TMP5]]) #[[ATTR15:[0-9]+]]
+; ATTRIBUTOR-NEXT: [[TMP5B:%.*]] = tail call ptr @f3(ptr nofree nonnull readonly [[TMP5]]) #[[ATTR14:[0-9]+]]
; ATTRIBUTOR-NEXT: [[TMP5C:%.*]] = getelementptr inbounds i32, ptr [[TMP5B]], i64 -1
; ATTRIBUTOR-NEXT: br label [[BB9]]
; ATTRIBUTOR: bb6:
-; ATTRIBUTOR-NEXT: [[TMP7:%.*]] = tail call ptr @f2(ptr readonly [[ARG]]) #[[ATTR15]]
+; ATTRIBUTOR-NEXT: [[TMP7:%.*]] = tail call ptr @f2(ptr nofree nonnull readonly [[ARG]]) #[[ATTR14]]
; ATTRIBUTOR-NEXT: ret ptr [[TMP7]]
; ATTRIBUTOR: bb9:
; ATTRIBUTOR-NEXT: [[TMP10:%.*]] = phi ptr [ [[TMP5C]], [[BB4]] ], [ inttoptr (i64 4 to ptr), [[BB:%.*]] ]
@@ -436,9 +436,9 @@ define internal ptr @f2(ptr %arg) {
; FNATTRS-NEXT: ret ptr [[TMP]]
;
; ATTRIBUTOR-LABEL: define internal ptr @f2(
-; ATTRIBUTOR-SAME: ptr readonly [[ARG:%.*]]) #[[ATTR5]] {
+; ATTRIBUTOR-SAME: ptr nofree nonnull readonly [[ARG:%.*]]) #[[ATTR4]] {
; ATTRIBUTOR-NEXT: bb:
-; ATTRIBUTOR-NEXT: [[TMP:%.*]] = tail call ptr @f1(ptr readonly [[ARG]]) #[[ATTR15]]
+; ATTRIBUTOR-NEXT: [[TMP:%.*]] = tail call ptr @f1(ptr nofree nonnull readonly [[ARG]]) #[[ATTR14]]
; ATTRIBUTOR-NEXT: ret ptr [[TMP]]
;
bb:
@@ -457,9 +457,9 @@ define dso_local noalias ptr @f3(ptr %arg) {
; FNATTRS-NEXT: ret ptr [[TMP]]
;
; ATTRIBUTOR-LABEL: define dso_local noalias ptr @f3(
-; ATTRIBUTOR-SAME: ptr nofree readonly [[ARG:%.*]]) #[[ATTR5]] {
+; ATTRIBUTOR-SAME: ptr nofree readonly [[ARG:%.*]]) #[[ATTR4]] {
; ATTRIBUTOR-NEXT: bb:
-; ATTRIBUTOR-NEXT: [[TMP:%.*]] = call ptr @f1(ptr nofree readonly [[ARG]]) #[[ATTR15]]
+; ATTRIBUTOR-NEXT: [[TMP:%.*]] = call ptr @f1(ptr nofree readonly [[ARG]]) #[[ATTR14]]
; ATTRIBUTOR-NEXT: ret ptr [[TMP]]
;
bb:
@@ -508,14 +508,14 @@ define void @f16(ptr %a, ptr %b, i8 %c) {
; FNATTRS-NEXT: ret void
;
; ATTRIBUTOR-LABEL: define void @f16(
-; ATTRIBUTOR-SAME: ptr nonnull [[A:%.*]], ptr [[B:%.*]], i8 [[C:%.*]]) #[[ATTR7:[0-9]+]] {
+; ATTRIBUTOR-SAME: ptr nonnull [[A:%.*]], ptr [[B:%.*]], i8 [[C:%.*]]) #[[ATTR6:[0-9]+]] {
; ATTRIBUTOR-NEXT: [[CMP:%.*]] = icmp eq i8 [[C]], 0
; ATTRIBUTOR-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
; ATTRIBUTOR: if.then:
-; ATTRIBUTOR-NEXT: tail call void @fun2(ptr nonnull [[A]], ptr nonnull [[B]]) #[[ATTR16:[0-9]+]]
+; ATTRIBUTOR-NEXT: tail call void @fun2(ptr nonnull [[A]], ptr nonnull [[B]]) #[[ATTR15:[0-9]+]]
; ATTRIBUTOR-NEXT: ret void
; ATTRIBUTOR: if.else:
-; ATTRIBUTOR-NEXT: tail call void @fun2(ptr nonnull [[A]], ptr [[B]]) #[[ATTR16]]
+; ATTRIBUTOR-NEXT: tail call void @fun2(ptr nonnull [[A]], ptr [[B]]) #[[ATTR15]]
; ATTRIBUTOR-NEXT: ret void
;
%cmp = icmp eq i8 %c, 0
@@ -550,17 +550,17 @@ define void @f17(ptr %a, i8 %c) {
; FNATTRS-NEXT: ret void
;
; ATTRIBUTOR-LABEL: define void @f17(
-; ATTRIBUTOR-SAME: ptr nonnull [[A:%.*]], i8 [[C:%.*]]) #[[ATTR7]] {
+; ATTRIBUTOR-SAME: ptr nonnull [[A:%.*]], i8 [[C:%.*]]) #[[ATTR6]] {
; ATTRIBUTOR-NEXT: [[CMP:%.*]] = icmp eq i8 [[C]], 0
; ATTRIBUTOR-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
; ATTRIBUTOR: if.then:
-; ATTRIBUTOR-NEXT: tail call void @fun0() #[[ATTR16]]
+; ATTRIBUTOR-NEXT: tail call void @fun0() #[[ATTR15]]
; ATTRIBUTOR-NEXT: br label [[CONT:%.*]]
; ATTRIBUTOR: if.else:
-; ATTRIBUTOR-NEXT: tail call void @fun0() #[[ATTR16]]
+; ATTRIBUTOR-NEXT: tail call void @fun0() #[[ATTR15]]
; ATTRIBUTOR-NEXT: br label [[CONT]]
; ATTRIBUTOR: cont:
-; ATTRIBUTOR-NEXT: tail call void @fun1(ptr nonnull [[A]]) #[[ATTR16]]
+; ATTRIBUTOR-NEXT: tail call void @fun1(ptr nonnull [[A]]) #[[ATTR15]]
; ATTRIBUTOR-NEXT: ret void
;
%cmp = icmp eq i8 %c, 0
@@ -611,26 +611,26 @@ define void @f18(ptr %a, ptr %b, i8 %c) {
; FNATTRS-NEXT: ret void
;
; ATTRIBUTOR-LABEL: define void @f18(
-; ATTRIBUTOR-SAME: ptr nonnull [[A:%.*]], ptr [[B:%.*]], i8 [[C:%.*]]) #[[ATTR7]] {
+; ATTRIBUTOR-SAME: ptr nonnull [[A:%.*]], ptr [[B:%.*]], i8 [[C:%.*]]) #[[ATTR6]] {
; ATTRIBUTOR-NEXT: [[CMP1:%.*]] = icmp eq i8 [[C]], 0
; ATTRIBUTOR-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
; ATTRIBUTOR: if.then:
-; ATTRIBUTOR-NEXT: tail call void @fun0() #[[ATTR16]]
+; ATTRIBUTOR-NEXT: tail call void @fun0() #[[ATTR15]]
; ATTRIBUTOR-NEXT: br label [[CONT:%.*]]
; ATTRIBUTOR: if.else:
-; ATTRIBUTOR-NEXT: tail call void @fun0() #[[ATTR16]]
+; ATTRIBUTOR-NEXT: tail call void @fun0() #[[ATTR15]]
; ATTRIBUTOR-NEXT: br label [[CONT]]
; ATTRIBUTOR: cont:
; ATTRIBUTOR-NEXT: [[CMP2:%.*]] = icmp eq i8 [[C]], 1
; ATTRIBUTOR-NEXT: br i1 [[CMP2]], label [[CONT_THEN:%.*]], label [[CONT_ELSE:%.*]]
; ATTRIBUTOR: cont.then:
-; ATTRIBUTOR-NEXT: tail call void @fun1(ptr nonnull [[B]]) #[[ATTR16]]
+; ATTRIBUTOR-NEXT: tail call void @fun1(ptr nonnull [[B]]) #[[ATTR15]]
; ATTRIBUTOR-NEXT: br label [[CONT2:%.*]]
; ATTRIBUTOR: cont.else:
-; ATTRIBUTOR-NEXT: tail call void @fun0() #[[ATTR16]]
+; ATTRIBUTOR-NEXT: tail call void @fun0() #[[ATTR15]]
; ATTRIBUTOR-NEXT: br label [[CONT2]]
; ATTRIBUTOR: cont2:
-; ATTRIBUTOR-NEXT: tail call void @fun1(ptr nonnull [[A]]) #[[ATTR16]]
+; ATTRIBUTOR-NEXT: tail call void @fun1(ptr nonnull [[A]]) #[[ATTR15]]
; ATTRIBUTOR-NEXT: ret void
;
%cmp1 = icmp eq i8 %c, 0
@@ -674,7 +674,7 @@ define void @f19(ptr %a, ptr %b, i8 %c) {
; FNATTRS-NEXT: ret void
;
; ATTRIBUTOR-LABEL: define void @f19(
-; ATTRIBUTOR-SAME: ptr [[A:%.*]], ptr nonnull [[B:%.*]], i8 [[C:%.*]]) #[[ATTR8:[0-9]+]] {
+; ATTRIBUTOR-SAME: ptr [[A:%.*]], ptr nonnull [[B:%.*]], i8 [[C:%.*]]) #[[ATTR7:[0-9]+]] {
; ATTRIBUTOR-NEXT: br label [[LOOP_HEADER:%.*]]
; ATTRIBUTOR: loop.header:
; ATTRIBUTOR-NEXT: [[CMP2:%.*]] = icmp eq i8 [[C]], 0
@@ -883,7 +883,7 @@ define i8 @parent7(ptr %a) {
;
; ATTRIBUTOR-LABEL: define i8 @parent7(
; ATTRIBUTOR-SAME: ptr nonnull [[A:%.*]]) {
-; ATTRIBUTOR-NEXT: [[RET:%.*]] = call i8 @use1safecall(ptr nonnull [[A]]) #[[ATTR16]]
+; ATTRIBUTOR-NEXT: [[RET:%.*]] = call i8 @use1safecall(ptr nonnull [[A]]) #[[ATTR15]]
; ATTRIBUTOR-NEXT: call void @use1nonnull(ptr nonnull [[A]])
; ATTRIBUTOR-NEXT: ret i8 [[RET]]
;
@@ -915,7 +915,7 @@ define i1 @parent8(ptr %a, ptr %bogus1, ptr %b) personality ptr @esfp{
; FNATTRS-NEXT: unreachable
;
; ATTRIBUTOR-LABEL: define i1 @parent8(
-; ATTRIBUTOR-SAME: ptr nonnull [[A:%.*]], ptr nocapture nofree readnone [[BOGUS1:%.*]], ptr nonnull [[B:%.*]]) #[[ATTR8]] personality ptr @esfp {
+; ATTRIBUTOR-SAME: ptr nonnull [[A:%.*]], ptr nocapture nofree readnone [[BOGUS1:%.*]], ptr nonnull [[B:%.*]]) #[[ATTR7]] personality ptr @esfp {
; ATTRIBUTOR-NEXT: entry:
; ATTRIBUTOR-NEXT: invoke void @use2nonnull(ptr nonnull [[A]], ptr nonnull [[B]])
; ATTRIBUTOR-NEXT: to label [[CONT:%.*]] unwind label [[EXC:%.*]]
@@ -965,7 +965,7 @@ define ptr @gep1_no_null_opt(ptr %p) #0 {
; FNATTRS-NEXT: ret ptr [[Q]]
;
; ATTRIBUTOR-LABEL: define ptr @gep1_no_null_opt(
-; ATTRIBUTOR-SAME: ptr nofree readnone [[P:%.*]]) #[[ATTR9:[0-9]+]] {
+; ATTRIBUTOR-SAME: ptr nofree readnone [[P:%.*]]) #[[ATTR8:[0-9]+]] {
; ATTRIBUTOR-NEXT: [[Q:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 1
; ATTRIBUTOR-NEXT: ret ptr [[Q]]
;
@@ -1006,8 +1006,8 @@ define internal ptr @g2() {
; FNATTRS-SAME: ) #[[ATTR0]] {
; FNATTRS-NEXT: ret ptr inttoptr (i64 4 to ptr)
;
-; ATTRIBUTOR-LABEL: define internal ptr @g2(
-; ATTRIBUTOR-SAME: ) #[[ATTR10:[0-9]+]] {
+; ATTRIBUTOR-LABEL: define internal nonnull ptr @g2(
+; ATTRIBUTOR-SAME: ) #[[ATTR0]] {
; ATTRIBUTOR-NEXT: ret ptr inttoptr (i64 4 to ptr)
;
ret ptr inttoptr (i64 4 to ptr)
@@ -1021,7 +1021,7 @@ define ptr @g1() {
;
; ATTRIBUTOR-LABEL: define ptr @g1(
; ATTRIBUTOR-SAME: ) #[[ATTR0]] {
-; ATTRIBUTOR-NEXT: [[C:%.*]] = call ptr @g2() #[[ATTR10]]
+; ATTRIBUTOR-NEXT: [[C:%.*]] = call ptr @g2() #[[ATTR16:[0-9]+]]
; ATTRIBUTOR-NEXT: ret ptr [[C]]
;
%c = call ptr @g2()
@@ -1036,8 +1036,8 @@ define internal void @called_by_weak(ptr %a) {
; FNATTRS-NEXT: ret void
;
; ATTRIBUTOR-LABEL: define internal void @called_by_weak(
-; ATTRIBUTOR-SAME: ptr nocapture readnone [[A:%.*]]) #[[ATTR11:[0-9]+]] {
-; ATTRIBUTOR-NEXT: call void @use_i32_ptr(ptr [[A]])
+; ATTRIBUTOR-SAME: ptr nocapture nonnull readnone [[A:%.*]]) #[[ATTR10:[0-9]+]] {
+; ATTRIBUTOR-NEXT: call void @use_i32_ptr(ptr nonnull [[A]]) #[[ATTR17:[0-9]+]]
; ATTRIBUTOR-NEXT: ret void
;
call void @use_i32_ptr(ptr %a)
@@ -1068,8 +1068,8 @@ define internal void @control(ptr dereferenceable(4) %a) {
; FNATTRS-NEXT: ret void
;
; ATTRIBUTOR-LABEL: define internal void @control(
-; ATTRIBUTOR-SAME: ptr nocapture readnone dereferenceable(4) [[A:%.*]]) #[[ATTR11]] {
-; ATTRIBUTOR-NEXT: call void @use_i32_ptr(ptr [[A]])
+; ATTRIBUTOR-SAME: ptr nocapture nonnull readnone dereferenceable(4) [[A:%.*]]) #[[ATTR10]] {
+; ATTRIBUTOR-NEXT: call void @use_i32_ptr(ptr [[A]]) #[[ATTR17]]
; ATTRIBUTOR-NEXT: ret void
;
call void @use_i32_ptr(ptr %a)
@@ -1083,7 +1083,7 @@ define internal void @naked(ptr dereferenceable(4) %a) naked {
; FNATTRS-NEXT: ret void
;
; ATTRIBUTOR-LABEL: define internal void @naked(
-; ATTRIBUTOR-SAME: ptr dereferenceable(4) [[A:%.*]]) #[[ATTR12:[0-9]+]] {
+; ATTRIBUTOR-SAME: ptr nonnull dereferenceable(4) [[A:%.*]]) #[[ATTR11:[0-9]+]] {
; ATTRIBUTOR-NEXT: call void @use_i32_ptr(ptr [[A]])
; ATTRIBUTOR-NEXT: ret void
;
@@ -1098,7 +1098,7 @@ define internal void @optnone(ptr dereferenceable(4) %a) optnone noinline {
; FNATTRS-NEXT: ret void
;
; ATTRIBUTOR-LABEL: define internal void @optnone(
-; ATTRIBUTOR-SAME: ptr dereferenceable(4) [[A:%.*]]) #[[ATTR13:[0-9]+]] {
+; ATTRIBUTOR-SAME: ptr nonnull dereferenceable(4) [[A:%.*]]) #[[ATTR12:[0-9]+]] {
; ATTRIBUTOR-NEXT: call void @use_i32_ptr(ptr [[A]])
; ATTRIBUTOR-NEXT: ret void
;
@@ -1135,35 +1135,20 @@ define void @make_live(ptr nonnull dereferenceable(8) %a) {
declare void @h(ptr) willreturn nounwind
declare i32 @g(ptr) willreturn nounwind
define i32 @nonnull_exec_ctx_1(ptr %a, i32 %b) {
-; FNATTRS-LABEL: define i32 @nonnull_exec_ctx_1(
-; FNATTRS-SAME: ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR7]] {
-; FNATTRS-NEXT: en:
-; FNATTRS-NEXT: [[TMP3:%.*]] = icmp eq i32 [[B]], 0
-; FNATTRS-NEXT: br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]]
-; FNATTRS: ex:
-; FNATTRS-NEXT: [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]])
-; FNATTRS-NEXT: ret i32 [[TMP5]]
-; FNATTRS: hd:
-; FNATTRS-NEXT: [[TMP7:%.*]] = phi i32 [ [[TMP8:%.*]], [[HD]] ], [ 0, [[EN:%.*]] ]
-; FNATTRS-NEXT: tail call void @h(ptr [[A]])
-; FNATTRS-NEXT: [[TMP8]] = add nuw i32 [[TMP7]], 1
-; FNATTRS-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], [[B]]
-; FNATTRS-NEXT: br i1 [[TMP9]], label [[EX]], label [[HD]]
-;
-; ATTRIBUTOR-LABEL: define i32 @nonnull_exec_ctx_1(
-; ATTRIBUTOR-SAME: ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR8]] {
-; ATTRIBUTOR-NEXT: en:
-; ATTRIBUTOR-NEXT: [[TMP3:%.*]] = icmp eq i32 [[B]], 0
-; ATTRIBUTOR-NEXT: br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]]
-; ATTRIBUTOR: ex:
-; ATTRIBUTOR-NEXT: [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]])
-; ATTRIBUTOR-NEXT: ret i32 [[TMP5]]
-; ATTRIBUTOR: hd:
-; ATTRIBUTOR-NEXT: [[TMP7:%.*]] = phi i32 [ [[TMP8:%.*]], [[HD]] ], [ 0, [[EN:%.*]] ]
-; ATTRIBUTOR-NEXT: tail call void @h(ptr [[A]])
-; ATTRIBUTOR-NEXT: [[TMP8]] = add nuw i32 [[TMP7]], 1
-; ATTRIBUTOR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], [[B]]
-; ATTRIBUTOR-NEXT: br i1 [[TMP9]], label [[EX]], label [[HD]]
+; COMMON-LABEL: define i32 @nonnull_exec_ctx_1(
+; COMMON-SAME: ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR7:[0-9]+]] {
+; COMMON-NEXT: en:
+; COMMON-NEXT: [[TMP3:%.*]] = icmp eq i32 [[B]], 0
+; COMMON-NEXT: br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]]
+; COMMON: ex:
+; COMMON-NEXT: [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]])
+; COMMON-NEXT: ret i32 [[TMP5]]
+; COMMON: hd:
+; COMMON-NEXT: [[TMP7:%.*]] = phi i32 [ [[TMP8:%.*]], [[HD]] ], [ 0, [[EN:%.*]] ]
+; COMMON-NEXT: tail call void @h(ptr [[A]])
+; COMMON-NEXT: [[TMP8]] = add nuw i32 [[TMP7]], 1
+; COMMON-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], [[B]]
+; COMMON-NEXT: br i1 [[TMP9]], label [[EX]], label [[HD]]
;
en:
%tmp3 = icmp eq i32 %b, 0
@@ -1182,39 +1167,22 @@ hd:
}
define i32 @nonnull_exec_ctx_1b(ptr %a, i32 %b) {
-; FNATTRS-LABEL: define i32 @nonnull_exec_ctx_1b(
-; FNATTRS-SAME: ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR7]] {
-; FNATTRS-NEXT: en:
-; FNATTRS-NEXT: [[TMP3:%.*]] = icmp eq i32 [[B]], 0
-; FNATTRS-NEXT: br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]]
-; FNATTRS: ex:
-; FNATTRS-NEXT: [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]])
-; FNATTRS-NEXT: ret i32 [[TMP5]]
-; FNATTRS: hd:
-; FNATTRS-NEXT: [[TMP7:%.*]] = phi i32 [ [[TMP8:%.*]], [[HD2:%.*]] ], [ 0, [[EN:%.*]] ]
-; FNATTRS-NEXT: tail call void @h(ptr [[A]])
-; FNATTRS-NEXT: br label [[HD2]]
-; FNATTRS: hd2:
-; FNATTRS-NEXT: [[TMP8]] = add nuw i32 [[TMP7]], 1
-; FNATTRS-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], [[B]]
-; FNATTRS-NEXT: br i1 [[TMP9]], label [[EX]], label [[HD]]
-;
-; ATTRIBUTOR-LABEL: define i32 @nonnull_exec_ctx_1b(
-; ATTRIBUTOR-SAME: ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR8]] {
-; ATTRIBUTOR-NEXT: en:
-; ATTRIBUTOR-NEXT: [[TMP3:%.*]] = icmp eq i32 [[B]], 0
-; ATTRIBUTOR-NEXT: br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]]
-; ATTRIBUTOR: ex:
-; ATTRIBUTOR-NEXT: [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]])
-; ATTRIBUTOR-NEXT: ret i32 [[TMP5]]
-; ATTRIBUTOR: hd:
-; ATTRIBUTOR-NEXT: [[TMP7:%.*]] = phi i32 [ [[TMP8:%.*]], [[HD2:%.*]] ], [ 0, [[EN:%.*]] ]
-; ATTRIBUTOR-NEXT: tail call void @h(ptr [[A]])
-; ATTRIBUTOR-NEXT: br label [[HD2]]
-; ATTRIBUTOR: hd2:
-; ATTRIBUTOR-NEXT: [[TMP8]] = add nuw i32 [[TMP7]], 1
-; ATTRIBUTOR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], [[B]]
-; ATTRIBUTOR-NEXT: br i1 [[TMP9]], label [[EX]], label [[HD]]
+; COMMON-LABEL: define i32 @nonnull_exec_ctx_1b(
+; COMMON-SAME: ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR7]] {
+; COMMON-NEXT: en:
+; COMMON-NEXT: [[TMP3:%.*]] = icmp eq i32 [[B]], 0
+; COMMON-NEXT: br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]]
+; COMMON: ex:
+; COMMON-NEXT: [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]])
+; COMMON-NEXT: ret i32 [[TMP5]]
+; COMMON: hd:
+; COMMON-NEXT: [[TMP7:%.*]] = phi i32 [ [[TMP8:%.*]], [[HD2:%.*]] ], [ 0, [[EN:%.*]] ]
+; COMMON-NEXT: tail call void @h(ptr [[A]])
+; COMMON-NEXT: br label [[HD2]]
+; COMMON: hd2:
+; COMMON-NEXT: [[TMP8]] = add nuw i32 [[TMP7]], 1
+; COMMON-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], [[B]]
+; COMMON-NEXT: br i1 [[TMP9]], label [[EX]], label [[HD]]
;
en:
%tmp3 = icmp eq i32 %b, 0
@@ -1252,7 +1220,7 @@ define i32 @nonnull_exec_ctx_2(ptr %a, i32 %b) willreturn nounwind {
; FNATTRS-NEXT: br i1 [[TMP9]], label [[EX]], label [[HD]]
;
; ATTRIBUTOR-LABEL: define i32 @nonnull_exec_ctx_2(
-; ATTRIBUTOR-SAME: ptr nonnull [[A:%.*]], i32 [[B:%.*]]) #[[ATTR7]] {
+; ATTRIBUTOR-SAME: ptr nonnull [[A:%.*]], i32 [[B:%.*]]) #[[ATTR6]] {
; ATTRIBUTOR-NEXT: en:
; ATTRIBUTOR-NEXT: [[TMP3:%.*]] = icmp eq i32 [[B]], 0
; ATTRIBUTOR-NEXT: br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]]
@@ -1301,7 +1269,7 @@ define i32 @nonnull_exec_ctx_2b(ptr %a, i32 %b) willreturn nounwind {
; FNATTRS-NEXT: br i1 [[TMP9]], label [[EX]], label [[HD]]
;
; ATTRIBUTOR-LABEL: define i32 @nonnull_exec_ctx_2b(
-; ATTRIBUTOR-SAME: ptr nonnull [[A:%.*]], i32 [[B:%.*]]) #[[ATTR7]] {
+; ATTRIBUTOR-SAME: ptr nonnull [[A:%.*]], i32 [[B:%.*]]) #[[ATTR6]] {
; ATTRIBUTOR-NEXT: en:
; ATTRIBUTOR-NEXT: [[TMP3:%.*]] = icmp eq i32 [[B]], 0
; ATTRIBUTOR-NEXT: br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]]
diff --git a/llvm/test/Transforms/FunctionAttrs/norecurse.ll b/llvm/test/Transforms/FunctionAttrs/norecurse.ll
index 7924428..a902974 100644
--- a/llvm/test/Transforms/FunctionAttrs/norecurse.ll
+++ b/llvm/test/Transforms/FunctionAttrs/norecurse.ll
@@ -4,10 +4,15 @@
define i32 @leaf() {
-; COMMON: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
-; COMMON-LABEL: define {{[^@]+}}@leaf
-; COMMON-SAME: () #[[ATTR0:[0-9]+]] {
-; COMMON-NEXT: ret i32 1
+; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+; FNATTRS-LABEL: define {{[^@]+}}@leaf
+; FNATTRS-SAME: () #[[ATTR0:[0-9]+]] {
+; FNATTRS-NEXT: ret i32 1
+;
+; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+; ATTRIBUTOR-LABEL: define {{[^@]+}}@leaf
+; ATTRIBUTOR-SAME: () #[[ATTR0:[0-9]+]] {
+; ATTRIBUTOR-NEXT: ret i32 1
;
ret i32 1
}
@@ -108,9 +113,9 @@ define internal i32 @called_by_norecurse() {
; FNATTRS-NEXT: [[A:%.*]] = call i32 @k()
; FNATTRS-NEXT: ret i32 [[A]]
;
-; ATTRIBUTOR: Function Attrs: nosync memory(none)
+; ATTRIBUTOR: Function Attrs: norecurse nosync memory(none)
; ATTRIBUTOR-LABEL: define {{[^@]+}}@called_by_norecurse
-; ATTRIBUTOR-SAME: () #[[ATTR2]] {
+; ATTRIBUTOR-SAME: () #[[ATTR6:[0-9]+]] {
; ATTRIBUTOR-NEXT: [[A:%.*]] = call i32 @k() #[[ATTR7]]
; ATTRIBUTOR-NEXT: ret i32 [[A]]
;
@@ -127,7 +132,7 @@ define void @m() norecurse {
;
; ATTRIBUTOR: Function Attrs: norecurse nosync memory(none)
; ATTRIBUTOR-LABEL: define {{[^@]+}}@m
-; ATTRIBUTOR-SAME: () #[[ATTR6:[0-9]+]] {
+; ATTRIBUTOR-SAME: () #[[ATTR6]] {
; ATTRIBUTOR-NEXT: [[A:%.*]] = call i32 @called_by_norecurse() #[[ATTR2]]
; ATTRIBUTOR-NEXT: ret void
;
@@ -142,9 +147,9 @@ define internal i32 @called_by_norecurse_indirectly() {
; FNATTRS-NEXT: [[A:%.*]] = call i32 @k()
; FNATTRS-NEXT: ret i32 [[A]]
;
-; ATTRIBUTOR: Function Attrs: nosync memory(none)
+; ATTRIBUTOR: Function Attrs: norecurse nosync memory(none)
; ATTRIBUTOR-LABEL: define {{[^@]+}}@called_by_norecurse_indirectly
-; ATTRIBUTOR-SAME: () #[[ATTR2]] {
+; ATTRIBUTOR-SAME: () #[[ATTR6]] {
; ATTRIBUTOR-NEXT: [[A:%.*]] = call i32 @k() #[[ATTR7]]
; ATTRIBUTOR-NEXT: ret i32 [[A]]
;
@@ -159,9 +164,9 @@ define internal void @o() {
; FNATTRS-NEXT: [[A:%.*]] = call i32 @called_by_norecurse_indirectly()
; FNATTRS-NEXT: ret void
;
-; ATTRIBUTOR: Function Attrs: nosync memory(none)
+; ATTRIBUTOR: Function Attrs: norecurse nosync memory(none)
; ATTRIBUTOR-LABEL: define {{[^@]+}}@o
-; ATTRIBUTOR-SAME: () #[[ATTR2]] {
+; ATTRIBUTOR-SAME: () #[[ATTR6]] {
; ATTRIBUTOR-NEXT: [[A:%.*]] = call i32 @called_by_norecurse_indirectly() #[[ATTR2]]
; ATTRIBUTOR-NEXT: ret void
;
@@ -213,7 +218,7 @@ define internal void @q() {
; ATTRIBUTOR: Function Attrs: norecurse nosync memory(none)
; ATTRIBUTOR-LABEL: define {{[^@]+}}@q
; ATTRIBUTOR-SAME: () #[[ATTR6]] {
-; ATTRIBUTOR-NEXT: [[A:%.*]] = call i32 @escapes_as_parameter(ptr nonnull @escapes_as_parameter) #[[ATTR2]]
+; ATTRIBUTOR-NEXT: [[A:%.*]] = call i32 @escapes_as_parameter(ptr nocapture nofree nonnull readnone @escapes_as_parameter) #[[ATTR2]]
; ATTRIBUTOR-NEXT: ret void
;
%a = call i32 @escapes_as_parameter(ptr @escapes_as_parameter)
@@ -255,3 +260,5 @@ define void @r() norecurse {
; ATTRIBUTOR: attributes #[[ATTR7]] = { nosync }
; ATTRIBUTOR: attributes #[[ATTR8]] = { nofree willreturn }
;.
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; COMMON: {{.*}}
diff --git a/llvm/test/Transforms/FunctionAttrs/read-write-scc.ll b/llvm/test/Transforms/FunctionAttrs/read-write-scc.ll
index 3640eb5..be61990 100644
--- a/llvm/test/Transforms/FunctionAttrs/read-write-scc.ll
+++ b/llvm/test/Transforms/FunctionAttrs/read-write-scc.ll
@@ -4,7 +4,7 @@
@i = global i32 0
define void @foo() {
-; CHECK: Function Attrs: nofree nosync nounwind
+; CHECK: Function Attrs: nofree nosync nounwind memory(readwrite, argmem: none, inaccessiblemem: none)
; CHECK-LABEL: define {{[^@]+}}@foo
; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: store i32 1, ptr @i, align 4
@@ -17,7 +17,7 @@ define void @foo() {
}
define void @bar() {
-; CHECK: Function Attrs: nofree nosync nounwind
+; CHECK: Function Attrs: nofree nosync nounwind memory(readwrite, argmem: none, inaccessiblemem: none)
; CHECK-LABEL: define {{[^@]+}}@bar
; CHECK-SAME: () #[[ATTR0]] {
; CHECK-NEXT: [[I:%.*]] = load i32, ptr @i, align 4
diff --git a/llvm/test/Transforms/FunctionAttrs/willreturn.ll b/llvm/test/Transforms/FunctionAttrs/willreturn.ll
index bf3f4ad..7092634 100644
--- a/llvm/test/Transforms/FunctionAttrs/willreturn.ll
+++ b/llvm/test/Transforms/FunctionAttrs/willreturn.ll
@@ -102,23 +102,23 @@ define i64 @mustprogress_mayunwind() mustprogress personality ptr @__gxx_persona
; FNATTRS: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none)
; FNATTRS-LABEL: @mustprogress_mayunwind(
; FNATTRS-NEXT: [[A:%.*]] = invoke i64 @fn_noread()
-; FNATTRS-NEXT: to label [[A:%.*]] unwind label [[B:%.*]]
+; FNATTRS-NEXT: to label [[A:%.*]] unwind label [[B:%.*]]
; FNATTRS: A:
; FNATTRS-NEXT: ret i64 10
; FNATTRS: B:
; FNATTRS-NEXT: [[VAL:%.*]] = landingpad { ptr, i32 }
-; FNATTRS-NEXT: catch ptr null
+; FNATTRS-NEXT: catch ptr null
; FNATTRS-NEXT: ret i64 0
;
; ATTRIBUTOR: Function Attrs: mustprogress nosync nounwind willreturn memory(none)
; ATTRIBUTOR-LABEL: @mustprogress_mayunwind(
-; ATTRIBUTOR-NEXT: [[A:%.*]] = invoke i64 @fn_noread()
-; ATTRIBUTOR-NEXT: to label [[A:%.*]] unwind label [[B:%.*]]
+; ATTRIBUTOR-NEXT: [[A:%.*]] = invoke i64 @fn_noread() #[[ATTR13:[0-9]+]]
+; ATTRIBUTOR-NEXT: to label [[A:%.*]] unwind label [[B:%.*]]
; ATTRIBUTOR: A:
; ATTRIBUTOR-NEXT: ret i64 10
; ATTRIBUTOR: B:
; ATTRIBUTOR-NEXT: [[VAL:%.*]] = landingpad { ptr, i32 }
-; ATTRIBUTOR-NEXT: catch ptr null
+; ATTRIBUTOR-NEXT: catch ptr null
; ATTRIBUTOR-NEXT: ret i64 0
;
%a = invoke i64 @fn_noread()
diff --git a/llvm/test/Transforms/IndVarSimplify/AArch64/widen-loop-comp.ll b/llvm/test/Transforms/IndVarSimplify/AArch64/widen-loop-comp.ll
index c5f656c..99541b3 100644
--- a/llvm/test/Transforms/IndVarSimplify/AArch64/widen-loop-comp.ll
+++ b/llvm/test/Transforms/IndVarSimplify/AArch64/widen-loop-comp.ll
@@ -265,16 +265,17 @@ define i32 @test5(ptr %a, i32 %b) {
; CHECK-LABEL: @test5(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[B:%.*]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
; CHECK-NEXT: br label [[FOR_COND:%.*]]
; CHECK: for.cond:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY:%.*]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[SUM_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[CMP:%.*]] = icmp ule i64 [[INDVARS_IV]], [[TMP0]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV]], [[TMP1]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ADD]] = add nsw i32 [[SUM_0]], [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ADD]] = add nsw i32 [[SUM_0]], [[TMP2]]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: br label [[FOR_COND]]
; CHECK: for.end:
@@ -349,22 +350,23 @@ define i32 @test7(ptr %a, i32 %b) {
; CHECK-LABEL: @test7(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[B:%.*]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[B]], i32 -1)
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SMAX]], 2
-; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SMAX]], 2
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[TMP2]] to i64
; CHECK-NEXT: br label [[FOR_COND:%.*]]
; CHECK: for.cond:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY:%.*]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[SUM_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[CMP:%.*]] = icmp ule i64 [[INDVARS_IV]], [[TMP0]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV]], [[TMP1]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ADD]] = add nsw i32 [[SUM_0]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ADD]] = add nsw i32 [[SUM_0]], [[TMP3]]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND]], label [[FOR_END]]
+; CHECK-NEXT: [[EXITCOND2:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND2]], label [[FOR_COND]], label [[FOR_END]]
; CHECK: for.end:
; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[SUM_0]], [[FOR_BODY]] ], [ [[SUM_0]], [[FOR_COND]] ]
; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
diff --git a/llvm/test/Transforms/InstCombine/load-cmp.ll b/llvm/test/Transforms/InstCombine/load-cmp.ll
index e941284..b956de2 100644
--- a/llvm/test/Transforms/InstCombine/load-cmp.ll
+++ b/llvm/test/Transforms/InstCombine/load-cmp.ll
@@ -334,3 +334,20 @@ define i1 @test10_struct_arr_noinbounds_i64(i64 %x) {
%r = icmp eq i32 %q, 9
ret i1 %r
}
+
+@table = internal constant [2 x ptr] [ptr @g, ptr getelementptr (i8, ptr @g, i64 4)], align 16
+@g = external global [2 x i32]
+
+define i1 @pr93017(i64 %idx) {
+; CHECK-LABEL: @pr93017(
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[IDX:%.*]] to i32
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr], ptr @table, i32 0, i32 [[TMP1]]
+; CHECK-NEXT: [[V:%.*]] = load ptr, ptr [[GEP]], align 4
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne ptr [[V]], null
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %gep = getelementptr inbounds [2 x ptr], ptr @table, i64 0, i64 %idx
+ %v = load ptr, ptr %gep
+ %cmp = icmp ne ptr %v, null
+ ret i1 %cmp
+}
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
index 54348d1..24d624c 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
; RUN: opt < %s -interleaved-access -S | FileCheck %s --check-prefix=NEON
-; RUN: opt < %s -interleaved-access -mtriple=aarch64-linux-gnu -mattr=+sve -force-streaming-compatible-sve -S | FileCheck %s --check-prefix=SVE-FIXED
+; RUN: opt < %s -interleaved-access -mtriple=aarch64-linux-gnu -mattr=+sve -force-streaming-compatible -S | FileCheck %s --check-prefix=SVE-FIXED
; RUN: opt < %s -passes=interleaved-access -S | FileCheck %s --check-prefix=NEON
-; RUN: opt < %s -passes=interleaved-access -mtriple=aarch64-linux-gnu -mattr=+sve -force-streaming-compatible-sve -S | FileCheck %s --check-prefix=SVE-FIXED
+; RUN: opt < %s -passes=interleaved-access -mtriple=aarch64-linux-gnu -mattr=+sve -force-streaming-compatible -S | FileCheck %s --check-prefix=SVE-FIXED
target triple = "aarch64-linux-gnu"
diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/pr40514.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/pr40514.ll
index 03b1aec..a6bff63 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/X86/pr40514.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/X86/pr40514.ll
@@ -54,4 +54,4 @@ bb10: ; preds = %bb10, %bb
}
-attributes #0 = { "target-cpu"="broadwell" "target-features"="+sse2,+cx16,+sahf,-tbm,-avx512ifma,-sha,-gfni,-fma4,-vpclmulqdq,+prfchw,+bmi2,-cldemote,+fsgsbase,-ptwrite,-xsavec,+popcnt,+aes,-avx512bitalg,-movdiri,-xsaves,-avx512er,-avx512vnni,-avx512vpopcntdq,-pconfig,-clwb,-avx512f,-clzero,-pku,+mmx,-lwp,-rdpid,-xop,+rdseed,-waitpkg,-movdir64b,-sse4a,-avx512bw,-clflushopt,+xsave,-avx512vbmi2,+64bit,-avx512vl,+invpcid,-avx512cd,+avx,-vaes,+rtm,+fma,+bmi,+rdrnd,-mwaitx,+sse4.1,+sse4.2,+avx2,-wbnoinvd,+sse,+lzcnt,+pclmul,-prefetchwt1,+f16c,+ssse3,-sgx,-shstk,+cmov,-avx512vbmi,+movbe,+xsaveopt,-avx512dq,+adx,-avx512pf,+sse3" }
+attributes #0 = { "target-cpu"="broadwell" "target-features"="+sse2,+cx16,+sahf,-tbm,-avx512ifma,-sha,-gfni,-fma4,-vpclmulqdq,+prfchw,+bmi2,-cldemote,+fsgsbase,-ptwrite,-xsavec,+popcnt,+aes,-avx512bitalg,-movdiri,-xsaves,-avx512vnni,-avx512vpopcntdq,-pconfig,-clwb,-avx512f,-clzero,-pku,+mmx,-lwp,-rdpid,-xop,+rdseed,-waitpkg,-movdir64b,-sse4a,-avx512bw,-clflushopt,+xsave,-avx512vbmi2,+64bit,-avx512vl,+invpcid,-avx512cd,+avx,-vaes,+rtm,+fma,+bmi,+rdrnd,-mwaitx,+sse4.1,+sse4.2,+avx2,-wbnoinvd,+sse,+lzcnt,+pclmul,+f16c,+ssse3,-sgx,-shstk,+cmov,-avx512vbmi,+movbe,+xsaveopt,-avx512dq,+adx,-avx512pf,+sse3" }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index 9d8d703..a74b0b4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -634,6 +634,247 @@ exit:
ret void
}
+define i32 @header_mask_and_invariant_compare(ptr %A, ptr %B, ptr %C, ptr %D, ptr %E, i64 %N) "target-features"="+sve" {
+; DEFAULT-LABEL: define i32 @header_mask_and_invariant_compare(
+; DEFAULT-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], ptr [[E:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
+; DEFAULT-NEXT: entry:
+; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; DEFAULT-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; DEFAULT-NEXT: [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 64, i64 [[TMP2]])
+; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
+; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; DEFAULT: vector.memcheck:
+; DEFAULT-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[E]], i64 4
+; DEFAULT-NEXT: [[TMP4:%.*]] = shl i64 [[N]], 2
+; DEFAULT-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], 4
+; DEFAULT-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[D]], i64 [[TMP5]]
+; DEFAULT-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 4
+; DEFAULT-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[B]], i64 4
+; DEFAULT-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[C]], i64 4
+; DEFAULT-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[E]], [[SCEVGEP1]]
+; DEFAULT-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[D]], [[SCEVGEP]]
+; DEFAULT-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; DEFAULT-NEXT: [[BOUND05:%.*]] = icmp ult ptr [[E]], [[SCEVGEP2]]
+; DEFAULT-NEXT: [[BOUND16:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]]
+; DEFAULT-NEXT: [[FOUND_CONFLICT7:%.*]] = and i1 [[BOUND05]], [[BOUND16]]
+; DEFAULT-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT7]]
+; DEFAULT-NEXT: [[BOUND08:%.*]] = icmp ult ptr [[E]], [[SCEVGEP3]]
+; DEFAULT-NEXT: [[BOUND19:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]]
+; DEFAULT-NEXT: [[FOUND_CONFLICT10:%.*]] = and i1 [[BOUND08]], [[BOUND19]]
+; DEFAULT-NEXT: [[CONFLICT_RDX11:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT10]]
+; DEFAULT-NEXT: [[BOUND012:%.*]] = icmp ult ptr [[E]], [[SCEVGEP4]]
+; DEFAULT-NEXT: [[BOUND113:%.*]] = icmp ult ptr [[C]], [[SCEVGEP]]
+; DEFAULT-NEXT: [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], [[BOUND113]]
+; DEFAULT-NEXT: [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX11]], [[FOUND_CONFLICT14]]
+; DEFAULT-NEXT: [[BOUND016:%.*]] = icmp ult ptr [[D]], [[SCEVGEP2]]
+; DEFAULT-NEXT: [[BOUND117:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]]
+; DEFAULT-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
+; DEFAULT-NEXT: [[CONFLICT_RDX19:%.*]] = or i1 [[CONFLICT_RDX15]], [[FOUND_CONFLICT18]]
+; DEFAULT-NEXT: [[BOUND020:%.*]] = icmp ult ptr [[D]], [[SCEVGEP3]]
+; DEFAULT-NEXT: [[BOUND121:%.*]] = icmp ult ptr [[B]], [[SCEVGEP1]]
+; DEFAULT-NEXT: [[FOUND_CONFLICT22:%.*]] = and i1 [[BOUND020]], [[BOUND121]]
+; DEFAULT-NEXT: [[CONFLICT_RDX23:%.*]] = or i1 [[CONFLICT_RDX19]], [[FOUND_CONFLICT22]]
+; DEFAULT-NEXT: [[BOUND024:%.*]] = icmp ult ptr [[D]], [[SCEVGEP4]]
+; DEFAULT-NEXT: [[BOUND125:%.*]] = icmp ult ptr [[C]], [[SCEVGEP1]]
+; DEFAULT-NEXT: [[FOUND_CONFLICT26:%.*]] = and i1 [[BOUND024]], [[BOUND125]]
+; DEFAULT-NEXT: [[CONFLICT_RDX27:%.*]] = or i1 [[CONFLICT_RDX23]], [[FOUND_CONFLICT26]]
+; DEFAULT-NEXT: br i1 [[CONFLICT_RDX27]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; DEFAULT: vector.ph:
+; DEFAULT-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP7]]
+; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; DEFAULT-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT32:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[E]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT33:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT32]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
+; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]]
+; DEFAULT: vector.body:
+; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; DEFAULT-NEXT: [[TMP11:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META9:![0-9]+]]
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP11]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT29:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT28]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; DEFAULT-NEXT: [[TMP12:%.*]] = load i32, ptr [[B]], align 4, !alias.scope [[META12:![0-9]+]]
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP12]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; DEFAULT-NEXT: [[TMP13:%.*]] = or <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT29]]
+; DEFAULT-NEXT: [[TMP14:%.*]] = load i32, ptr [[C]], align 4, !alias.scope [[META14:![0-9]+]]
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT30:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP14]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT31:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT30]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; DEFAULT-NEXT: [[TMP15:%.*]] = icmp ugt <vscale x 4 x i32> [[BROADCAST_SPLAT31]], [[TMP13]]
+; DEFAULT-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[D]], i64 [[TMP10]]
+; DEFAULT-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP13]], <vscale x 4 x ptr> [[BROADCAST_SPLAT33]], i32 4, <vscale x 4 x i1> [[TMP15]]), !alias.scope [[META16:![0-9]+]], !noalias [[META18:![0-9]+]]
+; DEFAULT-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP16]], i32 0
+; DEFAULT-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> zeroinitializer, ptr [[TMP17]], i32 4, <vscale x 4 x i1> [[TMP15]]), !alias.scope [[META20:![0-9]+]], !noalias [[META21:![0-9]+]]
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
+; DEFAULT-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; DEFAULT: middle.block:
+; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; DEFAULT: scalar.ph:
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; DEFAULT-NEXT: br label [[LOOP_HEADER:%.*]]
+; DEFAULT: loop.header:
+; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; DEFAULT-NEXT: [[L_A:%.*]] = load i32, ptr [[A]], align 4
+; DEFAULT-NEXT: [[L_B:%.*]] = load i32, ptr [[B]], align 4
+; DEFAULT-NEXT: [[OR:%.*]] = or i32 [[L_B]], [[L_A]]
+; DEFAULT-NEXT: [[L_C:%.*]] = load i32, ptr [[C]], align 4
+; DEFAULT-NEXT: [[C_0:%.*]] = icmp ugt i32 [[L_C]], [[OR]]
+; DEFAULT-NEXT: br i1 [[C_0]], label [[IF_THEN:%.*]], label [[LOOP_LATCH]]
+; DEFAULT: if.then:
+; DEFAULT-NEXT: [[GEP_D:%.*]] = getelementptr i32, ptr [[D]], i64 [[IV]]
+; DEFAULT-NEXT: store i32 [[OR]], ptr [[E]], align 4
+; DEFAULT-NEXT: store i32 0, ptr [[GEP_D]], align 4
+; DEFAULT-NEXT: br label [[LOOP_LATCH]]
+; DEFAULT: loop.latch:
+; DEFAULT-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; DEFAULT-NEXT: [[C_1:%.*]] = icmp eq i64 [[IV]], [[N]]
+; DEFAULT-NEXT: br i1 [[C_1]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP23:![0-9]+]]
+; DEFAULT: exit:
+; DEFAULT-NEXT: ret i32 0
+;
+; PRED-LABEL: define i32 @header_mask_and_invariant_compare(
+; PRED-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], ptr [[E:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
+; PRED-NEXT: entry:
+; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; PRED: vector.memcheck:
+; PRED-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[E]], i64 4
+; PRED-NEXT: [[TMP1:%.*]] = shl i64 [[N]], 2
+; PRED-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 4
+; PRED-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[D]], i64 [[TMP2]]
+; PRED-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 4
+; PRED-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[B]], i64 4
+; PRED-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[C]], i64 4
+; PRED-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[E]], [[SCEVGEP1]]
+; PRED-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[D]], [[SCEVGEP]]
+; PRED-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; PRED-NEXT: [[BOUND05:%.*]] = icmp ult ptr [[E]], [[SCEVGEP2]]
+; PRED-NEXT: [[BOUND16:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]]
+; PRED-NEXT: [[FOUND_CONFLICT7:%.*]] = and i1 [[BOUND05]], [[BOUND16]]
+; PRED-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT7]]
+; PRED-NEXT: [[BOUND08:%.*]] = icmp ult ptr [[E]], [[SCEVGEP3]]
+; PRED-NEXT: [[BOUND19:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]]
+; PRED-NEXT: [[FOUND_CONFLICT10:%.*]] = and i1 [[BOUND08]], [[BOUND19]]
+; PRED-NEXT: [[CONFLICT_RDX11:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT10]]
+; PRED-NEXT: [[BOUND012:%.*]] = icmp ult ptr [[E]], [[SCEVGEP4]]
+; PRED-NEXT: [[BOUND113:%.*]] = icmp ult ptr [[C]], [[SCEVGEP]]
+; PRED-NEXT: [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], [[BOUND113]]
+; PRED-NEXT: [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX11]], [[FOUND_CONFLICT14]]
+; PRED-NEXT: [[BOUND016:%.*]] = icmp ult ptr [[D]], [[SCEVGEP2]]
+; PRED-NEXT: [[BOUND117:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]]
+; PRED-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
+; PRED-NEXT: [[CONFLICT_RDX19:%.*]] = or i1 [[CONFLICT_RDX15]], [[FOUND_CONFLICT18]]
+; PRED-NEXT: [[BOUND020:%.*]] = icmp ult ptr [[D]], [[SCEVGEP3]]
+; PRED-NEXT: [[BOUND121:%.*]] = icmp ult ptr [[B]], [[SCEVGEP1]]
+; PRED-NEXT: [[FOUND_CONFLICT22:%.*]] = and i1 [[BOUND020]], [[BOUND121]]
+; PRED-NEXT: [[CONFLICT_RDX23:%.*]] = or i1 [[CONFLICT_RDX19]], [[FOUND_CONFLICT22]]
+; PRED-NEXT: [[BOUND024:%.*]] = icmp ult ptr [[D]], [[SCEVGEP4]]
+; PRED-NEXT: [[BOUND125:%.*]] = icmp ult ptr [[C]], [[SCEVGEP1]]
+; PRED-NEXT: [[FOUND_CONFLICT26:%.*]] = and i1 [[BOUND024]], [[BOUND125]]
+; PRED-NEXT: [[CONFLICT_RDX27:%.*]] = or i1 [[CONFLICT_RDX23]], [[FOUND_CONFLICT26]]
+; PRED-NEXT: br i1 [[CONFLICT_RDX27]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; PRED: vector.ph:
+; PRED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; PRED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; PRED-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1
+; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP7]]
+; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]]
+; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; PRED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; PRED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4
+; PRED-NEXT: [[TMP12:%.*]] = sub i64 [[TMP0]], [[TMP11]]
+; PRED-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[TMP0]], [[TMP11]]
+; PRED-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0
+; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP0]])
+; PRED-NEXT: [[BROADCAST_SPLATINSERT32:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[E]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT33:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT32]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
+; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
+; PRED: vector.body:
+; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 0
+; PRED-NEXT: [[TMP16:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META6:![0-9]+]]
+; PRED-NEXT: [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP16]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT29:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT28]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; PRED-NEXT: [[TMP17:%.*]] = load i32, ptr [[B]], align 4, !alias.scope [[META9:![0-9]+]]
+; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP17]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; PRED-NEXT: [[TMP18:%.*]] = or <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT29]]
+; PRED-NEXT: [[TMP19:%.*]] = load i32, ptr [[C]], align 4, !alias.scope [[META11:![0-9]+]]
+; PRED-NEXT: [[BROADCAST_SPLATINSERT30:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP19]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT31:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT30]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; PRED-NEXT: [[TMP20:%.*]] = icmp ugt <vscale x 4 x i32> [[BROADCAST_SPLAT31]], [[TMP18]]
+; PRED-NEXT: [[TMP21:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP20]], <vscale x 4 x i1> zeroinitializer
+; PRED-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[D]], i64 [[TMP15]]
+; PRED-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP18]], <vscale x 4 x ptr> [[BROADCAST_SPLAT33]], i32 4, <vscale x 4 x i1> [[TMP21]]), !alias.scope [[META13:![0-9]+]], !noalias [[META15:![0-9]+]]
+; PRED-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP22]], i32 0
+; PRED-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> zeroinitializer, ptr [[TMP23]], i32 4, <vscale x 4 x i1> [[TMP21]]), !alias.scope [[META17:![0-9]+]], !noalias [[META18:![0-9]+]]
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
+; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP14]])
+; PRED-NEXT: [[TMP24:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; PRED-NEXT: [[TMP25:%.*]] = extractelement <vscale x 4 x i1> [[TMP24]], i32 0
+; PRED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; PRED: middle.block:
+; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; PRED: scalar.ph:
+; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; PRED-NEXT: br label [[LOOP_HEADER:%.*]]
+; PRED: loop.header:
+; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; PRED-NEXT: [[L_A:%.*]] = load i32, ptr [[A]], align 4
+; PRED-NEXT: [[L_B:%.*]] = load i32, ptr [[B]], align 4
+; PRED-NEXT: [[OR:%.*]] = or i32 [[L_B]], [[L_A]]
+; PRED-NEXT: [[L_C:%.*]] = load i32, ptr [[C]], align 4
+; PRED-NEXT: [[C_0:%.*]] = icmp ugt i32 [[L_C]], [[OR]]
+; PRED-NEXT: br i1 [[C_0]], label [[IF_THEN:%.*]], label [[LOOP_LATCH]]
+; PRED: if.then:
+; PRED-NEXT: [[GEP_D:%.*]] = getelementptr i32, ptr [[D]], i64 [[IV]]
+; PRED-NEXT: store i32 [[OR]], ptr [[E]], align 4
+; PRED-NEXT: store i32 0, ptr [[GEP_D]], align 4
+; PRED-NEXT: br label [[LOOP_LATCH]]
+; PRED: loop.latch:
+; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; PRED-NEXT: [[C_1:%.*]] = icmp eq i64 [[IV]], [[N]]
+; PRED-NEXT: br i1 [[C_1]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP20:![0-9]+]]
+; PRED: exit:
+; PRED-NEXT: ret i32 0
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %l.A = load i32, ptr %A, align 4
+ %l.B = load i32, ptr %B, align 4
+ %or = or i32 %l.B, %l.A
+ %l.C = load i32, ptr %C, align 4
+ %c.0 = icmp ugt i32 %l.C, %or
+ br i1 %c.0, label %if.then, label %loop.latch
+
+if.then:
+ %gep.D = getelementptr i32, ptr %D, i64 %iv
+ store i32 %or, ptr %E, align 4
+ store i32 0, ptr %gep.D, align 4
+ br label %loop.latch
+
+loop.latch:
+ %iv.next = add i64 %iv, 1
+ %c.1 = icmp eq i64 %iv, %N
+ br i1 %c.1, label %exit, label %loop.header
+
+exit:
+ ret i32 0
+}
+
;.
; DEFAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; DEFAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
@@ -644,6 +885,21 @@ exit:
; DEFAULT: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
; DEFAULT: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]}
; DEFAULT: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META1]]}
+; DEFAULT: [[META9]] = !{[[META10:![0-9]+]]}
+; DEFAULT: [[META10]] = distinct !{[[META10]], [[META11:![0-9]+]]}
+; DEFAULT: [[META11]] = distinct !{[[META11]], !"LVerDomain"}
+; DEFAULT: [[META12]] = !{[[META13:![0-9]+]]}
+; DEFAULT: [[META13]] = distinct !{[[META13]], [[META11]]}
+; DEFAULT: [[META14]] = !{[[META15:![0-9]+]]}
+; DEFAULT: [[META15]] = distinct !{[[META15]], [[META11]]}
+; DEFAULT: [[META16]] = !{[[META17:![0-9]+]]}
+; DEFAULT: [[META17]] = distinct !{[[META17]], [[META11]]}
+; DEFAULT: [[META18]] = !{[[META19:![0-9]+]], [[META10]], [[META13]], [[META15]]}
+; DEFAULT: [[META19]] = distinct !{[[META19]], [[META11]]}
+; DEFAULT: [[META20]] = !{[[META19]]}
+; DEFAULT: [[META21]] = !{[[META10]], [[META13]], [[META15]]}
+; DEFAULT: [[LOOP22]] = distinct !{[[LOOP22]], [[META1]], [[META2]]}
+; DEFAULT: [[LOOP23]] = distinct !{[[LOOP23]], [[META1]]}
;.
; PRED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; PRED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
@@ -651,4 +907,19 @@ exit:
; PRED: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
; PRED: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
; PRED: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; PRED: [[META6]] = !{[[META7:![0-9]+]]}
+; PRED: [[META7]] = distinct !{[[META7]], [[META8:![0-9]+]]}
+; PRED: [[META8]] = distinct !{[[META8]], !"LVerDomain"}
+; PRED: [[META9]] = !{[[META10:![0-9]+]]}
+; PRED: [[META10]] = distinct !{[[META10]], [[META8]]}
+; PRED: [[META11]] = !{[[META12:![0-9]+]]}
+; PRED: [[META12]] = distinct !{[[META12]], [[META8]]}
+; PRED: [[META13]] = !{[[META14:![0-9]+]]}
+; PRED: [[META14]] = distinct !{[[META14]], [[META8]]}
+; PRED: [[META15]] = !{[[META16:![0-9]+]], [[META7]], [[META10]], [[META12]]}
+; PRED: [[META16]] = distinct !{[[META16]], [[META8]]}
+; PRED: [[META17]] = !{[[META16]]}
+; PRED: [[META18]] = !{[[META7]], [[META10]], [[META12]]}
+; PRED: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]], [[META2]]}
+; PRED: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]]}
;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/streaming-compatible-sve-no-maximize-bandwidth.ll b/llvm/test/Transforms/LoopVectorize/AArch64/streaming-compatible-sve-no-maximize-bandwidth.ll
index b89d09f..6b10d45 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/streaming-compatible-sve-no-maximize-bandwidth.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/streaming-compatible-sve-no-maximize-bandwidth.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=loop-vectorize -force-streaming-compatible-sve -enable-fixedwidth-autovec-in-streaming-mode -mattr=+sve -force-target-instruction-cost=1 -scalable-vectorization=off -force-vector-interleave=1 -S 2>&1 | FileCheck %s --check-prefix=SC_SVE
+; RUN: opt < %s -passes=loop-vectorize -force-streaming-compatible -enable-fixedwidth-autovec-in-streaming-mode -mattr=+sve -force-target-instruction-cost=1 -scalable-vectorization=off -force-vector-interleave=1 -S 2>&1 | FileCheck %s --check-prefix=SC_SVE
; RUN: opt < %s -passes=loop-vectorize -mattr=+sve -force-target-instruction-cost=1 -scalable-vectorization=off -force-vector-interleave=1 -S 2>&1 | FileCheck %s --check-prefix=NO_SC_SVE
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll
index c3e30f1..e796e40 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll
@@ -592,7 +592,41 @@ define dso_local i32 @predicated_test(i32 noundef %0, ptr %glob) #0 {
ret i32 0
}
+; This has a maximum trip count of 4. The codegen is currently much better with <8 x half> vectorization.
+; CHECK-LABEL: arm_q15_to_f16_remainder
+; CHECK: LV: Selecting VF: 8
+define void @arm_q15_to_f16_remainder(ptr nocapture noundef readonly %pSrc, ptr nocapture noundef writeonly noalias %pDst, i32 noundef %blockSize) #0 {
+entry:
+ %rem = and i32 %blockSize, 3
+ %cmp.not5 = icmp eq i32 %rem, 0
+ br i1 %cmp.not5, label %while.end, label %while.body.preheader
+
+while.body.preheader: ; preds = %entry
+ br label %while.body
+
+while.body: ; preds = %while.body.preheader, %while.body
+ %blkCnt.08 = phi i32 [ %dec, %while.body ], [ %rem, %while.body.preheader ]
+ %pIn.07 = phi ptr [ %incdec.ptr, %while.body ], [ %pSrc, %while.body.preheader ]
+ %pDst.addr.06 = phi ptr [ %incdec.ptr2, %while.body ], [ %pDst, %while.body.preheader ]
+ %incdec.ptr = getelementptr inbounds i8, ptr %pIn.07, i32 2
+ %0 = load i16, ptr %pIn.07, align 2
+ %conv1 = sitofp i16 %0 to half
+ %1 = fmul fast half %conv1, 0xH0200
+ %incdec.ptr2 = getelementptr inbounds i8, ptr %pDst.addr.06, i32 2
+ store half %1, ptr %pDst.addr.06, align 2
+ %dec = add nsw i32 %blkCnt.08, -1
+ %cmp.not = icmp eq i32 %dec, 0
+ br i1 %cmp.not, label %while.end.loopexit, label %while.body
+
+while.end.loopexit: ; preds = %while.body
+ br label %while.end
+
+while.end: ; preds = %while.end.loopexit, %entry
+ ret void
+}
+
+
declare void @llvm.lifetime.start.p0(i64, ptr)
declare void @llvm.lifetime.end.p0(i64, ptr)
-attributes #0 = { "target-features"="+mve" }
+attributes #0 = { "target-features"="+mve.fp" }
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization-profitability.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization-profitability.ll
index b88254e..786197b 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization-profitability.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization-profitability.ll
@@ -10,7 +10,7 @@ target datalayout = "e-m:e-i64:64-n32:64"
target triple = "powerpc64le-unknown-linux-gnu"
; Do not vectorize epilogues for loops with minsize attribute
-; CHECK-LABLE: @f1
+; CHECK-LABEL: @f1
; CHECK-NOT: vector.main.loop.iter.check
; CHECK-NOT: vec.epilog.iter.check
; CHECK-NOT: vec.epilog.ph
@@ -48,7 +48,7 @@ for.end: ; preds = %for.end.loopexit, %
}
; Do not vectorize epilogues for loops with optsize attribute
-; CHECK-LABLE: @f2
+; CHECK-LABEL: @f2
; CHECK-NOT: vector.main.loop.iter.check
; CHECK-NOT: vec.epilog.iter.check
; CHECK-NOT: vec.epilog.ph
@@ -86,7 +86,7 @@ for.end: ; preds = %for.end.loopexit, %
}
; Do not vectorize the epilogue for loops with VF less than the default -epilogue-vectorization-minimum-VF of 16.
-; CHECK-MIN-D-LABLE: @f3
+; CHECK-MIN-D-LABEL: @f3
; CHECK-MIN-D-NOT: vector.main.loop.iter.check
; CHECK-MIN-D-NOT: vec.epilog.iter.check
; CHECK-MIN-D-NOT: vec.epilog.ph
@@ -96,7 +96,7 @@ for.end: ; preds = %for.end.loopexit, %
; Specify a smaller minimum VF (via `-epilogue-vectorization-minimum-VF=4`) and
; make sure the epilogue gets vectorized in that case.
-; CHECK-MIN-D-LABLE: @f3
+; CHECK-MIN-4-LABEL: @f3
; CHECK-MIN-4: vector.main.loop.iter.check
; CHECK-MIN-4: vec.epilog.iter.check
; CHECK-MIN-4: vec.epilog.ph
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll b/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll
new file mode 100644
index 0000000..e40f51f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=loop-vectorize -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s
+
+; Make sure we do not vectorize a loop with a widened int induction.
+define void @test_wide_integer_induction(ptr noalias %a, i64 %N) {
+; CHECK-LABEL: define void @test_wide_integer_induction(
+; CHECK-SAME: ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT: store i64 [[IV]], ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv
+ store i64 %iv, ptr %arrayidx, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
+
+; Make sure we do not vectorize a loop with a widened ptr induction.
+define void @test_wide_ptr_induction(ptr noalias %a, ptr noalias %b, i64 %N) {
+; CHECK-LABEL: define void @test_wide_ptr_induction(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ADDR:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[VECTOR_BODY]] ], [ [[B]], [[VECTOR_PH]] ]
+; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[ADDR]], i64 8
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[EVL_BASED_IV]]
+; CHECK-NEXT: store ptr [[ADDR]], ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw nsw i64 [[EVL_BASED_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %addr = phi ptr [ %incdec.ptr, %for.body ], [ %b, %entry ]
+ %incdec.ptr = getelementptr inbounds i8, ptr %addr, i64 8
+ %arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv
+ store ptr %addr, ptr %arrayidx, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
new file mode 100644
index 0000000..a91f923
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
@@ -0,0 +1,118 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -S %s | FileCheck %s
+
+define void @test(ptr %p, i64 %a, i8 %b) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[A]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i8> poison, i8 [[B]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT1]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY1:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE8]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 3)
+; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[VEC_IND]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i64> [[BROADCAST_SPLAT]], <i64 48, i64 48, i64 48, i64 48>
+; CHECK-NEXT: [[TMP3:%.*]] = ashr <4 x i64> [[TMP2]], <i64 52, i64 52, i64 52, i64 52>
+; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP1]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT2]] to <4 x i32>
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = shl <4 x i32> [[PREDPHI]], <i32 8, i32 8, i32 8, i32 8>
+; CHECK-NEXT: [[TMP8:%.*]] = trunc <4 x i32> [[TMP7]] to <4 x i8>
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0
+; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[VECTOR_BODY:%.*]]
+; CHECK: pred.store.if:
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i8> [[TMP8]], i32 0
+; CHECK-NEXT: store i8 [[TMP10]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[VECTOR_BODY]]
+; CHECK: pred.store.continue:
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1
+; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
+; CHECK: pred.store.if3:
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i8> [[TMP8]], i32 1
+; CHECK-NEXT: store i8 [[TMP12]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]]
+; CHECK: pred.store.continue4:
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2
+; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
+; CHECK: pred.store.if5:
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i8> [[TMP8]], i32 2
+; CHECK-NEXT: store i8 [[TMP14]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]]
+; CHECK: pred.store.continue6:
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3
+; CHECK-NEXT: br i1 [[TMP15]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8]]
+; CHECK: pred.store.if7:
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i8> [[TMP8]], i32 3
+; CHECK-NEXT: store i8 [[TMP16]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]]
+; CHECK: pred.store.continue8:
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[FOR_COND:%.*]]
+; CHECK: for.cond:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY:%.*]] ]
+; CHECK-NEXT: [[ADD]] = add i32 [[IV]], 1
+; CHECK-NEXT: [[CMP_SLT:%.*]] = icmp slt i32 [[IV]], 2
+; CHECK-NEXT: [[SHL:%.*]] = shl i64 [[A]], 48
+; CHECK-NEXT: [[ASHR:%.*]] = ashr i64 [[SHL]], 52
+; CHECK-NEXT: [[TRUNC_I32:%.*]] = trunc i64 [[ASHR]] to i32
+; CHECK-NEXT: br i1 [[CMP_SLT]], label [[COND_FALSE:%.*]], label [[FOR_BODY]]
+; CHECK: cond.false:
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[B]] to i32
+; CHECK-NEXT: br label [[FOR_BODY]]
+; CHECK: for.body:
+; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[TRUNC_I32]], [[FOR_COND]] ], [ [[ZEXT]], [[COND_FALSE]] ]
+; CHECK-NEXT: [[SHL_I32:%.*]] = shl i32 [[COND]], 8
+; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[SHL_I32]] to i8
+; CHECK-NEXT: store i8 [[TRUNC]], ptr [[P]], align 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[IV]], 2
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_COND]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %iv = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %add = add i32 %iv, 1
+ %cmp.slt = icmp slt i32 %iv, 2
+ %shl = shl i64 %a, 48
+ %ashr = ashr i64 %shl, 52
+ %trunc.i32 = trunc i64 %ashr to i32
+ br i1 %cmp.slt, label %cond.false, label %for.body
+
+cond.false: ; preds = %for.cond
+ %zext = zext i8 %b to i32
+ br label %for.body
+
+for.body: ; preds = %cond.false, %for.cond
+ %cond = phi i32 [ %trunc.i32, %for.cond ], [ %zext, %cond.false ]
+ %shl.i32 = shl i32 %cond, 8
+ %trunc = trunc i32 %shl.i32 to i8
+ store i8 %trunc, ptr %p, align 1
+ %cmp = icmp slt i32 %iv, 2
+ br i1 %cmp, label %for.cond, label %exit
+
+exit: ; preds = %for.body
+ ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll
index ae01bdd..a52da79 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll
@@ -12,66 +12,18 @@
define void @gather_scatter(ptr noalias %in, ptr noalias %out, ptr noalias %index, i64 %n) {
; IF-EVL-LABEL: @gather_scatter(
; IF-EVL-NEXT: entry:
-; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 2
-; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; IF-EVL: vector.ph:
-; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 2
-; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
-; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
-; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
-; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; IF-EVL-NEXT: [[TMP11:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
-; IF-EVL-NEXT: [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP11]], zeroinitializer
-; IF-EVL-NEXT: [[TMP13:%.*]] = mul <vscale x 2 x i64> [[TMP12]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; IF-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP13]]
-; IF-EVL-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2
-; IF-EVL-NEXT: [[TMP16:%.*]] = mul i64 1, [[TMP15]]
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP16]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
-; IF-EVL: vector.body:
-; IF-EVL-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT: [[TMP17:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP18:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP17]], i32 2, i1 true)
-; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], <vscale x 2 x i64> [[VEC_IND]]
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.vp.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP20]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP18]])
-; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], <vscale x 2 x i64> [[WIDE_MASKED_GATHER]]
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 2 x float> @llvm.vp.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP21]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP18]])
-; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], <vscale x 2 x i64> [[WIDE_MASKED_GATHER]]
-; IF-EVL-NEXT: call void @llvm.vp.scatter.nxv2f32.nxv2p0(<vscale x 2 x float> [[WIDE_MASKED_GATHER2]], <vscale x 2 x ptr> align 4 [[TMP22]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP18]])
-; IF-EVL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP18]] to i64
-; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX1]], [[TMP10]]
-; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; IF-EVL-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; IF-EVL: middle.block:
-; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; IF-EVL: scalar.ph:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
; IF-EVL: for.body:
-; IF-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[INDEX]], i64 [[INDVARS_IV]]
-; IF-EVL-NEXT: [[TMP25:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8
-; IF-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[TMP25]]
-; IF-EVL-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
-; IF-EVL-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[TMP25]]
-; IF-EVL-NEXT: store float [[TMP26]], ptr [[ARRAYIDX7]], align 4
+; IF-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], i64 [[INDVARS_IV]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8
+; IF-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; IF-EVL-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT: store float [[TMP1]], ptr [[ARRAYIDX7]], align 4
; IF-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; IF-EVL: for.end:
; IF-EVL-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
new file mode 100644
index 0000000..07a1cca
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
@@ -0,0 +1,189 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=loop-vectorize -mtriple=s390x -mcpu=z14 -S %s | FileCheck %s
+
+define void @test(ptr %p, i40 %a) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr [[P:%.*]], i40 [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i40> poison, i40 [[A]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i40> [[BROADCAST_SPLATINSERT1]], <16 x i40> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE32:%.*]] ]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[INDEX]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[VEC_IV:%.*]] = add <16 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <16 x i32> [[VEC_IV]], <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
+; CHECK-NEXT: [[TMP1:%.*]] = shl <16 x i40> [[BROADCAST_SPLAT2]], <i40 24, i40 24, i40 24, i40 24, i40 24, i40 24, i40 24, i40 24, i40 24, i40 24, i40 24, i40 24, i40 24, i40 24, i40 24, i40 24>
+; CHECK-NEXT: [[TMP2:%.*]] = ashr <16 x i40> [[TMP1]], <i40 28, i40 28, i40 28, i40 28, i40 28, i40 28, i40 28, i40 28, i40 28, i40 28, i40 28, i40 28, i40 28, i40 28, i40 28, i40 28>
+; CHECK-NEXT: [[TMP3:%.*]] = trunc <16 x i40> [[TMP2]] to <16 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = trunc <16 x i32> [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <16 x i1> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ult <16 x i1> zeroinitializer, [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i1> [[TMP6]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt <16 x i1> [[TMP7]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x i1> [[TMP0]], i32 0
+; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK: pred.store.if:
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT: store i1 [[TMP10]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
+; CHECK: pred.store.continue:
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x i1> [[TMP0]], i32 1
+; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
+; CHECK: pred.store.if3:
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i1> [[TMP8]], i32 1
+; CHECK-NEXT: store i1 [[TMP12]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]]
+; CHECK: pred.store.continue4:
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x i1> [[TMP0]], i32 2
+; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
+; CHECK: pred.store.if5:
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x i1> [[TMP8]], i32 2
+; CHECK-NEXT: store i1 [[TMP14]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]]
+; CHECK: pred.store.continue6:
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x i1> [[TMP0]], i32 3
+; CHECK-NEXT: br i1 [[TMP15]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
+; CHECK: pred.store.if7:
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x i1> [[TMP8]], i32 3
+; CHECK-NEXT: store i1 [[TMP16]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]]
+; CHECK: pred.store.continue8:
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP0]], i32 4
+; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
+; CHECK: pred.store.if9:
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i1> [[TMP8]], i32 4
+; CHECK-NEXT: store i1 [[TMP18]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]]
+; CHECK: pred.store.continue10:
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <16 x i1> [[TMP0]], i32 5
+; CHECK-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
+; CHECK: pred.store.if11:
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <16 x i1> [[TMP8]], i32 5
+; CHECK-NEXT: store i1 [[TMP20]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE12]]
+; CHECK: pred.store.continue12:
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <16 x i1> [[TMP0]], i32 6
+; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]]
+; CHECK: pred.store.if13:
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <16 x i1> [[TMP8]], i32 6
+; CHECK-NEXT: store i1 [[TMP22]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE14]]
+; CHECK: pred.store.continue14:
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <16 x i1> [[TMP0]], i32 7
+; CHECK-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16:%.*]]
+; CHECK: pred.store.if15:
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <16 x i1> [[TMP8]], i32 7
+; CHECK-NEXT: store i1 [[TMP24]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE16]]
+; CHECK: pred.store.continue16:
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <16 x i1> [[TMP0]], i32 8
+; CHECK-NEXT: br i1 [[TMP25]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
+; CHECK: pred.store.if17:
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x i1> [[TMP8]], i32 8
+; CHECK-NEXT: store i1 [[TMP26]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE18]]
+; CHECK: pred.store.continue18:
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP0]], i32 9
+; CHECK-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
+; CHECK: pred.store.if19:
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i1> [[TMP8]], i32 9
+; CHECK-NEXT: store i1 [[TMP28]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE20]]
+; CHECK: pred.store.continue20:
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x i1> [[TMP0]], i32 10
+; CHECK-NEXT: br i1 [[TMP29]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
+; CHECK: pred.store.if21:
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x i1> [[TMP8]], i32 10
+; CHECK-NEXT: store i1 [[TMP30]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE22]]
+; CHECK: pred.store.continue22:
+; CHECK-NEXT: [[TMP31:%.*]] = extractelement <16 x i1> [[TMP0]], i32 11
+; CHECK-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
+; CHECK: pred.store.if23:
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP8]], i32 11
+; CHECK-NEXT: store i1 [[TMP32]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE24]]
+; CHECK: pred.store.continue24:
+; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x i1> [[TMP0]], i32 12
+; CHECK-NEXT: br i1 [[TMP33]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
+; CHECK: pred.store.if25:
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i1> [[TMP8]], i32 12
+; CHECK-NEXT: store i1 [[TMP34]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE26]]
+; CHECK: pred.store.continue26:
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <16 x i1> [[TMP0]], i32 13
+; CHECK-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
+; CHECK: pred.store.if27:
+; CHECK-NEXT: [[TMP36:%.*]] = extractelement <16 x i1> [[TMP8]], i32 13
+; CHECK-NEXT: store i1 [[TMP36]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE28]]
+; CHECK: pred.store.continue28:
+; CHECK-NEXT: [[TMP37:%.*]] = extractelement <16 x i1> [[TMP0]], i32 14
+; CHECK-NEXT: br i1 [[TMP37]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]]
+; CHECK: pred.store.if29:
+; CHECK-NEXT: [[TMP38:%.*]] = extractelement <16 x i1> [[TMP8]], i32 14
+; CHECK-NEXT: store i1 [[TMP38]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE30]]
+; CHECK: pred.store.continue30:
+; CHECK-NEXT: [[TMP39:%.*]] = extractelement <16 x i1> [[TMP0]], i32 15
+; CHECK-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32]]
+; CHECK: pred.store.if31:
+; CHECK-NEXT: [[TMP40:%.*]] = extractelement <16 x i1> [[TMP8]], i32 15
+; CHECK-NEXT: store i1 [[TMP40]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE32]]
+; CHECK: pred.store.continue32:
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16
+; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[SHL:%.*]] = shl i40 [[A]], 24
+; CHECK-NEXT: [[ASHR:%.*]] = ashr i40 [[SHL]], 28
+; CHECK-NEXT: [[TRUNC:%.*]] = trunc i40 [[ASHR]] to i32
+; CHECK-NEXT: [[ICMP_EQ:%.*]] = icmp eq i32 [[TRUNC]], 0
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[ICMP_EQ]] to i32
+; CHECK-NEXT: [[ICMP_ULT:%.*]] = icmp ult i32 0, [[ZEXT]]
+; CHECK-NEXT: [[OR:%.*]] = or i1 [[ICMP_ULT]], true
+; CHECK-NEXT: [[ICMP_SGT:%.*]] = icmp sgt i1 [[OR]], false
+; CHECK-NEXT: store i1 [[ICMP_SGT]], ptr [[P]], align 1
+; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[IV_NEXT]], 10
+; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
+ %shl = shl i40 %a, 24
+ %ashr = ashr i40 %shl, 28
+ %trunc = trunc i40 %ashr to i32
+ %icmp.eq = icmp eq i32 %trunc, 0
+ %zext = zext i1 %icmp.eq to i32
+ %icmp.ult = icmp ult i32 0, %zext
+ %or = or i1 %icmp.ult, true
+ %icmp.sgt = icmp sgt i1 %or, false
+ store i1 %icmp.sgt, ptr %p, align 1
+ %iv.next = add i32 %iv, 1
+ %cond = icmp ult i32 %iv.next, 10
+ br i1 %cond, label %for.body, label %exit
+
+exit: ; preds = %for.body
+ ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll
index 0b16d80..3d7153e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll
@@ -88,7 +88,7 @@ loopexit:
ret void
}
-attributes #0 = { uwtable "target-cpu"="skylake" "target-features"="+sse2,+cx16,+sahf,-tbm,-avx512ifma,-sha,-gfni,-fma4,-vpclmulqdq,+prfchw,+bmi2,-cldemote,+fsgsbase,+xsavec,+popcnt,+aes,-avx512bitalg,+xsaves,-avx512er,-avx512vnni,-avx512vpopcntdq,-clwb,-avx512f,-clzero,-pku,+mmx,-lwp,-rdpid,-xop,+rdseed,-waitpkg,-sse4a,-avx512bw,+clflushopt,+xsave,-avx512vbmi2,-avx512vl,-avx512cd,+avx,-vaes,+rtm,+fma,+bmi,+rdrnd,-mwaitx,+sse4.1,+sse4.2,+avx2,-wbnoinvd,+sse,+lzcnt,+pclmul,-prefetchwt1,+f16c,+ssse3,+sgx,-shstk,+cmov,-avx512vbmi,+movbe,+xsaveopt,-avx512dq,+adx,-avx512pf,+sse3" }
+attributes #0 = { uwtable "target-cpu"="skylake" "target-features"="+sse2,+cx16,+sahf,-tbm,-avx512ifma,-sha,-gfni,-fma4,-vpclmulqdq,+prfchw,+bmi2,-cldemote,+fsgsbase,+xsavec,+popcnt,+aes,-avx512bitalg,+xsaves,-avx512vnni,-avx512vpopcntdq,-clwb,-avx512f,-clzero,-pku,+mmx,-lwp,-rdpid,-xop,+rdseed,-waitpkg,-sse4a,-avx512bw,+clflushopt,+xsave,-avx512vbmi2,-avx512vl,-avx512cd,+avx,-vaes,+rtm,+fma,+bmi,+rdrnd,-mwaitx,+sse4.1,+sse4.2,+avx2,-wbnoinvd,+sse,+lzcnt,+pclmul,+f16c,+ssse3,+sgx,-shstk,+cmov,-avx512vbmi,+movbe,+xsaveopt,-avx512dq,+adx,-avx512pf,+sse3" }
!0 = !{i32 0, i32 2147483646}
!1 = !{}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll b/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll
index 5c9fe54..743ca20 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll
@@ -118,7 +118,7 @@ L44: ; preds = %L26
ret ptr addrspace(10) null
}
-attributes #0 = { "target-cpu"="skylake-avx512" "target-features"="+xsaves,+xsavec,+prfchw,+lzcnt,+sahf,+pku,+avx512vl,+avx512bw,+avx512cd,+clwb,+clflushopt,+adx,+avx512dq,+avx512f,+bmi2,+avx2,+bmi,+fsgsbase,+f16c,+avx,+xsave,+aes,+popcnt,+movbe,+sse4.2,+sse4.1,+cx16,+fma,+ssse3,+pclmul,+sse3,-rdrnd,-rtm,-rdseed,-avx512ifma,-avx512pf,-avx512er,-sha,-prefetchwt1,-avx512vbmi,-waitpkg,-avx512vbmi2,-shstk,-gfni,-vaes,-vpclmulqdq,-avx512vnni,-avx512bitalg,-avx512vpopcntdq,-rdpid,-cldemote,-movdiri,-movdir64b,-enqcmd,-avx512vp2intersect,-serialize,-tsxldtrk,-pconfig,-amx-bf16,-amx-tile,-amx-int8,-sse4a,-xop,-lwp,-fma4,-tbm,-mwaitx,-xsaveopt,-clzero,-wbnoinvd,-avx512bf16,-ptwrite,+sse2,+mmx,+fxsr,+64bit,+cx8" }
+attributes #0 = { "target-cpu"="skylake-avx512" "target-features"="+xsaves,+xsavec,+prfchw,+lzcnt,+sahf,+pku,+avx512vl,+avx512bw,+avx512cd,+clwb,+clflushopt,+adx,+avx512dq,+avx512f,+bmi2,+avx2,+bmi,+fsgsbase,+f16c,+avx,+xsave,+aes,+popcnt,+movbe,+sse4.2,+sse4.1,+cx16,+fma,+ssse3,+pclmul,+sse3,-rdrnd,-rtm,-rdseed,-avx512ifma,-avx512pf,-sha,-avx512vbmi,-waitpkg,-avx512vbmi2,-shstk,-gfni,-vaes,-vpclmulqdq,-avx512vnni,-avx512bitalg,-avx512vpopcntdq,-rdpid,-cldemote,-movdiri,-movdir64b,-enqcmd,-avx512vp2intersect,-serialize,-tsxldtrk,-pconfig,-amx-bf16,-amx-tile,-amx-int8,-sse4a,-xop,-lwp,-fma4,-tbm,-mwaitx,-xsaveopt,-clzero,-wbnoinvd,-avx512bf16,-ptwrite,+sse2,+mmx,+fxsr,+64bit,+cx8" }
attributes #1 = { inaccessiblemem_or_argmemonly }
attributes #2 = { allocsize(1) }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
index bf2b9e2..ce460f4 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
@@ -111,4 +111,4 @@ for.body: ; preds = %for.body.preheader,
br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit99
}
-attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="knl" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512er,+avx512f,+avx512pf,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prefetchwt1,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="broadwell" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512f,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+evex512,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-vzeroupper" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/tailcall-nonunique.ll b/llvm/test/Transforms/MemProfContextDisambiguation/tailcall-nonunique.ll
index b49c9a1..985c381 100644
--- a/llvm/test/Transforms/MemProfContextDisambiguation/tailcall-nonunique.ll
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/tailcall-nonunique.ll
@@ -9,10 +9,7 @@
; RUN: -stats -debug %s -S 2>&1 | FileCheck %s --check-prefix=STATS \
; RUN: --check-prefix=IR --check-prefix=DEBUG
-; DEBUG: Not found through unique tail call chain: _Z3barv from main that actually called _Z3foob (found multiple possible chains)
-; DEBUG: Not found through unique tail call chain: _Z3barv from main that actually called _Z3foob (found multiple possible chains)
-; DEBUG: Not found through unique tail call chain: _Z3barv from main that actually called _Z3foob (found multiple possible chains)
-; DEBUG: Not found through unique tail call chain: _Z3barv from main that actually called _Z3foob (found multiple possible chains)
+; DEBUG: Not found through unique tail call chain: _Z3barv from main that actually called xyz (found multiple possible chains)
;; Check that all calls in the IR are to the original functions, leading to a
;; non-cold operator new call.
@@ -91,39 +88,37 @@ return: ; preds = %if.else, %if.then
}
; Function Attrs: noinline
-; IR-LABEL: @main()
-define dso_local i32 @main() local_unnamed_addr #0 {
-delete.end13:
+; IR-LABEL: @xyz()
+define dso_local i32 @xyz() local_unnamed_addr #0 {
; IR: call ptr @_Z3foob(i1 true)
- %call = tail call ptr @_Z3foob(i1 true), !callsite !10
+ %call = tail call ptr @_Z3foob(i1 true)
; IR: call ptr @_Z3foob(i1 true)
- %call1 = tail call ptr @_Z3foob(i1 true), !callsite !11
+ %call1 = tail call ptr @_Z3foob(i1 true)
; IR: call ptr @_Z3foob(i1 false)
- %call2 = tail call ptr @_Z3foob(i1 false), !callsite !12
+ %call2 = tail call ptr @_Z3foob(i1 false)
; IR: call ptr @_Z3foob(i1 false)
- %call3 = tail call ptr @_Z3foob(i1 false), !callsite !13
+ %call3 = tail call ptr @_Z3foob(i1 false)
+ ret i32 0
+}
+
+define dso_local i32 @main() local_unnamed_addr #0 {
+ ; IR: call i32 @xyz()
+ %call1 = tail call i32 @xyz(), !callsite !11
ret i32 0
}
; IR: attributes #[[NOTCOLD]] = { builtin allocsize(0) "memprof"="notcold" }
-; STATS: 4 memprof-context-disambiguation - Number of profiled callees found via multiple tail call chains
+; STATS: 1 memprof-context-disambiguation - Number of profiled callees found via multiple tail call chains
attributes #0 = { noinline }
attributes #1 = { nobuiltin allocsize(0) }
attributes #2 = { builtin allocsize(0) }
-!0 = !{!1, !3, !5, !7}
-!1 = !{!2, !"notcold"}
-!2 = !{i64 3186456655321080972, i64 6307901912192269588}
-!3 = !{!4, !"cold"}
-!4 = !{i64 3186456655321080972, i64 6792096022461663180}
+!0 = !{!5, !7}
!5 = !{!6, !"notcold"}
!6 = !{i64 3186456655321080972, i64 8632435727821051414}
!7 = !{!8, !"cold"}
!8 = !{i64 3186456655321080972, i64 -3421689549917153178}
!9 = !{i64 3186456655321080972}
-!10 = !{i64 8632435727821051414}
!11 = !{i64 -3421689549917153178}
-!12 = !{i64 6307901912192269588}
-!13 = !{i64 6792096022461663180}
diff --git a/llvm/test/Transforms/OpenMP/custom_state_machines.ll b/llvm/test/Transforms/OpenMP/custom_state_machines.ll
index 34a68a3..e6ddf16 100644
--- a/llvm/test/Transforms/OpenMP/custom_state_machines.ll
+++ b/llvm/test/Transforms/OpenMP/custom_state_machines.ll
@@ -8,7 +8,7 @@
;; void p1(void);
;; int unknown(void);
;; void unknown_pure(void) __attribute__((pure));
-;; void unknown_no_openmp(void) __attribute__((assume("omp_no_openmp")));
+;; [[omp::assume("omp_no_openmp")]] void unknown_no_openmp(void);
;;
;; int G;
;; void no_parallel_region_in_here(void) {
diff --git a/llvm/test/Transforms/OpenMP/custom_state_machines_pre_lto.ll b/llvm/test/Transforms/OpenMP/custom_state_machines_pre_lto.ll
index 85d495f..d20821d 100644
--- a/llvm/test/Transforms/OpenMP/custom_state_machines_pre_lto.ll
+++ b/llvm/test/Transforms/OpenMP/custom_state_machines_pre_lto.ll
@@ -10,7 +10,7 @@
;; void p1(void);
;; int unknown(void);
;; void unknown_pure(void) __attribute__((pure));
-;; void unknown_no_openmp(void) __attribute__((assume("omp_no_openmp")));
+;; [[omp::assume("omp_no_openmp")]] void unknown_no_openmp(void);
;;
;; int G;
;; void no_parallel_region_in_here(void) {
diff --git a/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll b/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll
index f8c4e6b..f7bfd30 100644
--- a/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll
+++ b/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll
@@ -1,10 +1,10 @@
; RUN: opt -passes=openmp-opt -pass-remarks=openmp-opt -pass-remarks-missed=openmp-opt -pass-remarks-analysis=openmp-opt -disable-output < %s 2>&1 | FileCheck %s
target triple = "nvptx64"
-; CHECK: remark: llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c:11:1: Generic-mode kernel is executed with a customized state machine that requires a fallback.
-; CHECK: remark: llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c:13:5: Call may contain unknown parallel regions. Use `__attribute__((assume("omp_no_parallelism")))` to override.
-; CHECK: remark: llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c:15:5: Call may contain unknown parallel regions. Use `__attribute__((assume("omp_no_parallelism")))` to override.
-; CHECK: remark: llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c:20:1: Rewriting generic-mode kernel with a customized state machine.
+; CHECK{LITERAL}: remark: llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c:11:1: Generic-mode kernel is executed with a customized state machine that requires a fallback.
+; CHECK{LITERAL}: remark: llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c:13:5: Call may contain unknown parallel regions. Use `[[omp::assume("omp_no_parallelism")]]` to override.
+; CHECK{LITERAL}: remark: llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c:15:5: Call may contain unknown parallel regions. Use `[[omp::assume("omp_no_parallelism")]]` to override.
+; CHECK{LITERAL}: remark: llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c:20:1: Rewriting generic-mode kernel with a customized state machine.
;; void unknown(void);
@@ -24,7 +24,7 @@ target triple = "nvptx64"
;; }
;; }
;;
-;; void no_openmp(void) __attribute__((assume("omp_no_openmp")));
+;; [[omp::assume("omp_no_openmp")]] void no_openmp(void);
;; void test_no_fallback(void) {
;; #pragma omp target teams
;; {
diff --git a/llvm/test/Transforms/OpenMP/spmdization.ll b/llvm/test/Transforms/OpenMP/spmdization.ll
index 159280a..3939689 100644
--- a/llvm/test/Transforms/OpenMP/spmdization.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization.ll
@@ -7,7 +7,7 @@
; RUN: opt --mtriple=nvptx64-- -S -passes=openmp-opt-postlink < %s | FileCheck %s --check-prefix=NVPTX-DISABLED2
;; void unknown(void);
-;; void spmd_amenable(void) __attribute__((assume("ompx_spmd_amenable")));
+;; [[omp::assume("ompx_spmd_amenable")]] void spmd_amenable(void);
;;
;; void sequential_loop() {
;; #pragma omp target teams
@@ -22,7 +22,7 @@
;; }
;; }
;;
-;; void use(__attribute__((noescape)) int *) __attribute__((assume("ompx_spmd_amenable")));
+;; [[omp::assume("ompx_spmd_amenable")]] void use(__attribute__((noescape)) int *);
;;
;; void sequential_loop_to_stack_var() {
;; #pragma omp target teams
diff --git a/llvm/test/Transforms/OpenMP/spmdization_guarding.ll b/llvm/test/Transforms/OpenMP/spmdization_guarding.ll
index b2e14dc..bd128b7 100644
--- a/llvm/test/Transforms/OpenMP/spmdization_guarding.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization_guarding.ll
@@ -2,8 +2,8 @@
; RUN: opt -S -passes=openmp-opt < %s | FileCheck %s
; RUN: opt -S -passes=openmp-opt -openmp-opt-disable-spmdization < %s | FileCheck %s --check-prefix=CHECK-DISABLED
;
-; void pure(void) __attribute__((pure, assume("ompx_spmd_amenable")));
-; int no_openmp(int *) __attribute__((assume("omp_no_openmp","ompx_spmd_amenable")));
+; [[omp::assume("ompx_spmd_amenable")]] void pure(void) __attribute__((pure));
+; [[omp::assume("omp_no_openmp","ompx_spmd_amenable")]] int no_openmp(int *);
;
; void sequential_loop(int *x, int N) {
; #pragma omp target teams
diff --git a/llvm/test/Transforms/OpenMP/spmdization_remarks.ll b/llvm/test/Transforms/OpenMP/spmdization_remarks.ll
index 28df2f5..f5a4cea 100644
--- a/llvm/test/Transforms/OpenMP/spmdization_remarks.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization_remarks.ll
@@ -1,12 +1,12 @@
; RUN: opt -passes=openmp-opt -pass-remarks=openmp-opt -pass-remarks-missed=openmp-opt -pass-remarks-analysis=openmp-opt -disable-output < %s 2>&1 | FileCheck %s
target triple = "nvptx64"
-; CHECK: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:13:5: Value has potential side effects preventing SPMD-mode execution. Add `__attribute__((assume("ompx_spmd_amenable")))` to the called function to override.
-; CHECK: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:15:5: Value has potential side effects preventing SPMD-mode execution. Add `__attribute__((assume("ompx_spmd_amenable")))` to the called function to override.
-; CHECK: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:11:1: Generic-mode kernel is executed with a customized state machine that requires a fallback.
-; CHECK: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:13:5: Call may contain unknown parallel regions. Use `__attribute__((assume("omp_no_parallelism")))` to override.
-; CHECK: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:15:5: Call may contain unknown parallel regions. Use `__attribute__((assume("omp_no_parallelism")))` to override.
-; CHECK: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:20:1: Transformed generic-mode kernel to SPMD-mode.
+; CHECK{LITERAL}: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:13:5: Value has potential side effects preventing SPMD-mode execution. Add `[[omp::assume("ompx_spmd_amenable")]]` to the called function to override.
+; CHECK{LITERAL}: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:15:5: Value has potential side effects preventing SPMD-mode execution. Add `[[omp::assume("ompx_spmd_amenable")]]` to the called function to override.
+; CHECK{LITERAL}: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:11:1: Generic-mode kernel is executed with a customized state machine that requires a fallback.
+; CHECK{LITERAL}: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:13:5: Call may contain unknown parallel regions. Use `[[omp::assume("omp_no_parallelism")]]` to override.
+; CHECK{LITERAL}: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:15:5: Call may contain unknown parallel regions. Use `[[omp::assume("omp_no_parallelism")]]` to override.
+; CHECK{LITERAL}: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:20:1: Transformed generic-mode kernel to SPMD-mode.
;; void unknown(void);
@@ -26,7 +26,7 @@ target triple = "nvptx64"
;; }
;; }
;;
-;; void no_openmp(void) __attribute__((assume("omp_no_openmp")));
+;; void no_openmp(void) [[omp::assume("omp_no_openmp")]];
;; void test_no_fallback(void) {
;; #pragma omp target teams
;; {
diff --git a/llvm/test/Transforms/SCCP/ip-add-range-to-call.ll b/llvm/test/Transforms/SCCP/ip-add-range-to-call.ll
index c24c554..91efbcc 100644
--- a/llvm/test/Transforms/SCCP/ip-add-range-to-call.ll
+++ b/llvm/test/Transforms/SCCP/ip-add-range-to-call.ll
@@ -159,7 +159,7 @@ exit:
}
define i32 @caller5() {
-; CHECK-LABEL: define range(i32 200, 401) i32 @caller5() {
+; CHECK-LABEL: define i32 @caller5() {
; CHECK-NEXT: [[C1:%.*]] = call i32 @callee5(i32 10, i32 100)
; CHECK-NEXT: [[C2:%.*]] = call i32 @callee5(i32 20, i32 200)
; CHECK-NEXT: [[A:%.*]] = add i32 [[C1]], [[C2]]
diff --git a/llvm/test/Transforms/SCCP/range-mul-nuw-nsw-flags.ll b/llvm/test/Transforms/SCCP/range-mul-nuw-nsw-flags.ll
new file mode 100644
index 0000000..8525264
--- /dev/null
+++ b/llvm/test/Transforms/SCCP/range-mul-nuw-nsw-flags.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=ipsccp -S %s | FileCheck %s
+
+define i1 @range_from_mul_nuw_nsw(i32 %a) {
+; CHECK-LABEL: @range_from_mul_nuw_nsw(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[A:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK: then:
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[A]], 10000
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[MUL]], -5000
+; CHECK-NEXT: ret i1 false
+; CHECK: else:
+; CHECK-NEXT: ret i1 false
+;
+entry:
+ %cmp = icmp ne i32 %a, 0
+ br i1 %cmp, label %then, label %else
+then:
+ %mul = mul nuw nsw i32 %a, 10000 ; Refined range via mul_nuw: [10000, 0)
+ %add = add nsw i32 %mul, -5000 ; Range: [5000, UINT_MAX - 5000 + 1)
+ %cond = icmp ult i32 %add, 4999
+ ret i1 %cond
+else:
+ ret i1 0
+}
diff --git a/llvm/test/Transforms/SCCP/range-with-undef.ll b/llvm/test/Transforms/SCCP/range-with-undef.ll
new file mode 100644
index 0000000..9b8d415
--- /dev/null
+++ b/llvm/test/Transforms/SCCP/range-with-undef.ll
@@ -0,0 +1,118 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=ipsccp < %s | FileCheck %s
+
+; Make sure that constant ranges including undef are propagated correctly.
+
+define i8 @test_binop(i1 %cond, i8 %a) {
+; CHECK-LABEL: define i8 @test_binop(
+; CHECK-SAME: i1 [[COND:%.*]], i8 [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 [[COND]], label %[[IF:.*]], label %[[JOIN:.*]]
+; CHECK: [[IF]]:
+; CHECK-NEXT: [[A_EXT:%.*]] = zext i8 [[A]] to i16
+; CHECK-NEXT: br label %[[JOIN]]
+; CHECK: [[JOIN]]:
+; CHECK-NEXT: [[PHI:%.*]] = phi i16 [ undef, %[[ENTRY]] ], [ [[A_EXT]], %[[IF]] ]
+; CHECK-NEXT: [[AND:%.*]] = and i16 [[PHI]], -1
+; CHECK-NEXT: [[TRUNC:%.*]] = trunc i16 [[AND]] to i8
+; CHECK-NEXT: ret i8 [[TRUNC]]
+;
+entry:
+ br i1 %cond, label %if, label %join
+
+if:
+ %a.ext = zext i8 %a to i16
+ br label %join
+
+join:
+ %phi = phi i16 [ undef, %entry ], [ %a.ext, %if ]
+ %and = and i16 %phi, u0x0000ffff
+ %trunc = trunc i16 %and to i8
+ ret i8 %trunc
+}
+
+define i8 @test_cast(i1 %cond, i8 %a) {
+; CHECK-LABEL: define i8 @test_cast(
+; CHECK-SAME: i1 [[COND:%.*]], i8 [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 [[COND]], label %[[IF:.*]], label %[[JOIN:.*]]
+; CHECK: [[IF]]:
+; CHECK-NEXT: [[A_EXT:%.*]] = zext i8 [[A]] to i16
+; CHECK-NEXT: br label %[[JOIN]]
+; CHECK: [[JOIN]]:
+; CHECK-NEXT: [[PHI:%.*]] = phi i16 [ undef, %[[ENTRY]] ], [ [[A_EXT]], %[[IF]] ]
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i16 [[PHI]] to i32
+; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[ZEXT]] to i8
+; CHECK-NEXT: ret i8 [[TRUNC]]
+;
+entry:
+ br i1 %cond, label %if, label %join
+
+if:
+ %a.ext = zext i8 %a to i16
+ br label %join
+
+join:
+ %phi = phi i16 [ undef, %entry ], [ %a.ext, %if ]
+ %zext = zext i16 %phi to i32
+ %trunc = trunc i32 %zext to i8
+ ret i8 %trunc
+}
+
+define i8 @test_intrin(i1 %cond, i8 %a) {
+; CHECK-LABEL: define i8 @test_intrin(
+; CHECK-SAME: i1 [[COND:%.*]], i8 [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 [[COND]], label %[[IF:.*]], label %[[JOIN:.*]]
+; CHECK: [[IF]]:
+; CHECK-NEXT: [[A_EXT:%.*]] = zext i8 [[A]] to i16
+; CHECK-NEXT: br label %[[JOIN]]
+; CHECK: [[JOIN]]:
+; CHECK-NEXT: [[PHI:%.*]] = phi i16 [ undef, %[[ENTRY]] ], [ [[A_EXT]], %[[IF]] ]
+; CHECK-NEXT: [[UMAX:%.*]] = call i16 @llvm.umax.i16(i16 [[PHI]], i16 42)
+; CHECK-NEXT: [[TRUNC:%.*]] = trunc i16 [[UMAX]] to i8
+; CHECK-NEXT: ret i8 [[TRUNC]]
+;
+entry:
+ br i1 %cond, label %if, label %join
+
+if:
+ %a.ext = zext i8 %a to i16
+ br label %join
+
+join:
+ %phi = phi i16 [ undef, %entry ], [ %a.ext, %if ]
+ %umax = call i16 @llvm.umax(i16 %phi, i16 42)
+ %trunc = trunc i16 %umax to i8
+ ret i8 %trunc
+}
+
+define i9 @test_with_overflow(i1 %cond, i8 %a) {
+; CHECK-LABEL: define i9 @test_with_overflow(
+; CHECK-SAME: i1 [[COND:%.*]], i8 [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 [[COND]], label %[[IF:.*]], label %[[JOIN:.*]]
+; CHECK: [[IF]]:
+; CHECK-NEXT: [[A_EXT:%.*]] = zext i8 [[A]] to i16
+; CHECK-NEXT: br label %[[JOIN]]
+; CHECK: [[JOIN]]:
+; CHECK-NEXT: [[PHI:%.*]] = phi i16 [ undef, %[[ENTRY]] ], [ [[A_EXT]], %[[IF]] ]
+; CHECK-NEXT: [[WO:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[PHI]], i16 1)
+; CHECK-NEXT: [[ADD:%.*]] = extractvalue { i16, i1 } [[WO]], 0
+; CHECK-NEXT: [[TRUNC:%.*]] = trunc i16 [[ADD]] to i9
+; CHECK-NEXT: ret i9 [[TRUNC]]
+;
+entry:
+ br i1 %cond, label %if, label %join
+
+if:
+ %a.ext = zext i8 %a to i16
+ br label %join
+
+join:
+ %phi = phi i16 [ undef, %entry ], [ %a.ext, %if ]
+ %wo = call {i16, i1} @llvm.uadd.with.overflow(i16 %phi, i16 1)
+ %add = extractvalue {i16, i1} %wo, 0
+ %trunc = trunc i16 %add to i9
+ ret i9 %trunc
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
index 2905601..3749bdf 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX7 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX9 %s
define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
; GFX7-LABEL: @uadd_sat_v2i16(
@@ -21,6 +21,11 @@ define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
;
+; GFX9-LABEL: @uadd_sat_v2i16(
+; GFX9-NEXT: bb:
+; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX9-NEXT: ret <2 x i16> [[TMP0]]
+;
bb:
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
@@ -51,6 +56,11 @@ define <2 x i16> @usub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
;
+; GFX9-LABEL: @usub_sat_v2i16(
+; GFX9-NEXT: bb:
+; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX9-NEXT: ret <2 x i16> [[TMP0]]
+;
bb:
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
@@ -81,6 +91,11 @@ define <2 x i16> @sadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
;
+; GFX9-LABEL: @sadd_sat_v2i16(
+; GFX9-NEXT: bb:
+; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX9-NEXT: ret <2 x i16> [[TMP0]]
+;
bb:
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
@@ -111,6 +126,11 @@ define <2 x i16> @ssub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
;
+; GFX9-LABEL: @ssub_sat_v2i16(
+; GFX9-NEXT: bb:
+; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX9-NEXT: ret <2 x i16> [[TMP0]]
+;
bb:
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
@@ -252,6 +272,18 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
; GFX8-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
; GFX8-NEXT: ret <3 x i16> [[INS_2]]
;
+; GFX9-LABEL: @uadd_sat_v3i16(
+; GFX9-NEXT: bb:
+; GFX9-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
+; GFX9-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
+; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+; GFX9-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
+; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; GFX9-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
+; GFX9-NEXT: ret <3 x i16> [[INS_2]]
+;
bb:
%arg0.0 = extractelement <3 x i16> %arg0, i64 0
%arg0.1 = extractelement <3 x i16> %arg0, i64 1
@@ -291,19 +323,25 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
;
; GFX8-LABEL: @uadd_sat_v4i16(
; GFX8-NEXT: bb:
-; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <4 x i16> [[ARG0:%.*]], i64 2
-; GFX8-NEXT: [[ARG0_3:%.*]] = extractelement <4 x i16> [[ARG0]], i64 3
-; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <4 x i16> [[ARG1:%.*]], i64 2
-; GFX8-NEXT: [[ARG1_3:%.*]] = extractelement <4 x i16> [[ARG1]], i64 3
-; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-; GFX8-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
-; GFX8-NEXT: [[ADD_3:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_3]], i16 [[ARG1_3]])
-; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; GFX8-NEXT: [[INS_2:%.*]] = insertelement <4 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
-; GFX8-NEXT: [[INS_3:%.*]] = insertelement <4 x i16> [[INS_2]], i16 [[ADD_3]], i64 3
-; GFX8-NEXT: ret <4 x i16> [[INS_3]]
+; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
+; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX8-NEXT: ret <4 x i16> [[INS_31]]
+;
+; GFX9-LABEL: @uadd_sat_v4i16(
+; GFX9-NEXT: bb:
+; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
+; GFX9-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX9-NEXT: ret <4 x i16> [[INS_31]]
;
bb:
%arg0.0 = extractelement <4 x i16> %arg0, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
index 2038400..0bb6413 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX7 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX9 %s
define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
; GFX7-LABEL: @uadd_sat_v2i16(
@@ -21,6 +21,11 @@ define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
;
+; GFX9-LABEL: @uadd_sat_v2i16(
+; GFX9-NEXT: bb:
+; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX9-NEXT: ret <2 x i16> [[TMP0]]
+;
bb:
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
@@ -51,6 +56,11 @@ define <2 x i16> @usub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
;
+; GFX9-LABEL: @usub_sat_v2i16(
+; GFX9-NEXT: bb:
+; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX9-NEXT: ret <2 x i16> [[TMP0]]
+;
bb:
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
@@ -81,6 +91,11 @@ define <2 x i16> @sadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
;
+; GFX9-LABEL: @sadd_sat_v2i16(
+; GFX9-NEXT: bb:
+; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX9-NEXT: ret <2 x i16> [[TMP0]]
+;
bb:
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
@@ -111,6 +126,11 @@ define <2 x i16> @ssub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
;
+; GFX9-LABEL: @ssub_sat_v2i16(
+; GFX9-NEXT: bb:
+; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX9-NEXT: ret <2 x i16> [[TMP0]]
+;
bb:
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
@@ -252,6 +272,18 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
; GFX8-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
; GFX8-NEXT: ret <3 x i16> [[INS_2]]
;
+; GFX9-LABEL: @uadd_sat_v3i16(
+; GFX9-NEXT: bb:
+; GFX9-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
+; GFX9-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
+; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+; GFX9-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
+; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; GFX9-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
+; GFX9-NEXT: ret <3 x i16> [[INS_2]]
+;
bb:
%arg0.0 = extractelement <3 x i16> %arg0, i64 0
%arg0.1 = extractelement <3 x i16> %arg0, i64 1
@@ -291,19 +323,25 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
;
; GFX8-LABEL: @uadd_sat_v4i16(
; GFX8-NEXT: bb:
-; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <4 x i16> [[ARG0:%.*]], i64 2
-; GFX8-NEXT: [[ARG0_3:%.*]] = extractelement <4 x i16> [[ARG0]], i64 3
-; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <4 x i16> [[ARG1:%.*]], i64 2
-; GFX8-NEXT: [[ARG1_3:%.*]] = extractelement <4 x i16> [[ARG1]], i64 3
-; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-; GFX8-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
-; GFX8-NEXT: [[ADD_3:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_3]], i16 [[ARG1_3]])
-; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; GFX8-NEXT: [[INS_2:%.*]] = insertelement <4 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
-; GFX8-NEXT: [[INS_3:%.*]] = insertelement <4 x i16> [[INS_2]], i16 [[ADD_3]], i64 3
-; GFX8-NEXT: ret <4 x i16> [[INS_3]]
+; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
+; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX8-NEXT: ret <4 x i16> [[INS_31]]
+;
+; GFX9-LABEL: @uadd_sat_v4i16(
+; GFX9-NEXT: bb:
+; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
+; GFX9-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX9-NEXT: ret <4 x i16> [[INS_31]]
;
bb:
%arg0.0 = extractelement <4 x i16> %arg0, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll
index 0a020c8..e2d25ba 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll
@@ -4,15 +4,10 @@
define <2 x i16> @uadd_sat_v9i16_combine_vi16(<9 x i16> %arg0, <9 x i16> %arg1) {
; CHECK-LABEL: @uadd_sat_v9i16_combine_vi16(
; CHECK-NEXT: bb:
-; CHECK-NEXT: [[ARG0_1:%.*]] = extractelement <9 x i16> undef, i64 7
-; CHECK-NEXT: [[ARG0_2:%.*]] = extractelement <9 x i16> [[ARG0:%.*]], i64 8
-; CHECK-NEXT: [[ARG1_1:%.*]] = extractelement <9 x i16> [[ARG1:%.*]], i64 7
-; CHECK-NEXT: [[ARG1_2:%.*]] = extractelement <9 x i16> [[ARG1]], i64 8
-; CHECK-NEXT: [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
-; CHECK-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
-; CHECK-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_1]], i64 0
-; CHECK-NEXT: [[INS_2:%.*]] = insertelement <2 x i16> [[INS_1]], i16 [[ADD_2]], i64 1
-; CHECK-NEXT: ret <2 x i16> [[INS_2]]
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <9 x i16> undef, <9 x i16> [[ARG0:%.*]], <2 x i32> <i32 0, i32 17>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <9 x i16> [[ARG1:%.*]], <9 x i16> poison, <2 x i32> <i32 7, i32 8>
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+; CHECK-NEXT: ret <2 x i16> [[TMP2]]
;
bb:
%arg0.1 = extractelement <9 x i16> undef, i64 7
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll
index 46980b33..3b63c1e 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll
@@ -4,23 +4,20 @@
define <4 x half> @phis(i1 %cmp1, <4 x half> %in1, <4 x half> %in2) {
; CHECK-LABEL: @phis(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x half> [[IN1:%.*]], i64 2
-; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x half> [[IN1]], i64 3
-; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x half> [[IN1]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x half> [[IN1:%.*]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[IN1]], <4 x half> poison, <2 x i32> <i32 2, i32 3>
; CHECK-NEXT: br i1 [[CMP1:%.*]], label [[BB1:%.*]], label [[BB0:%.*]]
; CHECK: bb0:
-; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x half> [[IN2:%.*]], i64 2
-; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x half> [[IN2]], i64 3
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[IN2]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x half> [[IN2:%.*]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x half> [[IN2]], <4 x half> poison, <2 x i32> <i32 2, i32 3>
; CHECK-NEXT: br label [[BB1]]
; CHECK: bb1:
-; CHECK-NEXT: [[C2:%.*]] = phi half [ [[A2]], [[ENTRY:%.*]] ], [ [[B2]], [[BB0]] ]
-; CHECK-NEXT: [[C3:%.*]] = phi half [ [[A3]], [[ENTRY]] ], [ [[B3]], [[BB0]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x half> [ [[TMP0]], [[ENTRY]] ], [ [[TMP1]], [[BB0]] ]
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x half> [[TMP2]], <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[O2:%.*]] = insertelement <4 x half> [[TMP3]], half [[C2]], i64 2
-; CHECK-NEXT: [[O3:%.*]] = insertelement <4 x half> [[O2]], half [[C3]], i64 3
-; CHECK-NEXT: ret <4 x half> [[O3]]
+; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x half> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP2]], [[BB0]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x half> [ [[TMP1]], [[ENTRY]] ], [ [[TMP3]], [[BB0]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x half> [[TMP4]], <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x half> [[TMP5]], <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x half> [[TMP4]], <2 x half> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: ret <4 x half> [[TMP8]]
;
entry:
%a0 = extractelement <4 x half> %in1, i64 0
@@ -52,23 +49,20 @@ bb1:
define <4 x half> @phis_reverse(i1 %cmp1, <4 x half> %in1, <4 x half> %in2) {
; CHECK-LABEL: @phis_reverse(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x half> [[IN1:%.*]], i64 2
-; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x half> [[IN1]], i64 3
-; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x half> [[IN1]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x half> [[IN1:%.*]], <4 x half> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[IN1]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
; CHECK-NEXT: br i1 [[CMP1:%.*]], label [[BB1:%.*]], label [[BB0:%.*]]
; CHECK: bb0:
-; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x half> [[IN2:%.*]], i64 2
-; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x half> [[IN2]], i64 3
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[IN2]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x half> [[IN2:%.*]], <4 x half> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x half> [[IN2]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
; CHECK-NEXT: br label [[BB1]]
; CHECK: bb1:
-; CHECK-NEXT: [[C3:%.*]] = phi half [ [[A3]], [[ENTRY:%.*]] ], [ [[B3]], [[BB0]] ]
-; CHECK-NEXT: [[C2:%.*]] = phi half [ [[A2]], [[ENTRY]] ], [ [[B2]], [[BB0]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x half> [ [[TMP0]], [[ENTRY]] ], [ [[TMP1]], [[BB0]] ]
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x half> [[TMP2]], <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[O2:%.*]] = insertelement <4 x half> [[TMP3]], half [[C2]], i64 2
-; CHECK-NEXT: [[O3:%.*]] = insertelement <4 x half> [[O2]], half [[C3]], i64 3
-; CHECK-NEXT: ret <4 x half> [[O3]]
+; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x half> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP2]], [[BB0]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x half> [ [[TMP1]], [[ENTRY]] ], [ [[TMP3]], [[BB0]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x half> [[TMP5]], <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x half> [[TMP4]], <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x half> [[TMP6]], <4 x half> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x half> [[TMP8]]
;
entry:
%a0 = extractelement <4 x half> %in1, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll
index b34b9a3..aceee88 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll
@@ -3,21 +3,10 @@
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer,dce < %s | FileCheck -check-prefixes=GCN,VI %s
define half @reduction_half4(<4 x half> %a) {
-; GFX9-LABEL: @reduction_half4(
-; GFX9-NEXT: entry:
-; GFX9-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH8000, <4 x half> [[A:%.*]])
-; GFX9-NEXT: ret half [[TMP0]]
-;
-; VI-LABEL: @reduction_half4(
-; VI-NEXT: entry:
-; VI-NEXT: [[ELT0:%.*]] = extractelement <4 x half> [[A:%.*]], i64 0
-; VI-NEXT: [[ELT1:%.*]] = extractelement <4 x half> [[A]], i64 1
-; VI-NEXT: [[ELT2:%.*]] = extractelement <4 x half> [[A]], i64 2
-; VI-NEXT: [[ELT3:%.*]] = extractelement <4 x half> [[A]], i64 3
-; VI-NEXT: [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
-; VI-NEXT: [[ADD2:%.*]] = fadd fast half [[ELT2]], [[ADD1]]
-; VI-NEXT: [[ADD3:%.*]] = fadd fast half [[ELT3]], [[ADD2]]
-; VI-NEXT: ret half [[ADD3]]
+; GCN-LABEL: @reduction_half4(
+; GCN-NEXT: entry:
+; GCN-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH8000, <4 x half> [[A:%.*]])
+; GCN-NEXT: ret half [[TMP0]]
;
entry:
%elt0 = extractelement <4 x half> %a, i64 0
@@ -33,29 +22,10 @@ entry:
}
define half @reduction_half8(<8 x half> %vec8) {
-; GFX9-LABEL: @reduction_half8(
-; GFX9-NEXT: entry:
-; GFX9-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[VEC8:%.*]])
-; GFX9-NEXT: ret half [[TMP0]]
-;
-; VI-LABEL: @reduction_half8(
-; VI-NEXT: entry:
-; VI-NEXT: [[ELT0:%.*]] = extractelement <8 x half> [[VEC8:%.*]], i64 0
-; VI-NEXT: [[ELT1:%.*]] = extractelement <8 x half> [[VEC8]], i64 1
-; VI-NEXT: [[ELT2:%.*]] = extractelement <8 x half> [[VEC8]], i64 2
-; VI-NEXT: [[ELT3:%.*]] = extractelement <8 x half> [[VEC8]], i64 3
-; VI-NEXT: [[ELT4:%.*]] = extractelement <8 x half> [[VEC8]], i64 4
-; VI-NEXT: [[ELT5:%.*]] = extractelement <8 x half> [[VEC8]], i64 5
-; VI-NEXT: [[ELT6:%.*]] = extractelement <8 x half> [[VEC8]], i64 6
-; VI-NEXT: [[ELT7:%.*]] = extractelement <8 x half> [[VEC8]], i64 7
-; VI-NEXT: [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
-; VI-NEXT: [[ADD2:%.*]] = fadd fast half [[ELT2]], [[ADD1]]
-; VI-NEXT: [[ADD3:%.*]] = fadd fast half [[ELT3]], [[ADD2]]
-; VI-NEXT: [[ADD4:%.*]] = fadd fast half [[ELT4]], [[ADD3]]
-; VI-NEXT: [[ADD5:%.*]] = fadd fast half [[ELT5]], [[ADD4]]
-; VI-NEXT: [[ADD6:%.*]] = fadd fast half [[ELT6]], [[ADD5]]
-; VI-NEXT: [[ADD7:%.*]] = fadd fast half [[ELT7]], [[ADD6]]
-; VI-NEXT: ret half [[ADD7]]
+; GCN-LABEL: @reduction_half8(
+; GCN-NEXT: entry:
+; GCN-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[VEC8:%.*]])
+; GCN-NEXT: ret half [[TMP0]]
;
entry:
%elt0 = extractelement <8 x half> %vec8, i64 0
@@ -86,38 +56,12 @@ define half @reduction_half16(<16 x half> %vec16) {
;
; VI-LABEL: @reduction_half16(
; VI-NEXT: entry:
-; VI-NEXT: [[ELT0:%.*]] = extractelement <16 x half> [[VEC16:%.*]], i64 0
-; VI-NEXT: [[ELT1:%.*]] = extractelement <16 x half> [[VEC16]], i64 1
-; VI-NEXT: [[ELT2:%.*]] = extractelement <16 x half> [[VEC16]], i64 2
-; VI-NEXT: [[ELT3:%.*]] = extractelement <16 x half> [[VEC16]], i64 3
-; VI-NEXT: [[ELT4:%.*]] = extractelement <16 x half> [[VEC16]], i64 4
-; VI-NEXT: [[ELT5:%.*]] = extractelement <16 x half> [[VEC16]], i64 5
-; VI-NEXT: [[ELT6:%.*]] = extractelement <16 x half> [[VEC16]], i64 6
-; VI-NEXT: [[ELT7:%.*]] = extractelement <16 x half> [[VEC16]], i64 7
-; VI-NEXT: [[ELT8:%.*]] = extractelement <16 x half> [[VEC16]], i64 8
-; VI-NEXT: [[ELT9:%.*]] = extractelement <16 x half> [[VEC16]], i64 9
-; VI-NEXT: [[ELT10:%.*]] = extractelement <16 x half> [[VEC16]], i64 10
-; VI-NEXT: [[ELT11:%.*]] = extractelement <16 x half> [[VEC16]], i64 11
-; VI-NEXT: [[ELT12:%.*]] = extractelement <16 x half> [[VEC16]], i64 12
-; VI-NEXT: [[ELT13:%.*]] = extractelement <16 x half> [[VEC16]], i64 13
-; VI-NEXT: [[ELT14:%.*]] = extractelement <16 x half> [[VEC16]], i64 14
-; VI-NEXT: [[ELT15:%.*]] = extractelement <16 x half> [[VEC16]], i64 15
-; VI-NEXT: [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
-; VI-NEXT: [[ADD2:%.*]] = fadd fast half [[ELT2]], [[ADD1]]
-; VI-NEXT: [[ADD3:%.*]] = fadd fast half [[ELT3]], [[ADD2]]
-; VI-NEXT: [[ADD4:%.*]] = fadd fast half [[ELT4]], [[ADD3]]
-; VI-NEXT: [[ADD5:%.*]] = fadd fast half [[ELT5]], [[ADD4]]
-; VI-NEXT: [[ADD6:%.*]] = fadd fast half [[ELT6]], [[ADD5]]
-; VI-NEXT: [[ADD7:%.*]] = fadd fast half [[ELT7]], [[ADD6]]
-; VI-NEXT: [[ADD8:%.*]] = fadd fast half [[ELT8]], [[ADD7]]
-; VI-NEXT: [[ADD9:%.*]] = fadd fast half [[ELT9]], [[ADD8]]
-; VI-NEXT: [[ADD10:%.*]] = fadd fast half [[ELT10]], [[ADD9]]
-; VI-NEXT: [[ADD11:%.*]] = fadd fast half [[ELT11]], [[ADD10]]
-; VI-NEXT: [[ADD12:%.*]] = fadd fast half [[ELT12]], [[ADD11]]
-; VI-NEXT: [[ADD13:%.*]] = fadd fast half [[ELT13]], [[ADD12]]
-; VI-NEXT: [[ADD14:%.*]] = fadd fast half [[ELT14]], [[ADD13]]
-; VI-NEXT: [[ADD15:%.*]] = fadd fast half [[ELT15]], [[ADD14]]
-; VI-NEXT: ret half [[ADD15]]
+; VI-NEXT: [[TMP0:%.*]] = shufflevector <16 x half> [[VEC16:%.*]], <16 x half> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VI-NEXT: [[TMP1:%.*]] = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[TMP0]])
+; VI-NEXT: [[TMP2:%.*]] = shufflevector <16 x half> [[VEC16]], <16 x half> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; VI-NEXT: [[TMP3:%.*]] = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[TMP2]])
+; VI-NEXT: [[OP_RDX:%.*]] = fadd fast half [[TMP1]], [[TMP3]]
+; VI-NEXT: ret half [[OP_RDX]]
;
entry:
%elt0 = extractelement <16 x half> %vec16, i64 0
@@ -183,21 +127,10 @@ entry:
}
define i16 @reduction_v4i16(<4 x i16> %a) {
-; GFX9-LABEL: @reduction_v4i16(
-; GFX9-NEXT: entry:
-; GFX9-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[A:%.*]])
-; GFX9-NEXT: ret i16 [[TMP0]]
-;
-; VI-LABEL: @reduction_v4i16(
-; VI-NEXT: entry:
-; VI-NEXT: [[ELT0:%.*]] = extractelement <4 x i16> [[A:%.*]], i64 0
-; VI-NEXT: [[ELT1:%.*]] = extractelement <4 x i16> [[A]], i64 1
-; VI-NEXT: [[ELT2:%.*]] = extractelement <4 x i16> [[A]], i64 2
-; VI-NEXT: [[ELT3:%.*]] = extractelement <4 x i16> [[A]], i64 3
-; VI-NEXT: [[ADD1:%.*]] = add i16 [[ELT1]], [[ELT0]]
-; VI-NEXT: [[ADD2:%.*]] = add i16 [[ELT2]], [[ADD1]]
-; VI-NEXT: [[ADD3:%.*]] = add i16 [[ELT3]], [[ADD2]]
-; VI-NEXT: ret i16 [[ADD3]]
+; GCN-LABEL: @reduction_v4i16(
+; GCN-NEXT: entry:
+; GCN-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[A:%.*]])
+; GCN-NEXT: ret i16 [[TMP0]]
;
entry:
%elt0 = extractelement <4 x i16> %a, i64 0
@@ -213,29 +146,10 @@ entry:
}
define i16 @reduction_v8i16(<8 x i16> %vec8) {
-; GFX9-LABEL: @reduction_v8i16(
-; GFX9-NEXT: entry:
-; GFX9-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[VEC8:%.*]])
-; GFX9-NEXT: ret i16 [[TMP0]]
-;
-; VI-LABEL: @reduction_v8i16(
-; VI-NEXT: entry:
-; VI-NEXT: [[ELT0:%.*]] = extractelement <8 x i16> [[VEC8:%.*]], i64 0
-; VI-NEXT: [[ELT1:%.*]] = extractelement <8 x i16> [[VEC8]], i64 1
-; VI-NEXT: [[ELT2:%.*]] = extractelement <8 x i16> [[VEC8]], i64 2
-; VI-NEXT: [[ELT3:%.*]] = extractelement <8 x i16> [[VEC8]], i64 3
-; VI-NEXT: [[ELT4:%.*]] = extractelement <8 x i16> [[VEC8]], i64 4
-; VI-NEXT: [[ELT5:%.*]] = extractelement <8 x i16> [[VEC8]], i64 5
-; VI-NEXT: [[ELT6:%.*]] = extractelement <8 x i16> [[VEC8]], i64 6
-; VI-NEXT: [[ELT7:%.*]] = extractelement <8 x i16> [[VEC8]], i64 7
-; VI-NEXT: [[ADD1:%.*]] = add i16 [[ELT1]], [[ELT0]]
-; VI-NEXT: [[ADD2:%.*]] = add i16 [[ELT2]], [[ADD1]]
-; VI-NEXT: [[ADD3:%.*]] = add i16 [[ELT3]], [[ADD2]]
-; VI-NEXT: [[ADD4:%.*]] = add i16 [[ELT4]], [[ADD3]]
-; VI-NEXT: [[ADD5:%.*]] = add i16 [[ELT5]], [[ADD4]]
-; VI-NEXT: [[ADD6:%.*]] = add i16 [[ELT6]], [[ADD5]]
-; VI-NEXT: [[ADD7:%.*]] = add i16 [[ELT7]], [[ADD6]]
-; VI-NEXT: ret i16 [[ADD7]]
+; GCN-LABEL: @reduction_v8i16(
+; GCN-NEXT: entry:
+; GCN-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[VEC8:%.*]])
+; GCN-NEXT: ret i16 [[TMP0]]
;
entry:
%elt0 = extractelement <8 x i16> %vec8, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
index 059e4c3..9608608 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
@@ -155,13 +155,11 @@ define <4 x float> @exp_4x(ptr %a) {
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @exp_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -173,13 +171,11 @@ define <4 x float> @exp_4x(ptr %a) {
; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_2]])
-; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
-; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
+; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -212,13 +208,11 @@ define <4 x float> @int_exp_4x(ptr %a) {
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @int_exp_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -230,13 +224,11 @@ define <4 x float> @int_exp_4x(ptr %a) {
; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_1]])
; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_2]])
-; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_3]])
-; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
+; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -269,13 +261,11 @@ define <4 x float> @log_4x(ptr %a) {
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @logf(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @logf(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @log_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -287,13 +277,11 @@ define <4 x float> @log_4x(ptr %a) {
; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]])
; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @logf(float [[VECEXT_2]])
-; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @logf(float [[VECEXT_3]])
-; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
+; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -326,13 +314,11 @@ define <4 x float> @int_log_4x(ptr %a) {
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @int_log_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -344,13 +330,11 @@ define <4 x float> @int_log_4x(ptr %a) {
; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_1]])
; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_2]])
-; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_3]])
-; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
+; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -383,13 +367,11 @@ define <4 x float> @sin_4x(ptr %a) {
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @sinf(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @sin_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -401,13 +383,11 @@ define <4 x float> @sin_4x(ptr %a) {
; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]])
; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @sinf(float [[VECEXT_2]])
-; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])
-; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
+; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -440,13 +420,11 @@ define <4 x float> @int_sin_4x(ptr %a) {
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @int_sin_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -458,13 +436,11 @@ define <4 x float> @int_sin_4x(ptr %a) {
; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]])
-; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
-; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
+; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll
new file mode 100644
index 0000000..2daa3b5
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=riscv64-unknown-linux-gnu -mattr=+v -slp-threshold=-11 < %s | FileCheck %s
+
+define <4 x i32> @test(<2 x i64> %v, ptr %p) {
+; CHECK-LABEL: define <4 x i32> @test(
+; CHECK-SAME: <2 x i64> [[V:%.*]], ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x ptr> poison, ptr [[P]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x ptr> [[TMP0]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i16, <2 x ptr> [[TMP1]], <2 x i64> [[V]]
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> [[TMP2]], i32 2, <2 x i1> <i1 true, i1 true>, <2 x i16> poison)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[TMP7:%.*]] = zext <2 x i16> [[TMP4]] to <2 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> zeroinitializer, <4 x i32> [[TMP6]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT: ret <4 x i32> [[TMP5]]
+;
+entry:
+ %0 = extractelement <2 x i64> %v, i32 1
+ %arrayidx127.2 = getelementptr i16, ptr %p, i64 %0
+ %1 = load i16, ptr %arrayidx127.2, align 2
+ %conv128.2 = zext i16 %1 to i32
+ %2 = extractelement <2 x i64> %v, i32 0
+ %arrayidx127.3 = getelementptr i16, ptr %p, i64 %2
+ %3 = load i16, ptr %arrayidx127.3, align 2
+ %conv128.3 = zext i16 %3 to i32
+ %4 = insertelement <4 x i32> zeroinitializer, i32 %conv128.2, i32 0
+ %5 = insertelement <4 x i32> %4, i32 %conv128.3, i32 1
+ ret <4 x i32> %5
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
index 6c21cc1..45ce1ee 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
@@ -51,25 +51,23 @@ define <8 x float> @ceil_floor(<8 x float> %a) {
;
; AVX-LABEL: @ceil_floor(
; AVX-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
-; AVX-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i64 1
-; AVX-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i64 2
; AVX-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
; AVX-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; AVX-NEXT: [[AB1:%.*]] = call float @llvm.floor.f32(float [[A1]])
-; AVX-NEXT: [[AB2:%.*]] = call float @llvm.floor.f32(float [[A2]])
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
+; AVX-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]])
; AVX-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP1]])
-; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP3]])
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; AVX-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
+; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
+; AVX-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
; AVX-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
-; AVX-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i64 1
-; AVX-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i64 2
-; AVX-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i64 3
-; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
-; AVX-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; AVX-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3
+; AVX-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; AVX-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
; AVX-NEXT: ret <8 x float> [[R71]]
;
; AVX2-LABEL: @ceil_floor(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
index bc5bcee..b8b284b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
@@ -51,25 +51,23 @@ define <8 x float> @ceil_floor(<8 x float> %a) {
;
; AVX-LABEL: @ceil_floor(
; AVX-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
-; AVX-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i64 1
-; AVX-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i64 2
; AVX-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
; AVX-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; AVX-NEXT: [[AB1:%.*]] = call float @llvm.floor.f32(float [[A1]])
-; AVX-NEXT: [[AB2:%.*]] = call float @llvm.floor.f32(float [[A2]])
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
+; AVX-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]])
; AVX-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP1]])
-; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP3]])
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; AVX-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
+; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
+; AVX-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
; AVX-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
-; AVX-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i64 1
-; AVX-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i64 2
-; AVX-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i64 3
-; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
-; AVX-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; AVX-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3
+; AVX-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; AVX-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
; AVX-NEXT: ret <8 x float> [[R71]]
;
; AVX2-LABEL: @ceil_floor(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
index 4f35b77..8701551 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
@@ -39,9 +39,10 @@ define <4 x i8> @h(<4 x i8> %x, <4 x i8> %y) {
define <4 x i8> @h_undef(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: @h_undef(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32> <i32 poison, i32 3, i32 5, i32 6>
-; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i8> [[TMP1]], [[TMP1]]
-; CHECK-NEXT: ret <4 x i8> [[TMP2]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> <i8 undef, i8 poison, i8 poison, i8 poison>, <4 x i32> <i32 4, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[Y:%.*]], <4 x i32> <i32 0, i32 1, i32 5, i32 6>
+; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i8> [[TMP2]], [[TMP2]]
+; CHECK-NEXT: ret <4 x i8> [[TMP3]]
;
%x0 = extractelement <4 x i8> undef, i32 0
%x3 = extractelement <4 x i8> %x, i32 3
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
index 4a9f717..b85ec5b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
;
; 128-bit vectors
@@ -213,62 +213,16 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
; PR50392
define <4 x double> @test_v4f64_partial_swizzle(<4 x double> %a, <4 x double> %b) {
-; SSE-LABEL: @test_v4f64_partial_swizzle(
-; SSE-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; SSE-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
-; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
-; SSE-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; SSE-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]]
-; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
-; SSE-NEXT: [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3
-; SSE-NEXT: ret <4 x double> [[R03]]
-;
-; SLM-LABEL: @test_v4f64_partial_swizzle(
-; SLM-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; SLM-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
-; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
-; SLM-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; SLM-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]]
-; SLM-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
-; SLM-NEXT: [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3
-; SLM-NEXT: ret <4 x double> [[R03]]
-;
-; AVX1-LABEL: @test_v4f64_partial_swizzle(
-; AVX1-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
-; AVX1-NEXT: [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
-; AVX1-NEXT: [[R0:%.*]] = fadd double [[A0]], [[A1]]
-; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 1, i32 2>
-; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> <i32 0, i32 3>
-; AVX1-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; AVX1-NEXT: [[R00:%.*]] = insertelement <4 x double> poison, double [[R0]], i64 0
-; AVX1-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; AVX1-NEXT: [[R031:%.*]] = shufflevector <4 x double> [[R00]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 poison, i32 4, i32 5>
-; AVX1-NEXT: ret <4 x double> [[R031]]
-;
-; AVX2-LABEL: @test_v4f64_partial_swizzle(
-; AVX2-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
-; AVX2-NEXT: [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
-; AVX2-NEXT: [[R0:%.*]] = fadd double [[A0]], [[A1]]
-; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 1, i32 2>
-; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> <i32 0, i32 3>
-; AVX2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; AVX2-NEXT: [[R00:%.*]] = insertelement <4 x double> poison, double [[R0]], i64 0
-; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; AVX2-NEXT: [[R031:%.*]] = shufflevector <4 x double> [[R00]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 poison, i32 4, i32 5>
-; AVX2-NEXT: ret <4 x double> [[R031]]
-;
-; AVX512-LABEL: @test_v4f64_partial_swizzle(
-; AVX512-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; AVX512-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
-; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
-; AVX512-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; AVX512-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]]
-; AVX512-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
-; AVX512-NEXT: [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3
-; AVX512-NEXT: ret <4 x double> [[R03]]
+; CHECK-LABEL: @test_v4f64_partial_swizzle(
+; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]]
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
+; CHECK-NEXT: [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3
+; CHECK-NEXT: ret <4 x double> [[R03]]
;
%a0 = extractelement <4 x double> %a, i64 0
%a1 = extractelement <4 x double> %a, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
index 31e3e6a..e30f84e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
;
; 128-bit vectors
@@ -213,62 +213,16 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
; PR50392
define <4 x double> @test_v4f64_partial_swizzle(<4 x double> %a, <4 x double> %b) {
-; SSE-LABEL: @test_v4f64_partial_swizzle(
-; SSE-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; SSE-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
-; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
-; SSE-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; SSE-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]]
-; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> <double undef, double poison>, <4 x i32> <i32 0, i32 2, i32 1, i32 poison>
-; SSE-NEXT: [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3
-; SSE-NEXT: ret <4 x double> [[R03]]
-;
-; SLM-LABEL: @test_v4f64_partial_swizzle(
-; SLM-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; SLM-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
-; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
-; SLM-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; SLM-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]]
-; SLM-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> <double undef, double poison>, <4 x i32> <i32 0, i32 2, i32 1, i32 poison>
-; SLM-NEXT: [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3
-; SLM-NEXT: ret <4 x double> [[R03]]
-;
-; AVX1-LABEL: @test_v4f64_partial_swizzle(
-; AVX1-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
-; AVX1-NEXT: [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
-; AVX1-NEXT: [[R0:%.*]] = fadd double [[A0]], [[A1]]
-; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 1, i32 2>
-; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> <i32 0, i32 3>
-; AVX1-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; AVX1-NEXT: [[R00:%.*]] = insertelement <4 x double> <double poison, double undef, double poison, double poison>, double [[R0]], i64 0
-; AVX1-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; AVX1-NEXT: [[R031:%.*]] = shufflevector <4 x double> [[R00]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; AVX1-NEXT: ret <4 x double> [[R031]]
-;
-; AVX2-LABEL: @test_v4f64_partial_swizzle(
-; AVX2-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
-; AVX2-NEXT: [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
-; AVX2-NEXT: [[R0:%.*]] = fadd double [[A0]], [[A1]]
-; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 1, i32 2>
-; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> <i32 0, i32 3>
-; AVX2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; AVX2-NEXT: [[R00:%.*]] = insertelement <4 x double> <double poison, double undef, double poison, double poison>, double [[R0]], i64 0
-; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; AVX2-NEXT: [[R031:%.*]] = shufflevector <4 x double> [[R00]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; AVX2-NEXT: ret <4 x double> [[R031]]
-;
-; AVX512-LABEL: @test_v4f64_partial_swizzle(
-; AVX512-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; AVX512-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
-; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
-; AVX512-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; AVX512-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]]
-; AVX512-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> <double undef, double poison>, <4 x i32> <i32 0, i32 2, i32 1, i32 poison>
-; AVX512-NEXT: [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3
-; AVX512-NEXT: ret <4 x double> [[R03]]
+; CHECK-LABEL: @test_v4f64_partial_swizzle(
+; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]]
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> <double undef, double poison>, <4 x i32> <i32 0, i32 2, i32 1, i32 poison>
+; CHECK-NEXT: [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3
+; CHECK-NEXT: ret <4 x double> [[R03]]
;
%a0 = extractelement <4 x double> %a, i64 0
%a1 = extractelement <4 x double> %a, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scalarazied-result.ll b/llvm/test/Transforms/SLPVectorizer/X86/scalarazied-result.ll
index 53f1708..1d6e191 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/scalarazied-result.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/scalarazied-result.ll
@@ -4,10 +4,6 @@
define void @test() {
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x half> zeroinitializer, i64 1
-; CHECK-NEXT: [[TOBOOL:%.*]] = fcmp une half [[TMP0]], 0xH0000
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x half> zeroinitializer, i64 1
-; CHECK-NEXT: [[TOBOOL3:%.*]] = fcmp une half [[TMP1]], 0xH0000
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vector_gep.ll b/llvm/test/Transforms/SLPVectorizer/X86/vector_gep.ll
index b8c551c..9e8cdc6 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vector_gep.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vector_gep.ll
@@ -26,5 +26,5 @@ entry:
unreachable
}
-attributes #0 = { noreturn readonly uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="knl" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512er,+avx512f,+avx512pf,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prefetchwt1,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noreturn readonly uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="broadwell" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512f,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/streaming-compatible-expand-masked-gather-scatter.ll b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/streaming-compatible-expand-masked-gather-scatter.ll
index ee67ab3..b827fc6 100644
--- a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/streaming-compatible-expand-masked-gather-scatter.ll
+++ b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/streaming-compatible-expand-masked-gather-scatter.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; REQUIRES: aarch64-registered-target
-; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=aarch64-linux-gnu -mattr=+sve -force-streaming-compatible-sve | FileCheck %s
+; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=aarch64-linux-gnu -mattr=+sve -force-streaming-compatible | FileCheck %s
define <2 x i32> @scalarize_v2i32(<2 x ptr> %p, <2 x i1> %mask, <2 x i32> %passthru) {
; CHECK-LABEL: @scalarize_v2i32(
diff --git a/llvm/test/Transforms/SimplifyCFG/switch-dead-default-lookup-table.ll b/llvm/test/Transforms/SimplifyCFG/switch-dead-default-lookup-table.ll
new file mode 100644
index 0000000..bead0dc
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/switch-dead-default-lookup-table.ll
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt %s -S -passes='simplifycfg<switch-to-lookup>' -simplifycfg-require-and-preserve-domtree=1 -switch-range-to-icmp | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+
+define i64 @test_1(i64 %0) {
+; CHECK-LABEL: define i64 @test_1(
+; CHECK-SAME: i64 [[TMP0:%.*]]) {
+; CHECK-NEXT: switch.lookup:
+; CHECK-NEXT: [[TMP1:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT: [[SWITCH_GEP:%.*]] = getelementptr inbounds [4 x i64], ptr @switch.table.test_1, i32 0, i64 [[TMP1]]
+; CHECK-NEXT: [[SWITCH_LOAD:%.*]] = load i64, ptr [[SWITCH_GEP]], align 8
+; CHECK-NEXT: ret i64 [[SWITCH_LOAD]]
+;
+ %2 = urem i64 %0, 4
+ switch i64 %2, label %5 [
+ i64 1, label %3
+ i64 2, label %3
+ i64 3, label %4
+ ]
+
+3:
+ br label %5
+
+4:
+ br label %5
+
+5:
+ %.0 = phi i64 [ 2, %4 ], [ 1, %3 ], [ 0, %1 ]
+ ret i64 %.0
+}
+
+
+define i64 @test_2(i64 %0) {
+; CHECK-LABEL: define i64 @test_2(
+; CHECK-SAME: i64 [[TMP0:%.*]]) {
+; CHECK-NEXT: switch.lookup:
+; CHECK-NEXT: [[TMP1:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT: ret i64 [[TMP1]]
+;
+ %2 = urem i64 %0, 4
+ switch i64 %2, label %6 [
+ i64 1, label %3
+ i64 2, label %4
+ i64 3, label %5
+ ]
+
+3:
+ br label %6
+
+4:
+ br label %6
+
+5:
+ br label %6
+
+6:
+ %.0 = phi i64 [ 0, %1 ], [ 1, %3 ], [ 2, %4 ], [ 3, %5 ]
+ ret i64 %.0
+}
+
diff --git a/llvm/test/Transforms/SimplifyCFG/switch-dead-default.ll b/llvm/test/Transforms/SimplifyCFG/switch-dead-default.ll
index 7c0d5e4..4a457cc 100644
--- a/llvm/test/Transforms/SimplifyCFG/switch-dead-default.ll
+++ b/llvm/test/Transforms/SimplifyCFG/switch-dead-default.ll
@@ -79,15 +79,15 @@ default:
ret void
}
-; This one is a negative test - we know the value of the default,
-; but that's about it
+; We can replace the default branch with case 3 since it is the only case that is missing.
define void @test3(i2 %a) {
; CHECK-LABEL: define void @test3(
; CHECK-SAME: i2 [[A:%.*]]) {
-; CHECK-NEXT: switch i2 [[A]], label [[DEFAULT:%.*]] [
+; CHECK-NEXT: switch i2 [[A]], label [[DOTUNREACHABLEDEFAULT:%.*]] [
; CHECK-NEXT: i2 0, label [[CASE0:%.*]]
; CHECK-NEXT: i2 1, label [[CASE1:%.*]]
; CHECK-NEXT: i2 -2, label [[CASE2:%.*]]
+; CHECK-NEXT: i2 -1, label [[DEFAULT:%.*]]
; CHECK-NEXT: ]
; CHECK: common.ret:
; CHECK-NEXT: ret void
@@ -100,6 +100,8 @@ define void @test3(i2 %a) {
; CHECK: case2:
; CHECK-NEXT: call void @foo(i32 2)
; CHECK-NEXT: br label [[COMMON_RET]]
+; CHECK: .unreachabledefault:
+; CHECK-NEXT: unreachable
; CHECK: default:
; CHECK-NEXT: call void @foo(i32 3)
; CHECK-NEXT: br label [[COMMON_RET]]
@@ -122,6 +124,50 @@ default:
ret void
}
+define void @test3_prof(i2 %a) {
+; CHECK-LABEL: define void @test3_prof(
+; CHECK-SAME: i2 [[A:%.*]]) {
+; CHECK-NEXT: switch i2 [[A]], label [[DOTUNREACHABLEDEFAULT:%.*]] [
+; CHECK-NEXT: i2 0, label [[CASE0:%.*]]
+; CHECK-NEXT: i2 1, label [[CASE1:%.*]]
+; CHECK-NEXT: i2 -2, label [[CASE2:%.*]]
+; CHECK-NEXT: i2 -1, label [[DEFAULT:%.*]]
+; CHECK-NEXT: ], !prof [[PROF0:![0-9]+]]
+; CHECK: common.ret:
+; CHECK-NEXT: ret void
+; CHECK: case0:
+; CHECK-NEXT: call void @foo(i32 0)
+; CHECK-NEXT: br label [[COMMON_RET:%.*]]
+; CHECK: case1:
+; CHECK-NEXT: call void @foo(i32 1)
+; CHECK-NEXT: br label [[COMMON_RET]]
+; CHECK: case2:
+; CHECK-NEXT: call void @foo(i32 2)
+; CHECK-NEXT: br label [[COMMON_RET]]
+; CHECK: .unreachabledefault:
+; CHECK-NEXT: unreachable
+; CHECK: default:
+; CHECK-NEXT: call void @foo(i32 3)
+; CHECK-NEXT: br label [[COMMON_RET]]
+;
+ switch i2 %a, label %default [i2 0, label %case0
+ i2 1, label %case1
+ i2 2, label %case2], !prof !0
+
+case0:
+ call void @foo(i32 0)
+ ret void
+case1:
+ call void @foo(i32 1)
+ ret void
+case2:
+ call void @foo(i32 2)
+ ret void
+default:
+ call void @foo(i32 3)
+ ret void
+}
+
; Negative test - check for possible overflow when computing
; number of possible cases.
define void @test4(i128 %a) {
@@ -267,3 +313,40 @@ default:
declare void @llvm.assume(i1)
+define zeroext i1 @test8(i128 %a) {
+; We should not transform conditions wider than 64 bit.
+; CHECK-LABEL: define zeroext i1 @test8(
+; CHECK-SAME: i128 [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = and i128 [[A]], 3894222643901120721397872246915072
+; CHECK-NEXT: switch i128 [[TMP0]], label [[LOR_RHS:%.*]] [
+; CHECK-NEXT: i128 1298074214633706907132624082305024, label [[LOR_END:%.*]]
+; CHECK-NEXT: i128 2596148429267413814265248164610048, label [[LOR_END]]
+; CHECK-NEXT: i128 3894222643901120721397872246915072, label [[LOR_END]]
+; CHECK-NEXT: ]
+; CHECK: lor.rhs:
+; CHECK-NEXT: br label [[LOR_END]]
+; CHECK: lor.end:
+; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ true, [[ENTRY:%.*]] ], [ false, [[LOR_RHS]] ], [ true, [[ENTRY]] ], [ true, [[ENTRY]] ]
+; CHECK-NEXT: ret i1 [[TMP1]]
+;
+entry:
+ %0 = and i128 %a, 3894222643901120721397872246915072
+ switch i128 %0, label %lor.rhs [
+ i128 1298074214633706907132624082305024, label %lor.end
+ i128 2596148429267413814265248164610048, label %lor.end
+ i128 3894222643901120721397872246915072, label %lor.end
+ ]
+
+lor.rhs: ; preds = %entry
+ br label %lor.end
+
+lor.end: ; preds = %entry, %entry, %entry, %lor.rhs
+ %1 = phi i1 [ true, %entry ], [ false, %lor.rhs ], [ true, %entry ], [ true, %entry ]
+ ret i1 %1
+}
+
+!0 = !{!"branch_weights", i32 8, i32 4, i32 2, i32 1}
+;.
+; CHECK: [[PROF0]] = !{!"branch_weights", i32 0, i32 4, i32 2, i32 1, i32 8}
+;.
diff --git a/llvm/test/Transforms/Util/add-TLI-mappings.ll b/llvm/test/Transforms/Util/add-TLI-mappings.ll
index 0e005ae..4e4b81e 100644
--- a/llvm/test/Transforms/Util/add-TLI-mappings.ll
+++ b/llvm/test/Transforms/Util/add-TLI-mappings.ll
@@ -274,19 +274,19 @@ attributes #0 = { nounwind readnone }
; ARMPL-SAME: _ZGVsMxvl4_modff(armpl_svmodf_f32_x)" }
; ARMPL: attributes #[[SIN]] = { "vector-function-abi-variant"=
; ARMPL-SAME: "_ZGV_LLVM_N2v_sin(armpl_vsinq_f64),
-; ARMPL-SAME _ZGVsMxv_sin(armpl_svsin_f64_x)" }
+; ARMPL-SAME: _ZGVsMxv_sin(armpl_svsin_f64_x)" }
; ARMPL: attributes #[[SINCOS]] = { "vector-function-abi-variant"=
; ARMPL-SAME: "_ZGV_LLVM_N2vl8l8_sincos(armpl_vsincosq_f64),
-; ARMPL-SAME: _ZGVsMxvl8l8_sincos(armpl_svsincos_f64_x)" }
+; ARMPL-SAME: _ZGVsMxvl8l8_sincos(armpl_svsincos_f64_x)" }
; ARMPL: attributes #[[SINCOSF]] = { "vector-function-abi-variant"=
; ARMPL-SAME: "_ZGV_LLVM_N4vl4l4_sincosf(armpl_vsincosq_f32),
; ARMPL-SAME: _ZGVsMxvl4l4_sincosf(armpl_svsincos_f32_x)" }
; ARMPL: attributes #[[SINCOSPI]] = { "vector-function-abi-variant"=
; ARMPL-SAME: "_ZGV_LLVM_N2vl8l8_sincospi(armpl_vsincospiq_f64),
-; ARMPL-SAME: _ZGVsMxvl8l8_sincospi(armpl_svsincospi_f64_x)" }
+; ARMPL-SAME: _ZGVsMxvl8l8_sincospi(armpl_svsincospi_f64_x)" }
; ARMPL: attributes #[[SINCOSPIF]] = { "vector-function-abi-variant"=
; ARMPL-SAME: "_ZGV_LLVM_N4vl4l4_sincospif(armpl_vsincospiq_f32),
; ARMPL-SAME: _ZGVsMxvl4l4_sincospif(armpl_svsincospi_f32_x)" }
; ARMPL: attributes #[[LOG10]] = { "vector-function-abi-variant"=
; ARMPL-SAME: "_ZGV_LLVM_N4v_llvm.log10.f32(armpl_vlog10q_f32),
-; ARMPL-SAME _ZGVsMxv_llvm.log10.f32(armpl_svlog10_f32_x)" }
+; ARMPL-SAME: _ZGVsMxv_llvm.log10.f32(armpl_svlog10_f32_x)" }
diff --git a/llvm/test/tools/llvm-driver/symlink-call.test b/llvm/test/tools/llvm-driver/symlink-call.test
index eeedf9e..ca60982 100644
--- a/llvm/test/tools/llvm-driver/symlink-call.test
+++ b/llvm/test/tools/llvm-driver/symlink-call.test
@@ -14,6 +14,8 @@
# RUN: %t/cxxfilt-15 --help | FileCheck %s
# RUN: ln -s %llvm %t/cxxfilt-15.exe
# RUN: %t/cxxfilt-15.exe --help | FileCheck %s
+# RUN: ln -s %llvm %t/c++filt
+# RUN: %t/c++filt --help | FileCheck %s
# RUN: ln -s %llvm %t/llvm-15
# RUN: %t/llvm-15 cxxfilt --help | FileCheck %s
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s
new file mode 100644
index 0000000..ab81f9f
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s
@@ -0,0 +1,791 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-v1 --timeline --timeline-max-iterations=4 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN GPR32-bit
+ldr w0, [sp]
+add x0, x0, x0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR8-bit
+ldr b0, [sp]
+fadd d0, d0, d0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR16-bit
+ldr h0, [sp]
+fadd d0, d0, d0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR32-bit
+ldr s0, [sp]
+fadd d0, d0, d0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-b
+ld1 {v0.8b}, [sp]
+add v0.16b, v0.16b, v0.16b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-h
+ld1 {v0.4h}, [sp]
+add v0.8h, v0.8h, v0.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-s
+ld1 {v0.2s}, [sp]
+add v0.4s, v0.4s, v0.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-d
+ld1 {v0.1d}, [sp]
+add v0.2d, v0.2d, v0.2d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN ins
+ins v0.b[0], v1.b[1]
+add v0.16b, v0.16b, v0.16b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN lanewise-load
+ld1 {v0.b}[0], [sp]
+add v0.16b, v0.16b, v0.16b
+# LLVM-MCA-END
+
+# CHECK: [0] Code Region - GPR32-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 41
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 4.88
+# CHECK-NEXT: IPC: 4.88
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 4 0.33 * ldr w0, [sp]
+# CHECK-NEXT: 1 1 0.25 add x0, x0, x0
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 0.22 0.22 0.28 0.28 - - - -
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ldr w0, [sp]
+# CHECK-NEXT: - - - - - - - 0.22 0.22 0.28 0.28 - - - - add x0, x0, x0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: Index 012345678
+
+# CHECK: [0,0] DeeeeER . ldr w0, [sp]
+# CHECK-NEXT: [0,1] D====eER. add x0, x0, x0
+# CHECK-NEXT: [1,0] DeeeeE-R. ldr w0, [sp]
+# CHECK-NEXT: [1,1] D====eER. add x0, x0, x0
+# CHECK-NEXT: [2,0] DeeeeE-R. ldr w0, [sp]
+# CHECK-NEXT: [2,1] D====eER. add x0, x0, x0
+# CHECK-NEXT: [3,0] D=eeeeER. ldr w0, [sp]
+# CHECK-NEXT: [3,1] D=====eER add x0, x0, x0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 0.5 ldr w0, [sp]
+# CHECK-NEXT: 1. 4 5.3 0.0 0.0 add x0, x0, x0
+# CHECK-NEXT: 4 3.3 0.6 0.3 <total>
+
+# CHECK: [1] Code Region - FPR8-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr b0, [sp]
+# CHECK-NEXT: 1 2 0.25 fadd d0, d0, d0
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ldr b0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 fadd d0, d0, d0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ldr b0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr b0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr b0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr b0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER fadd d0, d0, d0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr b0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 fadd d0, d0, d0
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [2] Code Region - FPR16-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr h0, [sp]
+# CHECK-NEXT: 1 2 0.25 fadd d0, d0, d0
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ldr h0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 fadd d0, d0, d0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ldr h0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr h0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr h0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr h0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER fadd d0, d0, d0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr h0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 fadd d0, d0, d0
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [3] Code Region - FPR32-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr s0, [sp]
+# CHECK-NEXT: 1 2 0.25 fadd d0, d0, d0
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ldr s0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 fadd d0, d0, d0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ldr s0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr s0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr s0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr s0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER fadd d0, d0, d0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr s0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 fadd d0, d0, d0
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [4] Code Region - SIMD64-bit-b
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.8b }, [sp]
+# CHECK-NEXT: 1 2 0.25 add v0.16b, v0.16b, v0.16b
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ld1 { v0.8b }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 add v0.16b, v0.16b, v0.16b
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add v0.16b, v0.16b, v0.16b
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.8b }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [5] Code Region - SIMD64-bit-h
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.4h }, [sp]
+# CHECK-NEXT: 1 2 0.25 add v0.8h, v0.8h, v0.8h
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ld1 { v0.4h }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 add v0.8h, v0.8h, v0.8h
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add v0.8h, v0.8h, v0.8h
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.4h }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [6] Code Region - SIMD64-bit-s
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.2s }, [sp]
+# CHECK-NEXT: 1 2 0.25 add v0.4s, v0.4s, v0.4s
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ld1 { v0.2s }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 add v0.4s, v0.4s, v0.4s
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add v0.4s, v0.4s, v0.4s
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.2s }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [7] Code Region - SIMD64-bit-d
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.1d }, [sp]
+# CHECK-NEXT: 1 2 0.25 add v0.2d, v0.2d, v0.2d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ld1 { v0.1d }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 add v0.2d, v0.2d, v0.2d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add v0.2d, v0.2d, v0.2d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.1d }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [8] Code Region - ins
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 403
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 0.50
+# CHECK-NEXT: IPC: 0.50
+# CHECK-NEXT: Block RThroughput: 0.5
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 2 0.25 mov v0.b[0], v1.b[1]
+# CHECK-NEXT: 1 2 0.25 add v0.16b, v0.16b, v0.16b
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 0.50 0.50
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - - - - - - - - - 0.50 - 0.50 mov v0.b[0], v1.b[1]
+# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - add v0.16b, v0.16b, v0.16b
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 012345678
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeER. . . . mov v0.b[0], v1.b[1]
+# CHECK-NEXT: [0,1] D==eeER . . . add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [1,0] D====eeER . . . mov v0.b[0], v1.b[1]
+# CHECK-NEXT: [1,1] D======eeER . . add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [2,0] D========eeER . . mov v0.b[0], v1.b[1]
+# CHECK-NEXT: [2,1] D==========eeER. . add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [3,0] D============eeER . mov v0.b[0], v1.b[1]
+# CHECK-NEXT: [3,1] D==============eeER add v0.16b, v0.16b, v0.16b
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 7.0 0.3 0.0 mov v0.b[0], v1.b[1]
+# CHECK-NEXT: 1. 4 9.0 0.0 0.0 add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: 4 8.0 0.1 0.0 <total>
+
+# CHECK: [9] Code Region - lanewise-load
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 1003
+# CHECK-NEXT: Total uOps: 300
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 0.30
+# CHECK-NEXT: IPC: 0.20
+# CHECK-NEXT: Block RThroughput: 0.5
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 2 8 0.33 * ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: 1 2 0.25 add v0.16b, v0.16b, v0.16b
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - - 0.50 0.50 - - - - 0.50 0.50 0.50 0.50
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - 0.50 - 0.50 ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - add v0.16b, v0.16b, v0.16b
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456789
+# CHECK-NEXT: Index 0123456789 0123456789 012
+
+# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: [0,1] D========eeER . . . . . . . add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [1,0] D==========eeeeeeeeER . . . . . ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: [1,1] D==================eeER . . . . . add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [2,0] D====================eeeeeeeeER . . . ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: [2,1] D============================eeER . . . add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [3,0] D==============================eeeeeeeeER . ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: [3,1] D======================================eeER add v0.16b, v0.16b, v0.16b
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 16.0 0.3 0.0 ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: 1. 4 24.0 0.0 0.0 add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: 4 20.0 0.1 0.0 <total>
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
new file mode 100644
index 0000000..fd2083dc
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
@@ -0,0 +1,812 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-v2 --timeline --timeline-max-iterations=4 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN FPR8-bit
+ldr b0, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR16-bit
+ldr h0, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR32-bit
+ldr s0, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR64-bit
+ldr d0, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR128-bit
+ldr q0, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-b
+ld1 {v0.8b}, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-h
+ld1 {v0.4h}, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-s
+ld1 {v0.2s}, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-d
+ld1 {v0.1d}, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN insr
+insr z0.s, w0
+add z0.s, z0.s, z0.s
+# LLVM-MCA-END
+
+# CHECK: [0] Code Region - FPR8-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr b0, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ldr b0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ldr b0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr b0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr b0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr b0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr b0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [1] Code Region - FPR16-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr h0, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ldr h0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ldr h0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr h0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr h0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr h0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr h0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [2] Code Region - FPR32-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr s0, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ldr s0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ldr s0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr s0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr s0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr s0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr s0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [3] Code Region - FPR64-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr d0, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ldr d0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ldr d0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr d0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr d0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr d0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr d0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [4] Code Region - FPR128-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr q0, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ldr q0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ldr q0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr q0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr q0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr q0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr q0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [5] Code Region - SIMD64-bit-b
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.8b }, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ld1 { v0.8b }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.8b }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [6] Code Region - SIMD64-bit-h
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.4h }, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ld1 { v0.4h }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.4h }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [7] Code Region - SIMD64-bit-s
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.2s }, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ld1 { v0.2s }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.2s }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [8] Code Region - SIMD64-bit-d
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.1d }, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ld1 { v0.1d }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.1d }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [9] Code Region - insr
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 803
+# CHECK-NEXT: Total uOps: 300
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 0.37
+# CHECK-NEXT: IPC: 0.25
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 2 6 1.00 insr z0.s, w0
+# CHECK-NEXT: 1 2 0.25 add z0.s, z0.s, z0.s
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - - - - 1.00 - - - - - 0.33 1.00 0.33 0.34
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - - - - 1.00 - - - - - - 1.00 - - insr z0.s, w0
+# CHECK-NEXT: - - - - - - - - - - - - - 0.33 - 0.33 0.34 add z0.s, z0.s, z0.s
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 01234
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeeeER . . . . . . insr z0.s, w0
+# CHECK-NEXT: [0,1] D======eeER . . . . . add z0.s, z0.s, z0.s
+# CHECK-NEXT: [1,0] D========eeeeeeER . . . . insr z0.s, w0
+# CHECK-NEXT: [1,1] D==============eeER . . . . add z0.s, z0.s, z0.s
+# CHECK-NEXT: [2,0] D================eeeeeeER. . . insr z0.s, w0
+# CHECK-NEXT: [2,1] D======================eeER . . add z0.s, z0.s, z0.s
+# CHECK-NEXT: [3,0] D========================eeeeeeER . insr z0.s, w0
+# CHECK-NEXT: [3,1] D==============================eeER add z0.s, z0.s, z0.s
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 13.0 0.3 0.0 insr z0.s, w0
+# CHECK-NEXT: 1. 4 19.0 0.0 0.0 add z0.s, z0.s, z0.s
+# CHECK-NEXT: 4 16.0 0.1 0.0 <total>
diff --git a/llvm/test/tools/llvm-mca/X86/call-latency.s b/llvm/test/tools/llvm-mca/X86/call-latency.s
new file mode 100644
index 0000000..9559d11
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/X86/call-latency.s
@@ -0,0 +1,58 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1 %s | FileCheck --check-prefixes=ALL,DEFAULT %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -call-latency=50 -iterations=1 %s | FileCheck --check-prefixes=ALL,CUSTOM %s
+
+callq printf
+
+# ALL: Iterations: 1
+# ALL-NEXT: Instructions: 1
+
+# CUSTOM-NEXT: Total Cycles: 53
+# DEFAULT-NEXT: Total Cycles: 103
+
+# ALL-NEXT: Total uOps: 1
+
+# ALL: Dispatch Width: 2
+
+# CUSTOM-NEXT: uOps Per Cycle: 0.02
+# CUSTOM-NEXT: IPC: 0.02
+
+# DEFAULT-NEXT: uOps Per Cycle: 0.01
+# DEFAULT-NEXT: IPC: 0.01
+
+# ALL-NEXT: Block RThroughput: 0.5
+
+# ALL: Instruction Info:
+# ALL-NEXT: [1]: #uOps
+# ALL-NEXT: [2]: Latency
+# ALL-NEXT: [3]: RThroughput
+# ALL-NEXT: [4]: MayLoad
+# ALL-NEXT: [5]: MayStore
+# ALL-NEXT: [6]: HasSideEffects (U)
+
+# ALL: [1] [2] [3] [4] [5] [6] Instructions:
+# ALL-NEXT: 1 1 0.50 callq printf
+
+# ALL: Resources:
+# ALL-NEXT: [0] - JALU0
+# ALL-NEXT: [1] - JALU1
+# ALL-NEXT: [2] - JDiv
+# ALL-NEXT: [3] - JFPA
+# ALL-NEXT: [4] - JFPM
+# ALL-NEXT: [5] - JFPU0
+# ALL-NEXT: [6] - JFPU1
+# ALL-NEXT: [7] - JLAGU
+# ALL-NEXT: [8] - JMul
+# ALL-NEXT: [9] - JSAGU
+# ALL-NEXT: [10] - JSTC
+# ALL-NEXT: [11] - JVALU0
+# ALL-NEXT: [12] - JVALU1
+# ALL-NEXT: [13] - JVIMUL
+
+# ALL: Resource pressure per iteration:
+# ALL-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# ALL-NEXT: - 1.00 - - - - - - - - - - - -
+
+# ALL: Resource pressure by instruction:
+# ALL-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# ALL-NEXT: - 1.00 - - - - - - - - - - - - callq printf
diff --git a/llvm/test/tools/llvm-objcopy/tool-options.test b/llvm/test/tools/llvm-objcopy/tool-options.test
new file mode 100644
index 0000000..8d2bb44
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/tool-options.test
@@ -0,0 +1,6 @@
+## An error must be reported if a required argument value is missing.
+# RUN: not llvm-objcopy --only-section 2>&1 | FileCheck --check-prefix=CHECK-NO-VALUE-ONLY-SECTION %s
+# CHECK-NO-VALUE-ONLY-SECTION: error: argument to '--only-section' is missing (expected 1 value(s))
+
+# RUN: not llvm-objcopy -O 2>&1 | FileCheck --check-prefix=CHECK-NO-VALUE-O %s
+# CHECK-NO-VALUE-O: error: argument to '-O' is missing (expected 1 value(s))
diff --git a/llvm/test/tools/llvm-profdata/show-order-error.proftext b/llvm/test/tools/llvm-profdata/show-order-error.proftext
new file mode 100644
index 0000000..633f1a9
--- /dev/null
+++ b/llvm/test/tools/llvm-profdata/show-order-error.proftext
@@ -0,0 +1,27 @@
+# RUN: not llvm-profdata order %s --num-test-traces=10 2>&1 | FileCheck %s
+
+# CHECK: --num-test-traces must be smaller than the total number of traces
+
+# Header
+:ir
+:temporal_prof_traces
+# Num Traces
+1
+# Trace Stream Size:
+1
+# Weight
+1
+a, b
+
+a
+# Func Hash:
+0x1234
+# Num Counters:
+1
+# Counter Values:
+101
+
+b
+0x5678
+1
+202
diff --git a/llvm/test/tools/llvm-profdata/show-order.proftext b/llvm/test/tools/llvm-profdata/show-order.proftext
index 8ef2684..28eb1b9 100644
--- a/llvm/test/tools/llvm-profdata/show-order.proftext
+++ b/llvm/test/tools/llvm-profdata/show-order.proftext
@@ -1,4 +1,6 @@
-# RUN: llvm-profdata order %s | FileCheck %s
+# RUN: llvm-profdata order %s --num-test-traces=1 | FileCheck %s
+
+# CHECK: # Total area under the page fault curve: 4.000000e+00
# CHECK: a
# CHECK: b
@@ -9,9 +11,9 @@
:ir
:temporal_prof_traces
# Num Traces
-3
+4
# Trace Stream Size:
-3
+4
# Weight
1
a, main.c:b, c
@@ -21,6 +23,9 @@ a, x, main.c:b, c
# Weight
1
a, main.c:b, c
+# Weight
+1
+a, main.c:b, c, x
a
# Func Hash:
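A small worked example may help read the new "# Total area under the page fault curve" line checked above (hypothetical function names; the llvm-profdata change later in this diff assumes 32 functions per page): if the computed order places f0..f31 on page 0 and f32..f63 on page 1, a held-out test trace f40, f1, f33, f2 touches the page sets {1}, {0,1}, {0,1}, {0,1} step by step and contributes 1 + 2 + 2 + 2 = 7 to the area. Orderings that keep the functions executed early in a trace on fewer pages therefore score a smaller total area; the expected value of 4 above is consistent with a single held-out trace of four ordered functions that all land on one page (4 x 1 = 4).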
diff --git a/llvm/test/tools/llvm-profgen/profile-density.test b/llvm/test/tools/llvm-profgen/profile-density.test
index 0eb8383..086697e 100644
--- a/llvm/test/tools/llvm-profgen/profile-density.test
+++ b/llvm/test/tools/llvm-profgen/profile-density.test
@@ -1,13 +1,17 @@
-; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t1 --use-offset=0 --show-density -hot-function-density-threshold=10 --trim-cold-profile=0 &> %t2
+; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t1 --use-offset=0 --show-density -profile-density-threshold=10 --trim-cold-profile=0 &> %t2
; RUN: FileCheck %s --input-file %t2 --check-prefix=CHECK-DENSITY
-
-; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density-cs.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t3 --show-density -hot-function-density-threshold=1 &> %t4
+; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density-cs.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t3 --show-density -profile-density-threshold=1 -profile-density-threshold=10000 &> %t4
; RUN: FileCheck %s --input-file %t4 --check-prefix=CHECK-DENSITY-CS
+; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density-cs.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t5 --show-density -profile-density-threshold=1 -profile-density-cutoff-hot=800000 &> %t6
+; RUN: FileCheck %s --input-file %t6 --check-prefix=CHECK-DENSITY-CS-80
+
+;CHECK-DENSITY: Sample PGO is estimated to optimize better with 2.9x more samples. Please consider increasing sampling rate or profiling for longer duration to get more samples.
+;CHECK-DENSITY: Functions with density >= 3.5 account for 99.00% total sample counts.
-;CHECK-DENSITY: Sample PGO is estimated to optimize better with 3.1x more samples. Please consider increasing sampling rate or profiling for longer duration to get more samples.
-;CHECK-DENSITY: Minimum profile density for hot functions with top 99.00% total samples: 3.2
+;CHECK-DENSITY-CS: Sample PGO is estimated to optimize better with 12.5x more samples. Please consider increasing sampling rate or profiling for longer duration to get more samples.
+;CHECK-DENSITY-CS: Functions with density >= 800.1 account for 99.00% total sample counts.
-;CHECK-DENSITY-CS: Minimum profile density for hot functions with top 99.00% total samples: 128.3
+;CHECK-DENSITY-CS-80: Functions with density >= 1886.2 account for 80.00% total sample counts.
; original code:
; clang -O3 -g -fno-optimize-sibling-calls -fdebug-info-for-profiling qsort.c -o a.out
diff --git a/llvm/test/tools/llvm-readobj/ELF/note-core-ntfile.test b/llvm/test/tools/llvm-readobj/ELF/note-core-ntfile.test
index 752cb72..f4957b4 100644
--- a/llvm/test/tools/llvm-readobj/ELF/note-core-ntfile.test
+++ b/llvm/test/tools/llvm-readobj/ELF/note-core-ntfile.test
@@ -3,6 +3,7 @@
# RUN: yaml2obj %s -o %t.o
# RUN: llvm-readelf --notes %t.o | FileCheck %s --check-prefix=GNU
# RUN: llvm-readobj --notes %t.o | FileCheck %s --check-prefix=LLVM
+# RUN: llvm-readobj --elf-output-style=JSON --pretty-print --notes %t.o | FileCheck %s --check-prefix=JSON
## llvm-mc doesn't support generating ET_CORE files; the 'Content' field was
## generated with the following steps:
@@ -72,24 +73,62 @@ ProgramHeaders:
# LLVM-NEXT: Data size: 0x80
# LLVM-NEXT: Type: NT_FILE (mapped files)
# LLVM-NEXT: Page Size: 4096
-# LLVM-NEXT: Mapping [
+# LLVM-NEXT: Mappings [
+# LLVM-NEXT: {
# LLVM-NEXT: Start: 0x1000
# LLVM-NEXT: End: 0x2000
# LLVM-NEXT: Offset: 0x3000
# LLVM-NEXT: Filename: /path/to/a.out
-# LLVM-NEXT: ]
-# LLVM-NEXT: Mapping [
+# LLVM-NEXT: }
+# LLVM-NEXT: {
# LLVM-NEXT: Start: 0x4000
# LLVM-NEXT: End: 0x5000
# LLVM-NEXT: Offset: 0x6000
# LLVM-NEXT: Filename: /path/to/libc.so
-# LLVM-NEXT: ]
-# LLVM-NEXT: Mapping [
+# LLVM-NEXT: }
+# LLVM-NEXT: {
# LLVM-NEXT: Start: 0x7000
# LLVM-NEXT: End: 0x8000
# LLVM-NEXT: Offset: 0x9000
# LLVM-NEXT: Filename: [stack]
-# LLVM-NEXT: ]
-# LLVM-NEXT: }
+# LLVM-NEXT: }
+# LLVM-NEXT: ]
# LLVM-NEXT: }
+# LLVM-NEXT: }
# LLVM-NEXT: ]
+
+# JSON: "Notes": [
+# JSON-NEXT: {
+# JSON-NEXT: "NoteSection": {
+# JSON-NEXT: "Name": "<?>",
+# JSON-NEXT: "Offset": 120,
+# JSON-NEXT: "Size": 148,
+# JSON-NEXT: "Note": {
+# JSON-NEXT: "Owner": "CORE",
+# JSON-NEXT: "Data size": 128,
+# JSON-NEXT: "Type": "NT_FILE (mapped files)",
+# JSON-NEXT: "Page Size": 4096,
+# JSON-NEXT: "Mappings": [
+# JSON-NEXT: {
+# JSON-NEXT: "Start": 4096,
+# JSON-NEXT: "End": 8192,
+# JSON-NEXT: "Offset": 12288,
+# JSON-NEXT: "Filename": "/path/to/a.out"
+# JSON-NEXT: },
+# JSON-NEXT: {
+# JSON-NEXT: "Start": 16384,
+# JSON-NEXT: "End": 20480,
+# JSON-NEXT: "Offset": 24576,
+# JSON-NEXT: "Filename": "/path/to/libc.so"
+# JSON-NEXT: },
+# JSON-NEXT: {
+# JSON-NEXT: "Start": 28672,
+# JSON-NEXT: "End": 32768,
+# JSON-NEXT: "Offset": 36864,
+# JSON-NEXT: "Filename": "[stack]"
+# JSON-NEXT: }
+# JSON-NEXT: ]
+# JSON-NEXT: }
+# JSON-NEXT: }
+# JSON-NEXT: }
+# JSON-NEXT: ]
diff --git a/llvm/tools/llvm-cxxfilt/CMakeLists.txt b/llvm/tools/llvm-cxxfilt/CMakeLists.txt
index cbc4c2d..a644baf 100644
--- a/llvm/tools/llvm-cxxfilt/CMakeLists.txt
+++ b/llvm/tools/llvm-cxxfilt/CMakeLists.txt
@@ -17,6 +17,10 @@ add_llvm_tool(llvm-cxxfilt
GENERATE_DRIVER
)
+if(LLVM_TOOL_LLVM_DRIVER_BUILD)
+ set_property(GLOBAL APPEND PROPERTY LLVM_DRIVER_HIDDEN_TOOL_ALIASES_llvm-cxxfilt c++filt)
+endif()
+
if(LLVM_INSTALL_BINUTILS_SYMLINKS)
add_llvm_tool_symlink(c++filt llvm-cxxfilt)
endif()
diff --git a/llvm/tools/llvm-lto/llvm-lto.cpp b/llvm/tools/llvm-lto/llvm-lto.cpp
index f310097..8218bd5 100644
--- a/llvm/tools/llvm-lto/llvm-lto.cpp
+++ b/llvm/tools/llvm-lto/llvm-lto.cpp
@@ -692,8 +692,9 @@ private:
// Build a map of module to the GUIDs and summary objects that should
// be written to its index.
std::map<std::string, GVSummaryMapTy> ModuleToSummariesForIndex;
+ GVSummaryPtrSet DecSummaries;
ThinGenerator.gatherImportedSummariesForModule(
- *TheModule, *Index, ModuleToSummariesForIndex, *Input);
+ *TheModule, *Index, ModuleToSummariesForIndex, DecSummaries, *Input);
std::string OutputName = OutputFilename;
if (OutputName.empty()) {
@@ -703,7 +704,7 @@ private:
std::error_code EC;
raw_fd_ostream OS(OutputName, EC, sys::fs::OpenFlags::OF_None);
error(EC, "error opening the file '" + OutputName + "'");
- writeIndexToFile(*Index, OS, &ModuleToSummariesForIndex);
+ writeIndexToFile(*Index, OS, &ModuleToSummariesForIndex, &DecSummaries);
}
}
diff --git a/llvm/tools/llvm-mca/llvm-mca.cpp b/llvm/tools/llvm-mca/llvm-mca.cpp
index 03d7d79..cc5d4f5 100644
--- a/llvm/tools/llvm-mca/llvm-mca.cpp
+++ b/llvm/tools/llvm-mca/llvm-mca.cpp
@@ -135,6 +135,11 @@ static cl::opt<unsigned>
"(instructions per cycle)"),
cl::cat(ToolOptions), cl::init(0));
+static cl::opt<unsigned>
+ CallLatency("call-latency", cl::Hidden,
+ cl::desc("Number of cycles to assume for a call instruction"),
+ cl::cat(ToolOptions), cl::init(100U));
+
enum class SkipType { NONE, LACK_SCHED, PARSE_FAILURE, ANY_FAILURE };
static cl::opt<enum SkipType> SkipUnsupportedInstructions(
@@ -568,7 +573,7 @@ int main(int argc, char **argv) {
}
// Create an instruction builder.
- mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM);
+ mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM, CallLatency);
// Create a context to control ownership of the pipeline hardware.
mca::Context MCA(*MRI, *STI);
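The new -call-latency option above is what the added X86/call-latency.s test exercises: with the default assumption of 100 cycles per call, the single callq printf iteration retires after 103 total cycles, while -call-latency=50 brings that down to 53 (the identical difference of three cycles is the fixed overhead of the one-iteration run). A minimal invocation sketch, mirroring the test's RUN lines (the input file name is hypothetical):

  llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -call-latency=50 calls.s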
diff --git a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
index a189733..4ab3b72 100644
--- a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
+++ b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
@@ -571,6 +571,12 @@ objcopy::parseObjcopyOptions(ArrayRef<const char *> RawArgsArr,
llvm::opt::InputArgList InputArgs =
T.ParseArgs(ArgsArr, MissingArgumentIndex, MissingArgumentCount);
+ if (MissingArgumentCount)
+ return createStringError(
+ errc::invalid_argument,
+ "argument to '%s' is missing (expected %d value(s))",
+ InputArgs.getArgString(MissingArgumentIndex), MissingArgumentCount);
+
if (InputArgs.size() == 0 && DashDash == RawArgsArr.end()) {
printHelp(T, errs(), ToolType::Objcopy);
exit(1);
diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp
index 693af06..28c3afa 100644
--- a/llvm/tools/llvm-profdata/llvm-profdata.cpp
+++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp
@@ -340,7 +340,7 @@ cl::opt<unsigned long long> OverlapValueCutoff(
"profile with max count value greater then the parameter value"),
cl::sub(OverlapSubcommand));
-// Options unique to show subcommand.
+// Options specific to show subcommand.
cl::opt<bool> ShowCounts("counts", cl::init(false),
cl::desc("Show counter values for shown functions"),
cl::sub(ShowSubcommand));
@@ -439,6 +439,14 @@ cl::opt<bool> ShowProfileVersion("profile-version", cl::init(false),
cl::desc("Show profile version. "),
cl::sub(ShowSubcommand));
+// Options specific to order subcommand.
+cl::opt<unsigned>
+ NumTestTraces("num-test-traces", cl::init(0),
+ cl::desc("Keep aside the last <num-test-traces> traces in "
+ "the profile when computing the function order and "
+ "instead use them to evaluate that order"),
+ cl::sub(OrderSubcommand));
+
// We use this string to indicate that there are
// multiple static functions map to the same name.
const std::string DuplicateNameStr = "----";
@@ -3277,13 +3285,42 @@ static int order_main() {
// Read all entries
(void)I;
}
- auto &Traces = Reader->getTemporalProfTraces();
- auto Nodes = TemporalProfTraceTy::createBPFunctionNodes(Traces);
+ ArrayRef Traces = Reader->getTemporalProfTraces();
+ if (NumTestTraces && NumTestTraces >= Traces.size())
+ exitWithError(
+ "--" + NumTestTraces.ArgStr +
+ " must be smaller than the total number of traces: expected: < " +
+ Twine(Traces.size()) + ", actual: " + Twine(NumTestTraces));
+ ArrayRef TestTraces = Traces.take_back(NumTestTraces);
+ Traces = Traces.drop_back(NumTestTraces);
+
+ std::vector<BPFunctionNode> Nodes;
+ TemporalProfTraceTy::createBPFunctionNodes(Traces, Nodes);
BalancedPartitioningConfig Config;
BalancedPartitioning BP(Config);
BP.run(Nodes);
OS << "# Ordered " << Nodes.size() << " functions\n";
+ if (!TestTraces.empty()) {
+ // Since we don't know the symbol sizes, we assume 32 functions per page.
+ DenseMap<BPFunctionNode::IDT, unsigned> IdToPageNumber;
+ for (auto &Node : Nodes)
+ IdToPageNumber[Node.Id] = IdToPageNumber.size() / 32;
+
+ SmallSet<unsigned, 0> TouchedPages;
+ unsigned Area = 0;
+ for (auto &Trace : TestTraces) {
+ for (auto Id : Trace.FunctionNameRefs) {
+ auto It = IdToPageNumber.find(Id);
+ if (It == IdToPageNumber.end())
+ continue;
+ TouchedPages.insert(It->getSecond());
+ Area += TouchedPages.size();
+ }
+ TouchedPages.clear();
+ }
+ OS << "# Total area under the page fault curve: " << (float)Area << "\n";
+ }
OS << "# Warning: Mach-O may prefix symbols with \"_\" depending on the "
"linkage and this output does not take that into account. Some "
"post-processing may be required before passing to the linker via "
diff --git a/llvm/tools/llvm-profgen/PerfReader.cpp b/llvm/tools/llvm-profgen/PerfReader.cpp
index e944202..e63c6d6 100644
--- a/llvm/tools/llvm-profgen/PerfReader.cpp
+++ b/llvm/tools/llvm-profgen/PerfReader.cpp
@@ -552,7 +552,7 @@ bool PerfScriptReader::extractLBRStack(TraceStream &TraceIt,
// ... 0x4005c8/0x4005dc/P/-/-/0
   // It's in FIFO order and separated by whitespace.
SmallVector<StringRef, 32> Records;
- TraceIt.getCurrentLine().split(Records, " ", -1, false);
+ TraceIt.getCurrentLine().rtrim().split(Records, " ", -1, false);
auto WarnInvalidLBR = [](TraceStream &TraceIt) {
WithColor::warning() << "Invalid address in LBR record at line "
<< TraceIt.getLineNumber() << ": "
diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
index 5aa4410..2118e95 100644
--- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
@@ -75,14 +75,18 @@ static cl::opt<int, true> CSProfMaxContextDepth(
"depth limit."),
cl::location(llvm::sampleprof::CSProfileGenerator::MaxContextDepth));
-static cl::opt<double> HotFunctionDensityThreshold(
- "hot-function-density-threshold", llvm::cl::init(1000),
- llvm::cl::desc(
- "specify density threshold for hot functions (default: 1000)"),
+static cl::opt<double> ProfileDensityThreshold(
+ "profile-density-threshold", llvm::cl::init(50),
+ llvm::cl::desc("If the profile density is below the given threshold, it "
+ "will be suggested to increase the sampling rate."),
llvm::cl::Optional);
static cl::opt<bool> ShowDensity("show-density", llvm::cl::init(false),
llvm::cl::desc("show profile density details"),
llvm::cl::Optional);
+static cl::opt<int> ProfileDensityCutOffHot(
+ "profile-density-cutoff-hot", llvm::cl::init(990000),
+ llvm::cl::desc("Total samples cutoff for functions used to calculate "
+ "profile density."));
static cl::opt<bool> UpdateTotalSamples(
"update-total-samples", llvm::cl::init(false),
@@ -179,21 +183,22 @@ void ProfileGeneratorBase::write() {
void ProfileGeneratorBase::showDensitySuggestion(double Density) {
if (Density == 0.0)
- WithColor::warning() << "The --profile-summary-cutoff-hot option may be "
+ WithColor::warning() << "The output profile is empty or the "
+ "--profile-density-cutoff-hot option is "
"set too low. Please check your command.\n";
- else if (Density < HotFunctionDensityThreshold)
+ else if (Density < ProfileDensityThreshold)
WithColor::warning()
<< "Sample PGO is estimated to optimize better with "
- << format("%.1f", HotFunctionDensityThreshold / Density)
+ << format("%.1f", ProfileDensityThreshold / Density)
<< "x more samples. Please consider increasing sampling rate or "
"profiling for longer duration to get more samples.\n";
if (ShowDensity)
- outs() << "Minimum profile density for hot functions with top "
+ outs() << "Functions with density >= " << format("%.1f", Density)
+ << " account for "
<< format("%.2f",
- static_cast<double>(ProfileSummaryCutoffHot.getValue()) /
- 10000)
- << "% total samples: " << format("%.1f", Density) << "\n";
+ static_cast<double>(ProfileDensityCutOffHot) / 10000)
+ << "% total sample counts.\n";
}
bool ProfileGeneratorBase::filterAmbiguousProfile(FunctionSamples &FS) {
@@ -238,32 +243,6 @@ void ProfileGeneratorBase::filterAmbiguousProfile(SampleProfileMap &Profiles) {
}
}
-double ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles,
- uint64_t HotCntThreshold) {
- double Density = DBL_MAX;
- std::vector<const FunctionSamples *> HotFuncs;
- for (auto &I : Profiles) {
- auto &FuncSamples = I.second;
- if (FuncSamples.getTotalSamples() < HotCntThreshold)
- continue;
- HotFuncs.emplace_back(&FuncSamples);
- }
-
- for (auto *FuncSamples : HotFuncs) {
- auto *Func = Binary->getBinaryFunction(FuncSamples->getFunction());
- if (!Func)
- continue;
- uint64_t FuncSize = Func->getFuncSize();
- if (FuncSize == 0)
- continue;
- Density =
- std::min(Density, static_cast<double>(FuncSamples->getTotalSamples()) /
- FuncSize);
- }
-
- return Density == DBL_MAX ? 0.0 : Density;
-}
-
void ProfileGeneratorBase::findDisjointRanges(RangeSample &DisjointRanges,
const RangeSample &Ranges) {
@@ -768,9 +747,95 @@ void ProfileGenerator::populateBoundarySamplesForAllFunctions(
}
}
+void ProfileGeneratorBase::calculateBodySamplesAndSize(
+ const FunctionSamples &FSamples, uint64_t &TotalBodySamples,
+ uint64_t &FuncBodySize) {
+  // Note that ideally the size should be the number of function instructions.
+  // However, for a probe-based profile we don't have an accurate instruction
+  // count for each probe; instead, the probe sample is the sample count for
+  // the whole block, which is equivalent to
+  // total_instruction_samples/num_of_instructions in that block. Hence, we
+  // use the number of probes as a proxy for the function's size.
+ FuncBodySize += FSamples.getBodySamples().size();
+
+  // The accumulated body samples re-calculated here can differ from the
+  // TotalSamples (getTotalSamples) field of FunctionSamples for a line-number
+  // based profile. The reason is that TotalSamples is the sum of the samples
+  // of all machine instructions in one source-code line, whereas the
+  // BodySamples entry only records the maximum of them, so TotalSamples is
+  // usually much bigger than the accumulated body samples because one
+  // source-code line can emit many machine instructions. We observed a
+  // regression when we switched to the accumulated body samples (by using
+  // -update-total-samples). Hence, it's safer to re-calculate here to avoid
+  // such a discrepancy. There is no problem for a probe-based profile, as its
+  // TotalSamples is exactly the same as the accumulated body samples.
+ for (const auto &I : FSamples.getBodySamples())
+ TotalBodySamples += I.second.getSamples();
+
+ for (const auto &CallsiteSamples : FSamples.getCallsiteSamples())
+ for (const auto &Callee : CallsiteSamples.second) {
+ // For binary-level density, the inlinees' samples and size should be
+ // included in the calculation.
+ calculateBodySamplesAndSize(Callee.second, TotalBodySamples,
+ FuncBodySize);
+ }
+}
+
+// Calculate profile density:
+// Calculate the density for each function, sort the functions by density in
+// descending order, and keep accumulating their total samples until the sum
+// exceeds the percentage threshold (cut-off) of total profile samples. The
+// profile density is then the last (minimum) function density among the
+// processed functions, which means that if the profile density is good, every
+// function that is hot for performance has good density. The percentage
+// threshold (--profile-density-cutoff-hot) is configurable depending on how
+// much regression the system wants to tolerate.
+double
+ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles) {
+ double ProfileDensity = 0.0;
+
+ uint64_t TotalProfileSamples = 0;
+ // A list of the function profile density and its total samples.
+ std::vector<std::pair<double, uint64_t>> FuncDensityList;
+ for (const auto &I : Profiles) {
+ uint64_t TotalBodySamples = 0;
+ uint64_t FuncBodySize = 0;
+ calculateBodySamplesAndSize(I.second, TotalBodySamples, FuncBodySize);
+
+ if (FuncBodySize == 0)
+ continue;
+
+ double FuncDensity = static_cast<double>(TotalBodySamples) / FuncBodySize;
+ TotalProfileSamples += TotalBodySamples;
+ FuncDensityList.emplace_back(FuncDensity, TotalBodySamples);
+ }
+
+ // Sorted by the density in descending order.
+ llvm::stable_sort(FuncDensityList, [&](const std::pair<double, uint64_t> &A,
+ const std::pair<double, uint64_t> &B) {
+ if (A.first != B.first)
+ return A.first > B.first;
+ return A.second < B.second;
+ });
+
+ uint64_t AccumulatedSamples = 0;
+ uint32_t I = 0;
+ assert(ProfileDensityCutOffHot <= 1000000 &&
+ "The cutoff value is greater than 1000000(100%)");
+ while (AccumulatedSamples < TotalProfileSamples *
+ static_cast<float>(ProfileDensityCutOffHot) /
+ 1000000 &&
+ I < FuncDensityList.size()) {
+ AccumulatedSamples += FuncDensityList[I].second;
+ ProfileDensity = FuncDensityList[I].first;
+ I++;
+ }
+
+ return ProfileDensity;
+}
+
void ProfileGeneratorBase::calculateAndShowDensity(
const SampleProfileMap &Profiles) {
- double Density = calculateDensity(Profiles, HotCountThreshold);
+ double Density = calculateDensity(Profiles);
showDensitySuggestion(Density);
}
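A small worked numeric example of the density calculation above, using made-up numbers: suppose three functions accumulate body samples of 600, 300 and 100 over 3, 6 and 50 probes, giving densities of 200, 50 and 2 and a profile total of 1000 samples. With -profile-density-cutoff-hot=900000 (90%), the functions are visited in descending density order; the accumulated samples reach 600 and then 900, at which point the 900-sample cutoff is satisfied, so the reported profile density is 50, the density of the last function processed. The suggestion to collect more samples is then emitted only when that value falls below -profile-density-threshold, which is what the updated profile-density.test checks.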
@@ -1057,17 +1122,13 @@ void CSProfileGenerator::postProcessProfiles() {
CSProfMaxColdContextDepth, EnableCSPreInliner);
}
- // Merge function samples of CS profile to calculate profile density.
- sampleprof::SampleProfileMap ContextLessProfiles;
- ProfileConverter::flattenProfile(ProfileMap, ContextLessProfiles, true);
-
- calculateAndShowDensity(ContextLessProfiles);
if (GenCSNestedProfile) {
ProfileConverter CSConverter(ProfileMap);
CSConverter.convertCSProfiles();
FunctionSamples::ProfileIsCS = false;
}
filterAmbiguousProfile(ProfileMap);
+ ProfileGeneratorBase::calculateAndShowDensity(ProfileMap);
}
void ProfileGeneratorBase::computeSummaryAndThreshold(
diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.h b/llvm/tools/llvm-profgen/ProfileGenerator.h
index d258fb7..5e361285 100644
--- a/llvm/tools/llvm-profgen/ProfileGenerator.h
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.h
@@ -116,10 +116,13 @@ protected:
void computeSummaryAndThreshold(SampleProfileMap &ProfileMap);
- void calculateAndShowDensity(const SampleProfileMap &Profiles);
+ void calculateBodySamplesAndSize(const FunctionSamples &FSamples,
+ uint64_t &TotalBodySamples,
+ uint64_t &FuncBodySize);
+
+ double calculateDensity(const SampleProfileMap &Profiles);
- double calculateDensity(const SampleProfileMap &Profiles,
- uint64_t HotCntThreshold);
+ void calculateAndShowDensity(const SampleProfileMap &Profiles);
void showDensitySuggestion(double Density);
diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp
index a752cc4..966531e 100644
--- a/llvm/tools/llvm-readobj/ELFDumper.cpp
+++ b/llvm/tools/llvm-readobj/ELFDumper.cpp
@@ -7840,8 +7840,9 @@ static bool printLLVMOMPOFFLOADNoteLLVMStyle(uint32_t NoteType,
static void printCoreNoteLLVMStyle(const CoreNote &Note, ScopedPrinter &W) {
W.printNumber("Page Size", Note.PageSize);
+ ListScope D(W, "Mappings");
for (const CoreFileMapping &Mapping : Note.Mappings) {
- ListScope D(W, "Mapping");
+ DictScope D(W);
W.printHex("Start", Mapping.Start);
W.printHex("End", Mapping.End);
W.printHex("Offset", Mapping.Offset);
diff --git a/llvm/unittests/IR/ConstantRangeTest.cpp b/llvm/unittests/IR/ConstantRangeTest.cpp
index 8ec120d..ac2075c 100644
--- a/llvm/unittests/IR/ConstantRangeTest.cpp
+++ b/llvm/unittests/IR/ConstantRangeTest.cpp
@@ -209,6 +209,10 @@ static bool CheckAll(const ConstantRange &, const ConstantRange &) {
return true;
}
+static bool CheckCorrectnessOnly(const ConstantRange &, const ConstantRange &) {
+ return false;
+}
+
static bool CheckSingleElementsOnly(const ConstantRange &CR1,
const ConstantRange &CR2) {
return CR1.isSingleElement() && CR2.isSingleElement();
@@ -1019,18 +1023,102 @@ TEST_F(ConstantRangeTest, Multiply) {
});
}
+TEST_F(ConstantRangeTest, MultiplyWithNoWrap) {
+ using OBO = OverflowingBinaryOperator;
+
+ EXPECT_EQ(Empty.multiplyWithNoWrap(Some, OBO::NoUnsignedWrap), Empty);
+ EXPECT_EQ(Some.multiplyWithNoWrap(Empty, OBO::NoUnsignedWrap), Empty);
+ EXPECT_EQ(Full.multiplyWithNoWrap(Full, OBO::NoUnsignedWrap), Full);
+ EXPECT_EQ(Full.multiplyWithNoWrap(Some, OBO::NoUnsignedWrap), Full);
+ EXPECT_EQ(Some.multiplyWithNoWrap(Full, OBO::NoUnsignedWrap), Full);
+ EXPECT_EQ(ConstantRange(APInt(4, 0), APInt(4, 2))
+ .multiplyWithNoWrap(ConstantRange(APInt(4, 2), APInt(4, 0)),
+ OBO::NoUnsignedWrap),
+ ConstantRange::getFull(4));
+ EXPECT_EQ(ConstantRange(APInt(4, 1), APInt(4, 5))
+ .multiplyWithNoWrap(ConstantRange(APInt(4, 1), APInt(4, 5)),
+ OBO::NoUnsignedWrap),
+ ConstantRange(APInt(4, 1), APInt(4, 0)));
+ EXPECT_EQ(ConstantRange(APInt(8, 254), APInt(8, 0))
+ .multiplyWithNoWrap(ConstantRange(APInt(8, 252), APInt(8, 4)),
+ OBO::NoUnsignedWrap),
+ ConstantRange(APInt(8, 250), APInt(8, 9)));
+ EXPECT_EQ(ConstantRange(APInt(8, 254), APInt(8, 255))
+ .multiplyWithNoWrap(ConstantRange(APInt(8, 2), APInt(8, 4)),
+ OBO::NoUnsignedWrap),
+ ConstantRange::getEmpty(8));
+
+ EXPECT_EQ(Empty.multiplyWithNoWrap(Some, OBO::NoSignedWrap), Empty);
+ EXPECT_EQ(Some.multiplyWithNoWrap(Empty, OBO::NoSignedWrap), Empty);
+ EXPECT_EQ(Full.multiplyWithNoWrap(Full, OBO::NoSignedWrap), Full);
+ EXPECT_EQ(Full.multiplyWithNoWrap(Some, OBO::NoSignedWrap), Full);
+ EXPECT_EQ(Some.multiplyWithNoWrap(Full, OBO::NoSignedWrap), Full);
+ EXPECT_EQ(
+ ConstantRange(APInt(4, 0), APInt(4, 4))
+ .multiplyWithNoWrap(ConstantRange(APInt(4, -5, true), APInt(4, 4)),
+ OBO::NoSignedWrap),
+ ConstantRange::getFull(4));
+ EXPECT_EQ(ConstantRange(APInt(4, 0), APInt(4, 3))
+ .multiplyWithNoWrap(ConstantRange(APInt(4, 0), APInt(4, 5)),
+ OBO::NoSignedWrap),
+ ConstantRange(APInt(4, 0), APInt(4, -8, true)));
+ EXPECT_EQ(ConstantRange(APInt(8, 3), APInt(8, -11, true))
+ .multiplyWithNoWrap(ConstantRange(APInt(8, -1, true)),
+ OBO::NoSignedWrap),
+ ConstantRange(APInt(8, 12), APInt(8, -2, true)));
+ EXPECT_EQ(ConstantRange(APInt(8, 254), APInt(8, 255))
+ .multiplyWithNoWrap(ConstantRange(APInt(8, 100), APInt(8, 121)),
+ OBO::NoSignedWrap),
+ ConstantRange::getEmpty(8));
+
+ TestBinaryOpExhaustive(
+ [](const ConstantRange &CR1, const ConstantRange &CR2) {
+ return CR1.multiplyWithNoWrap(CR2, OBO::NoUnsignedWrap);
+ },
+ [](const APInt &N1, const APInt &N2) -> std::optional<APInt> {
+ bool IsOverflow;
+ APInt Res = N1.umul_ov(N2, IsOverflow);
+ if (IsOverflow)
+ return std::nullopt;
+ return Res;
+ },
+ PreferSmallest, CheckCorrectnessOnly);
+ TestBinaryOpExhaustive(
+ [](const ConstantRange &CR1, const ConstantRange &CR2) {
+ return CR1.multiplyWithNoWrap(CR2, OBO::NoSignedWrap);
+ },
+ [](const APInt &N1, const APInt &N2) -> std::optional<APInt> {
+ bool IsOverflow;
+ APInt Res = N1.smul_ov(N2, IsOverflow);
+ if (IsOverflow)
+ return std::nullopt;
+ return Res;
+ },
+ PreferSmallest, CheckCorrectnessOnly);
+ TestBinaryOpExhaustive(
+ [](const ConstantRange &CR1, const ConstantRange &CR2) {
+ return CR1.multiplyWithNoWrap(CR2,
+ OBO::NoUnsignedWrap | OBO::NoSignedWrap);
+ },
+ [](const APInt &N1, const APInt &N2) -> std::optional<APInt> {
+ bool IsOverflow1, IsOverflow2;
+ APInt Res1 = N1.umul_ov(N2, IsOverflow1);
+ APInt Res2 = N1.smul_ov(N2, IsOverflow2);
+ if (IsOverflow1 || IsOverflow2)
+ return std::nullopt;
+ assert(Res1 == Res2 && "Multiplication results differ?");
+ return Res1;
+ },
+ PreferSmallest, CheckCorrectnessOnly);
+}
+
TEST_F(ConstantRangeTest, smul_fast) {
TestBinaryOpExhaustive(
[](const ConstantRange &CR1, const ConstantRange &CR2) {
return CR1.smul_fast(CR2);
},
- [](const APInt &N1, const APInt &N2) {
- return N1 * N2;
- },
- PreferSmallest,
- [](const ConstantRange &, const ConstantRange &) {
- return false; // Check correctness only.
- });
+ [](const APInt &N1, const APInt &N2) { return N1 * N2; }, PreferSmallest,
+ CheckCorrectnessOnly);
}
TEST_F(ConstantRangeTest, UMax) {
diff --git a/llvm/unittests/IR/MDBuilderTest.cpp b/llvm/unittests/IR/MDBuilderTest.cpp
index 2b5ab81..4656c70 100644
--- a/llvm/unittests/IR/MDBuilderTest.cpp
+++ b/llvm/unittests/IR/MDBuilderTest.cpp
@@ -127,4 +127,43 @@ TEST_F(MDBuilderTest, createPCSections) {
EXPECT_EQ(mdconst::extract<ConstantInt>(Aux->getOperand(1))->getValue(),
C2->getValue());
}
+TEST_F(MDBuilderTest, createCallbackAndMerge) {
+ MDBuilder MDHelper(Context);
+ auto *CB1 = MDHelper.createCallbackEncoding(0, {1, -1}, false);
+ auto *CB2 = MDHelper.createCallbackEncoding(2, {-1}, false);
+ ASSERT_EQ(CB1->getNumOperands(), 4U);
+ ASSERT_TRUE(isa<ConstantAsMetadata>(CB1->getOperand(0)));
+ ASSERT_TRUE(isa<ConstantAsMetadata>(CB1->getOperand(1)));
+ ASSERT_TRUE(isa<ConstantAsMetadata>(CB1->getOperand(2)));
+ ASSERT_TRUE(isa<ConstantAsMetadata>(CB1->getOperand(3)));
+ EXPECT_EQ(mdconst::extract<ConstantInt>(CB1->getOperand(0))->getValue(), 0);
+ EXPECT_EQ(mdconst::extract<ConstantInt>(CB1->getOperand(1))->getValue(), 1);
+ EXPECT_EQ(mdconst::extract<ConstantInt>(CB1->getOperand(2))->getValue(), -1);
+ EXPECT_EQ(mdconst::extract<ConstantInt>(CB1->getOperand(3))->getValue(),
+ false);
+ ASSERT_EQ(CB2->getNumOperands(), 3U);
+ ASSERT_TRUE(isa<ConstantAsMetadata>(CB2->getOperand(0)));
+ ASSERT_TRUE(isa<ConstantAsMetadata>(CB2->getOperand(1)));
+ ASSERT_TRUE(isa<ConstantAsMetadata>(CB2->getOperand(2)));
+ EXPECT_EQ(mdconst::extract<ConstantInt>(CB2->getOperand(0))->getValue(), 2);
+ EXPECT_EQ(mdconst::extract<ConstantInt>(CB2->getOperand(1))->getValue(), -1);
+ EXPECT_EQ(mdconst::extract<ConstantInt>(CB2->getOperand(2))->getValue(),
+ false);
+ auto *CBList = MDNode::get(Context, {CB1, CB2});
+ auto *CB3 = MDHelper.createCallbackEncoding(4, {5}, false);
+ auto *NewCBList = MDHelper.mergeCallbackEncodings(CBList, CB3);
+ ASSERT_EQ(NewCBList->getNumOperands(), 3U);
+ EXPECT_TRUE(NewCBList->getOperand(0) == CB1);
+ EXPECT_TRUE(NewCBList->getOperand(1) == CB2);
+ EXPECT_TRUE(NewCBList->getOperand(2) == CB3);
+
+ ASSERT_EQ(CB3->getNumOperands(), 3U);
+ ASSERT_TRUE(isa<ConstantAsMetadata>(CB3->getOperand(0)));
+ ASSERT_TRUE(isa<ConstantAsMetadata>(CB3->getOperand(1)));
+ ASSERT_TRUE(isa<ConstantAsMetadata>(CB3->getOperand(2)));
+ EXPECT_EQ(mdconst::extract<ConstantInt>(CB3->getOperand(0))->getValue(), 4);
+ EXPECT_EQ(mdconst::extract<ConstantInt>(CB3->getOperand(1))->getValue(), 5);
+ EXPECT_EQ(mdconst::extract<ConstantInt>(CB3->getOperand(2))->getValue(),
+ false);
+}
} // namespace
diff --git a/llvm/unittests/ProfileData/BPFunctionNodeTest.cpp b/llvm/unittests/ProfileData/BPFunctionNodeTest.cpp
index 6af6f1b..24586b5 100644
--- a/llvm/unittests/ProfileData/BPFunctionNodeTest.cpp
+++ b/llvm/unittests/ProfileData/BPFunctionNodeTest.cpp
@@ -8,7 +8,6 @@
#include "llvm/ProfileData/InstrProf.h"
#include "llvm/Support/BalancedPartitioning.h"
-#include "llvm/Testing/Support/SupportHelpers.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
@@ -31,22 +30,32 @@ TEST(BPFunctionNodeTest, Basic) {
UnorderedElementsAreArray(UNs)));
};
- auto Nodes = TemporalProfTraceTy::createBPFunctionNodes({
- TemporalProfTraceTy({0, 1, 2, 3}),
- });
+ std::vector<BPFunctionNode> Nodes;
+ TemporalProfTraceTy::createBPFunctionNodes(
+ {TemporalProfTraceTy({0, 1, 2, 3})}, Nodes, /*RemoveOutlierUNs=*/false);
+ // Utility nodes that are too infrequent or too prevalent are filtered out.
EXPECT_THAT(Nodes,
UnorderedElementsAre(NodeIs(0, {0, 1, 2}), NodeIs(1, {1, 2}),
- NodeIs(2, {1, 2}), NodeIs(3, {2})));
+ NodeIs(2, {2}), NodeIs(3, {2})));
- Nodes = TemporalProfTraceTy::createBPFunctionNodes({
- TemporalProfTraceTy({0, 1, 2, 3, 4}),
- TemporalProfTraceTy({4, 2}),
- });
+ Nodes.clear();
+ TemporalProfTraceTy::createBPFunctionNodes(
+ {TemporalProfTraceTy({0, 1, 2, 3, 4}), TemporalProfTraceTy({4, 2})},
+ Nodes, /*RemoveOutlierUNs=*/false);
EXPECT_THAT(Nodes,
- UnorderedElementsAre(NodeIs(0, {0, 1, 2}), NodeIs(1, {1, 2}),
- NodeIs(2, {1, 2, 4, 5}), NodeIs(3, {2}),
- NodeIs(4, {2, 3, 4, 5})));
+ UnorderedElementsAre(NodeIs(0, {0, 1, 2, 3}),
+ NodeIs(1, {1, 2, 3}), NodeIs(2, {2, 3, 5}),
+ NodeIs(3, {2, 3}), NodeIs(4, {3, 4, 5})));
+
+ Nodes.clear();
+ TemporalProfTraceTy::createBPFunctionNodes(
+ {TemporalProfTraceTy({0, 1, 2, 3, 4}), TemporalProfTraceTy({4, 2})},
+ Nodes, /*RemoveOutlierUNs=*/true);
+
+ EXPECT_THAT(Nodes, UnorderedElementsAre(NodeIs(0, {1}), NodeIs(1, {1}),
+ NodeIs(2, {5}), NodeIs(3, {}),
+ NodeIs(4, {5})));
}
} // end namespace llvm
diff --git a/llvm/unittests/Support/LEB128Test.cpp b/llvm/unittests/Support/LEB128Test.cpp
index 60f5ddd..5aa7139 100644
--- a/llvm/unittests/Support/LEB128Test.cpp
+++ b/llvm/unittests/Support/LEB128Test.cpp
@@ -147,7 +147,8 @@ TEST(LEB128Test, DecodeULEB128) {
TEST(LEB128Test, DecodeInvalidULEB128) {
#define EXPECT_INVALID_ULEB128(VALUE, ERROR_OFFSET) \
do { \
- const uint8_t *Value = reinterpret_cast<const uint8_t *>(VALUE); \
+ const char *DefaultValue = VALUE; \
+ const uint8_t *Value = reinterpret_cast<const uint8_t *>(DefaultValue); \
const char *Error = nullptr; \
unsigned ErrorOffset = 0; \
uint64_t Actual = \
@@ -155,12 +156,13 @@ TEST(LEB128Test, DecodeInvalidULEB128) {
EXPECT_NE(Error, nullptr); \
EXPECT_EQ(0ul, Actual); \
EXPECT_EQ(ERROR_OFFSET, ErrorOffset); \
- Value = reinterpret_cast<const uint8_t *>(VALUE); \
+ Value = reinterpret_cast<const uint8_t *>(DefaultValue); \
Error = nullptr; \
Actual = decodeULEB128AndInc(Value, Value + strlen(VALUE), &Error); \
EXPECT_NE(Error, nullptr); \
EXPECT_EQ(0ul, Actual); \
- EXPECT_EQ(ERROR_OFFSET, Value - reinterpret_cast<const uint8_t *>(VALUE)); \
+ EXPECT_EQ(ERROR_OFFSET, \
+ Value - reinterpret_cast<const uint8_t *>(DefaultValue)); \
} while (0)
// Buffer overflow.
@@ -222,7 +224,8 @@ TEST(LEB128Test, DecodeSLEB128) {
TEST(LEB128Test, DecodeInvalidSLEB128) {
#define EXPECT_INVALID_SLEB128(VALUE, ERROR_OFFSET) \
do { \
- const uint8_t *Value = reinterpret_cast<const uint8_t *>(VALUE); \
+ const char *DefaultValue = VALUE; \
+ const uint8_t *Value = reinterpret_cast<const uint8_t *>(DefaultValue); \
const char *Error = nullptr; \
unsigned ErrorOffset = 0; \
uint64_t Actual = \
@@ -230,12 +233,13 @@ TEST(LEB128Test, DecodeInvalidSLEB128) {
EXPECT_NE(Error, nullptr); \
EXPECT_EQ(0ul, Actual); \
EXPECT_EQ(ERROR_OFFSET, ErrorOffset); \
- Value = reinterpret_cast<const uint8_t *>(VALUE); \
+ Value = reinterpret_cast<const uint8_t *>(DefaultValue); \
Error = nullptr; \
Actual = decodeSLEB128AndInc(Value, Value + strlen(VALUE), &Error); \
EXPECT_NE(Error, nullptr); \
EXPECT_EQ(0ul, Actual); \
- EXPECT_EQ(ERROR_OFFSET, Value - reinterpret_cast<const uint8_t *>(VALUE)); \
+ EXPECT_EQ(ERROR_OFFSET, \
+ Value - reinterpret_cast<const uint8_t *>(DefaultValue)); \
} while (0)
// Buffer overflow.
@@ -257,7 +261,9 @@ TEST(LEB128Test, DecodeInvalidSLEB128) {
TEST(LEB128Test, DecodeAndInc) {
#define EXPECT_LEB128(FUN, VALUE, SIZE) \
do { \
- const uint8_t *V = reinterpret_cast<const uint8_t *>(VALUE), *P = V; \
+ const char *DefaultValue = VALUE; \
+ const uint8_t *V = reinterpret_cast<const uint8_t *>(DefaultValue), \
+ *P = V; \
auto Expected = FUN(P), Actual = FUN##AndInc(P, P + strlen(VALUE)); \
EXPECT_EQ(Actual, Expected); \
EXPECT_EQ(P - V, SIZE); \
diff --git a/llvm/unittests/Support/raw_socket_stream_test.cpp b/llvm/unittests/Support/raw_socket_stream_test.cpp
index a853622..c4e8cfb 100644
--- a/llvm/unittests/Support/raw_socket_stream_test.cpp
+++ b/llvm/unittests/Support/raw_socket_stream_test.cpp
@@ -7,7 +7,6 @@
#include "llvm/Testing/Support/Error.h"
#include "gtest/gtest.h"
#include <future>
-#include <iostream>
#include <stdlib.h>
#include <thread>
@@ -86,13 +85,8 @@ TEST(raw_socket_streamTest, TIMEOUT_PROVIDED) {
std::chrono::milliseconds Timeout = std::chrono::milliseconds(100);
Expected<std::unique_ptr<raw_socket_stream>> MaybeServer =
ServerListener.accept(Timeout);
-
- ASSERT_THAT_EXPECTED(MaybeServer, Failed());
- llvm::Error Err = MaybeServer.takeError();
- llvm::handleAllErrors(std::move(Err), [&](const llvm::StringError &SE) {
- std::error_code EC = SE.convertToErrorCode();
- ASSERT_EQ(EC, std::errc::timed_out);
- });
+ ASSERT_EQ(llvm::errorToErrorCode(MaybeServer.takeError()),
+ std::errc::timed_out);
}
TEST(raw_socket_streamTest, FILE_DESCRIPTOR_CLOSED) {
@@ -122,12 +116,7 @@ TEST(raw_socket_streamTest, FILE_DESCRIPTOR_CLOSED) {
// Wait for the CloseThread to finish
CloseThread.join();
-
- ASSERT_THAT_EXPECTED(MaybeServer, Failed());
- llvm::Error Err = MaybeServer.takeError();
- llvm::handleAllErrors(std::move(Err), [&](const llvm::StringError &SE) {
- std::error_code EC = SE.convertToErrorCode();
- ASSERT_EQ(EC, std::errc::operation_canceled);
- });
+ ASSERT_EQ(llvm::errorToErrorCode(MaybeServer.takeError()),
+ std::errc::operation_canceled);
}
} // namespace
diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp
index 0455e06..797d7df 100644
--- a/llvm/unittests/TargetParser/TargetParserTest.cpp
+++ b/llvm/unittests/TargetParser/TargetParserTest.cpp
@@ -1996,7 +1996,6 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) {
AArch64::AEK_D128, AArch64::AEK_LSE128,
AArch64::AEK_SPECRES2, AArch64::AEK_RASV2,
AArch64::AEK_ITE, AArch64::AEK_GCS,
- AArch64::AEK_FPMR, AArch64::AEK_FP8,
AArch64::AEK_FAMINMAX, AArch64::AEK_FP8FMA,
AArch64::AEK_SSVE_FP8FMA, AArch64::AEK_FP8DOT2,
AArch64::AEK_SSVE_FP8DOT2, AArch64::AEK_FP8DOT4,
@@ -2005,7 +2004,8 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) {
AArch64::AEK_SMEF8F32, AArch64::AEK_SMEFA64,
AArch64::AEK_CPA, AArch64::AEK_PAUTHLR,
AArch64::AEK_TLBIW, AArch64::AEK_JSCVT,
- AArch64::AEK_FCMA,
+ AArch64::AEK_FCMA, AArch64::AEK_FP8,
+
};
std::vector<StringRef> Features;
@@ -2078,7 +2078,6 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) {
EXPECT_TRUE(llvm::is_contained(Features, "+specres2"));
EXPECT_TRUE(llvm::is_contained(Features, "+ite"));
EXPECT_TRUE(llvm::is_contained(Features, "+gcs"));
- EXPECT_TRUE(llvm::is_contained(Features, "+fpmr"));
EXPECT_TRUE(llvm::is_contained(Features, "+fp8"));
EXPECT_TRUE(llvm::is_contained(Features, "+faminmax"));
EXPECT_TRUE(llvm::is_contained(Features, "+fp8fma"));
@@ -2224,7 +2223,6 @@ TEST(TargetParserTest, AArch64ArchExtFeature) {
{"predres2", "nopredres2", "+specres2", "-specres2"},
{"rasv2", "norasv2", "+rasv2", "-rasv2"},
{"gcs", "nogcs", "+gcs", "-gcs"},
- {"fpmr", "nofpmr", "+fpmr", "-fpmr"},
{"fp8", "nofp8", "+fp8", "-fp8"},
{"faminmax", "nofaminmax", "+faminmax", "-faminmax"},
{"fp8fma", "nofp8fma", "+fp8fma", "-fp8fma"},
diff --git a/llvm/unittests/tools/llvm-mca/MCATestBase.cpp b/llvm/unittests/tools/llvm-mca/MCATestBase.cpp
index 4f444fae..4a39f5e 100644
--- a/llvm/unittests/tools/llvm-mca/MCATestBase.cpp
+++ b/llvm/unittests/tools/llvm-mca/MCATestBase.cpp
@@ -66,7 +66,7 @@ Error MCATestBase::runBaselineMCA(json::Object &Result, ArrayRef<MCInst> Insts,
// Default InstrumentManager
auto IM = std::make_unique<mca::InstrumentManager>(*STI, *MCII);
- mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM);
+ mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM, /*CallLatency=*/100);
const SmallVector<mca::Instrument *> Instruments;
SmallVector<std::unique_ptr<mca::Instruction>> LoweredInsts;
diff --git a/llvm/unittests/tools/llvm-mca/X86/TestIncrementalMCA.cpp b/llvm/unittests/tools/llvm-mca/X86/TestIncrementalMCA.cpp
index 00a44dc..ac35dce 100644
--- a/llvm/unittests/tools/llvm-mca/X86/TestIncrementalMCA.cpp
+++ b/llvm/unittests/tools/llvm-mca/X86/TestIncrementalMCA.cpp
@@ -33,7 +33,7 @@ TEST_F(X86TestBase, TestResumablePipeline) {
P->addEventListener(SV.get());
auto IM = std::make_unique<mca::InstrumentManager>(*STI, *MCII);
- mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM);
+ mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM, /*CallLatency=*/100);
const SmallVector<mca::Instrument *> Instruments;
// Tile size = 7
@@ -124,7 +124,7 @@ TEST_F(X86TestBase, TestInstructionRecycling) {
// Default InstrumentManager
auto IM = std::make_unique<mca::InstrumentManager>(*STI, *MCII);
- mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM);
+ mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM, /*CallLatency=*/100);
IB.setInstRecycleCallback(GetRecycledInst);
const SmallVector<mca::Instrument *> Instruments;
diff --git a/llvm/utils/TableGen/Common/CMakeLists.txt b/llvm/utils/TableGen/Common/CMakeLists.txt
index 699583f..13883aa 100644
--- a/llvm/utils/TableGen/Common/CMakeLists.txt
+++ b/llvm/utils/TableGen/Common/CMakeLists.txt
@@ -40,6 +40,7 @@ add_llvm_library(LLVMTableGenCommon STATIC OBJECT EXCLUDE_FROM_ALL
DEPENDS
vt_gen
+ intrinsics_gen
)
# Users may include its headers as "Common/*.h"
diff --git a/llvm/utils/TableGen/Common/CodeGenTarget.cpp b/llvm/utils/TableGen/Common/CodeGenTarget.cpp
index e1cf33e..bc3ccd8 100644
--- a/llvm/utils/TableGen/Common/CodeGenTarget.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenTarget.cpp
@@ -63,212 +63,9 @@ StringRef llvm::getName(MVT::SimpleValueType T) {
StringRef llvm::getEnumName(MVT::SimpleValueType T) {
// clang-format off
switch (T) {
- case MVT::Other: return "MVT::Other";
- case MVT::i1: return "MVT::i1";
- case MVT::i2: return "MVT::i2";
- case MVT::i4: return "MVT::i4";
- case MVT::i8: return "MVT::i8";
- case MVT::i16: return "MVT::i16";
- case MVT::i32: return "MVT::i32";
- case MVT::i64: return "MVT::i64";
- case MVT::i128: return "MVT::i128";
- case MVT::Any: return "MVT::Any";
- case MVT::iAny: return "MVT::iAny";
- case MVT::fAny: return "MVT::fAny";
- case MVT::vAny: return "MVT::vAny";
- case MVT::f16: return "MVT::f16";
- case MVT::bf16: return "MVT::bf16";
- case MVT::f32: return "MVT::f32";
- case MVT::f64: return "MVT::f64";
- case MVT::f80: return "MVT::f80";
- case MVT::f128: return "MVT::f128";
- case MVT::ppcf128: return "MVT::ppcf128";
- case MVT::x86mmx: return "MVT::x86mmx";
- case MVT::x86amx: return "MVT::x86amx";
- case MVT::aarch64svcount: return "MVT::aarch64svcount";
- case MVT::i64x8: return "MVT::i64x8";
- case MVT::Glue: return "MVT::Glue";
- case MVT::isVoid: return "MVT::isVoid";
- case MVT::v1i1: return "MVT::v1i1";
- case MVT::v2i1: return "MVT::v2i1";
- case MVT::v3i1: return "MVT::v3i1";
- case MVT::v4i1: return "MVT::v4i1";
- case MVT::v8i1: return "MVT::v8i1";
- case MVT::v16i1: return "MVT::v16i1";
- case MVT::v32i1: return "MVT::v32i1";
- case MVT::v64i1: return "MVT::v64i1";
- case MVT::v128i1: return "MVT::v128i1";
- case MVT::v256i1: return "MVT::v256i1";
- case MVT::v512i1: return "MVT::v512i1";
- case MVT::v1024i1: return "MVT::v1024i1";
- case MVT::v2048i1: return "MVT::v2048i1";
- case MVT::v128i2: return "MVT::v128i2";
- case MVT::v256i2: return "MVT::v256i2";
- case MVT::v64i4: return "MVT::v64i4";
- case MVT::v128i4: return "MVT::v128i4";
- case MVT::v1i8: return "MVT::v1i8";
- case MVT::v2i8: return "MVT::v2i8";
- case MVT::v3i8: return "MVT::v3i8";
- case MVT::v4i8: return "MVT::v4i8";
- case MVT::v8i8: return "MVT::v8i8";
- case MVT::v16i8: return "MVT::v16i8";
- case MVT::v32i8: return "MVT::v32i8";
- case MVT::v64i8: return "MVT::v64i8";
- case MVT::v128i8: return "MVT::v128i8";
- case MVT::v256i8: return "MVT::v256i8";
- case MVT::v512i8: return "MVT::v512i8";
- case MVT::v1024i8: return "MVT::v1024i8";
- case MVT::v1i16: return "MVT::v1i16";
- case MVT::v2i16: return "MVT::v2i16";
- case MVT::v3i16: return "MVT::v3i16";
- case MVT::v4i16: return "MVT::v4i16";
- case MVT::v8i16: return "MVT::v8i16";
- case MVT::v16i16: return "MVT::v16i16";
- case MVT::v32i16: return "MVT::v32i16";
- case MVT::v64i16: return "MVT::v64i16";
- case MVT::v128i16: return "MVT::v128i16";
- case MVT::v256i16: return "MVT::v256i16";
- case MVT::v512i16: return "MVT::v512i16";
- case MVT::v1i32: return "MVT::v1i32";
- case MVT::v2i32: return "MVT::v2i32";
- case MVT::v3i32: return "MVT::v3i32";
- case MVT::v4i32: return "MVT::v4i32";
- case MVT::v5i32: return "MVT::v5i32";
- case MVT::v6i32: return "MVT::v6i32";
- case MVT::v7i32: return "MVT::v7i32";
- case MVT::v8i32: return "MVT::v8i32";
- case MVT::v9i32: return "MVT::v9i32";
- case MVT::v10i32: return "MVT::v10i32";
- case MVT::v11i32: return "MVT::v11i32";
- case MVT::v12i32: return "MVT::v12i32";
- case MVT::v16i32: return "MVT::v16i32";
- case MVT::v32i32: return "MVT::v32i32";
- case MVT::v64i32: return "MVT::v64i32";
- case MVT::v128i32: return "MVT::v128i32";
- case MVT::v256i32: return "MVT::v256i32";
- case MVT::v512i32: return "MVT::v512i32";
- case MVT::v1024i32: return "MVT::v1024i32";
- case MVT::v2048i32: return "MVT::v2048i32";
- case MVT::v1i64: return "MVT::v1i64";
- case MVT::v2i64: return "MVT::v2i64";
- case MVT::v3i64: return "MVT::v3i64";
- case MVT::v4i64: return "MVT::v4i64";
- case MVT::v8i64: return "MVT::v8i64";
- case MVT::v16i64: return "MVT::v16i64";
- case MVT::v32i64: return "MVT::v32i64";
- case MVT::v64i64: return "MVT::v64i64";
- case MVT::v128i64: return "MVT::v128i64";
- case MVT::v256i64: return "MVT::v256i64";
- case MVT::v1i128: return "MVT::v1i128";
- case MVT::v1f16: return "MVT::v1f16";
- case MVT::v2f16: return "MVT::v2f16";
- case MVT::v3f16: return "MVT::v3f16";
- case MVT::v4f16: return "MVT::v4f16";
- case MVT::v8f16: return "MVT::v8f16";
- case MVT::v16f16: return "MVT::v16f16";
- case MVT::v32f16: return "MVT::v32f16";
- case MVT::v64f16: return "MVT::v64f16";
- case MVT::v128f16: return "MVT::v128f16";
- case MVT::v256f16: return "MVT::v256f16";
- case MVT::v512f16: return "MVT::v512f16";
- case MVT::v2bf16: return "MVT::v2bf16";
- case MVT::v3bf16: return "MVT::v3bf16";
- case MVT::v4bf16: return "MVT::v4bf16";
- case MVT::v8bf16: return "MVT::v8bf16";
- case MVT::v16bf16: return "MVT::v16bf16";
- case MVT::v32bf16: return "MVT::v32bf16";
- case MVT::v64bf16: return "MVT::v64bf16";
- case MVT::v128bf16: return "MVT::v128bf16";
- case MVT::v1f32: return "MVT::v1f32";
- case MVT::v2f32: return "MVT::v2f32";
- case MVT::v3f32: return "MVT::v3f32";
- case MVT::v4f32: return "MVT::v4f32";
- case MVT::v5f32: return "MVT::v5f32";
- case MVT::v6f32: return "MVT::v6f32";
- case MVT::v7f32: return "MVT::v7f32";
- case MVT::v8f32: return "MVT::v8f32";
- case MVT::v9f32: return "MVT::v9f32";
- case MVT::v10f32: return "MVT::v10f32";
- case MVT::v11f32: return "MVT::v11f32";
- case MVT::v12f32: return "MVT::v12f32";
- case MVT::v16f32: return "MVT::v16f32";
- case MVT::v32f32: return "MVT::v32f32";
- case MVT::v64f32: return "MVT::v64f32";
- case MVT::v128f32: return "MVT::v128f32";
- case MVT::v256f32: return "MVT::v256f32";
- case MVT::v512f32: return "MVT::v512f32";
- case MVT::v1024f32: return "MVT::v1024f32";
- case MVT::v2048f32: return "MVT::v2048f32";
- case MVT::v1f64: return "MVT::v1f64";
- case MVT::v2f64: return "MVT::v2f64";
- case MVT::v3f64: return "MVT::v3f64";
- case MVT::v4f64: return "MVT::v4f64";
- case MVT::v8f64: return "MVT::v8f64";
- case MVT::v16f64: return "MVT::v16f64";
- case MVT::v32f64: return "MVT::v32f64";
- case MVT::v64f64: return "MVT::v64f64";
- case MVT::v128f64: return "MVT::v128f64";
- case MVT::v256f64: return "MVT::v256f64";
- case MVT::nxv1i1: return "MVT::nxv1i1";
- case MVT::nxv2i1: return "MVT::nxv2i1";
- case MVT::nxv4i1: return "MVT::nxv4i1";
- case MVT::nxv8i1: return "MVT::nxv8i1";
- case MVT::nxv16i1: return "MVT::nxv16i1";
- case MVT::nxv32i1: return "MVT::nxv32i1";
- case MVT::nxv64i1: return "MVT::nxv64i1";
- case MVT::nxv1i8: return "MVT::nxv1i8";
- case MVT::nxv2i8: return "MVT::nxv2i8";
- case MVT::nxv4i8: return "MVT::nxv4i8";
- case MVT::nxv8i8: return "MVT::nxv8i8";
- case MVT::nxv16i8: return "MVT::nxv16i8";
- case MVT::nxv32i8: return "MVT::nxv32i8";
- case MVT::nxv64i8: return "MVT::nxv64i8";
- case MVT::nxv1i16: return "MVT::nxv1i16";
- case MVT::nxv2i16: return "MVT::nxv2i16";
- case MVT::nxv4i16: return "MVT::nxv4i16";
- case MVT::nxv8i16: return "MVT::nxv8i16";
- case MVT::nxv16i16: return "MVT::nxv16i16";
- case MVT::nxv32i16: return "MVT::nxv32i16";
- case MVT::nxv1i32: return "MVT::nxv1i32";
- case MVT::nxv2i32: return "MVT::nxv2i32";
- case MVT::nxv4i32: return "MVT::nxv4i32";
- case MVT::nxv8i32: return "MVT::nxv8i32";
- case MVT::nxv16i32: return "MVT::nxv16i32";
- case MVT::nxv32i32: return "MVT::nxv32i32";
- case MVT::nxv1i64: return "MVT::nxv1i64";
- case MVT::nxv2i64: return "MVT::nxv2i64";
- case MVT::nxv4i64: return "MVT::nxv4i64";
- case MVT::nxv8i64: return "MVT::nxv8i64";
- case MVT::nxv16i64: return "MVT::nxv16i64";
- case MVT::nxv32i64: return "MVT::nxv32i64";
- case MVT::nxv1f16: return "MVT::nxv1f16";
- case MVT::nxv2f16: return "MVT::nxv2f16";
- case MVT::nxv4f16: return "MVT::nxv4f16";
- case MVT::nxv8f16: return "MVT::nxv8f16";
- case MVT::nxv16f16: return "MVT::nxv16f16";
- case MVT::nxv32f16: return "MVT::nxv32f16";
- case MVT::nxv1bf16: return "MVT::nxv1bf16";
- case MVT::nxv2bf16: return "MVT::nxv2bf16";
- case MVT::nxv4bf16: return "MVT::nxv4bf16";
- case MVT::nxv8bf16: return "MVT::nxv8bf16";
- case MVT::nxv16bf16: return "MVT::nxv16bf16";
- case MVT::nxv32bf16: return "MVT::nxv32bf16";
- case MVT::nxv1f32: return "MVT::nxv1f32";
- case MVT::nxv2f32: return "MVT::nxv2f32";
- case MVT::nxv4f32: return "MVT::nxv4f32";
- case MVT::nxv8f32: return "MVT::nxv8f32";
- case MVT::nxv16f32: return "MVT::nxv16f32";
- case MVT::nxv1f64: return "MVT::nxv1f64";
- case MVT::nxv2f64: return "MVT::nxv2f64";
- case MVT::nxv4f64: return "MVT::nxv4f64";
- case MVT::nxv8f64: return "MVT::nxv8f64";
- case MVT::token: return "MVT::token";
- case MVT::Metadata: return "MVT::Metadata";
- case MVT::iPTR: return "MVT::iPTR";
- case MVT::iPTRAny: return "MVT::iPTRAny";
- case MVT::Untyped: return "MVT::Untyped";
- case MVT::funcref: return "MVT::funcref";
- case MVT::externref: return "MVT::externref";
+#define GET_VT_ATTR(Ty, N, Sz, Any, Int, FP, Vec, Sc) \
+ case MVT::Ty: return "MVT::" # Ty;
+#include "llvm/CodeGen/GenVT.inc"
default: llvm_unreachable("ILLEGAL VALUE TYPE!");
}
// clang-format on
diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py
index 7da16e0..85c1294 100644
--- a/llvm/utils/UpdateTestChecks/common.py
+++ b/llvm/utils/UpdateTestChecks/common.py
@@ -1041,7 +1041,7 @@ class NamelessValue:
var = var.replace("-", "_")
return var.upper()
- def get_affixes_from_match(self, match: re.Match):
+ def get_affixes_from_match(self, match):
prefix = re.match(self.ir_prefix, match.group(2)).group(0)
suffix = re.search(self.ir_suffix + "$", match.group(2)).group(0)
return prefix, suffix
diff --git a/llvm/utils/gn/README.rst b/llvm/utils/gn/README.rst
index 9ca5450..52d03be 100644
--- a/llvm/utils/gn/README.rst
+++ b/llvm/utils/gn/README.rst
@@ -131,7 +131,7 @@ configure is used for three classes of feature checks:
For the last two points, it would be nice if LLVM didn't have a single
``config.h`` header, but one header per toggle. That way, when e.g.
-``llvm_enable_terminfo`` is toggled, only the 3 files caring about that setting
+``llvm_enable_zlib`` is toggled, only the 3 files caring about that setting
would need to be rebuilt, instead of everything including ``config.h``.
GN doesn't believe in users setting arbitrary cflags from an environment
diff --git a/llvm/utils/gn/build/libs/terminfo/BUILD.gn b/llvm/utils/gn/build/libs/terminfo/BUILD.gn
deleted file mode 100644
index 10003d6..0000000
--- a/llvm/utils/gn/build/libs/terminfo/BUILD.gn
+++ /dev/null
@@ -1,12 +0,0 @@
-import("//llvm/utils/gn/build/libs/terminfo/enable.gni")
-
-config("terminfo_config") {
- visibility = [ ":terminfo" ]
- libs = [ "ncurses" ]
-}
-
-group("terminfo") {
- if (llvm_enable_terminfo) {
- public_configs = [ ":terminfo_config" ]
- }
-}
diff --git a/llvm/utils/gn/build/libs/terminfo/enable.gni b/llvm/utils/gn/build/libs/terminfo/enable.gni
deleted file mode 100644
index 79ea2b6..0000000
--- a/llvm/utils/gn/build/libs/terminfo/enable.gni
+++ /dev/null
@@ -1,4 +0,0 @@
-declare_args() {
- # Whether to link against terminfo.
- llvm_enable_terminfo = false
-}
diff --git a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
index 0a7cc38..c312c86 100644
--- a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
@@ -140,12 +140,10 @@ copy("Headers") {
"avx512bwintrin.h",
"avx512cdintrin.h",
"avx512dqintrin.h",
- "avx512erintrin.h",
"avx512fintrin.h",
"avx512fp16intrin.h",
"avx512ifmaintrin.h",
"avx512ifmavlintrin.h",
- "avx512pfintrin.h",
"avx512vbmi2intrin.h",
"avx512vbmiintrin.h",
"avx512vbmivlintrin.h",
diff --git a/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn
index 188c718..9075ada 100644
--- a/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn
@@ -95,6 +95,7 @@ static_library("Sema") {
"SemaTemplateInstantiateDecl.cpp",
"SemaTemplateVariadic.cpp",
"SemaType.cpp",
+ "SemaX86.cpp",
"TypeLocBuilder.cpp",
]
}
diff --git a/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Checkers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Checkers/BUILD.gn
index da48149..3ae50b2 100644
--- a/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Checkers/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Checkers/BUILD.gn
@@ -104,6 +104,7 @@ static_library("Checkers") {
"PointerSortingChecker.cpp",
"PointerSubChecker.cpp",
"PthreadLockChecker.cpp",
+ "PutenvStackArrayChecker.cpp",
"RetainCountChecker/RetainCountChecker.cpp",
"RetainCountChecker/RetainCountDiagnostics.cpp",
"ReturnPointerRangeChecker.cpp",
@@ -111,6 +112,7 @@ static_library("Checkers") {
"ReturnValueChecker.cpp",
"RunLoopAutoreleaseLeakChecker.cpp",
"STLAlgorithmModeling.cpp",
+ "SetgidSetuidOrderChecker.cpp",
"SimpleStreamChecker.cpp",
"SmartPtrChecker.cpp",
"SmartPtrModeling.cpp",
@@ -147,6 +149,5 @@ static_library("Checkers") {
"WebKit/UncountedLambdaCapturesChecker.cpp",
"WebKit/UncountedLocalVarsChecker.cpp",
"cert/InvalidPtrChecker.cpp",
- "cert/PutenvWithAutoChecker.cpp",
]
}
diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
index 210b26e..c51e4bf 100644
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -294,6 +294,7 @@ if (current_toolchain == default_toolchain) {
"__atomic/atomic_flag.h",
"__atomic/atomic_init.h",
"__atomic/atomic_lock_free.h",
+ "__atomic/atomic_ref.h",
"__atomic/atomic_sync.h",
"__atomic/check_memory_order.h",
"__atomic/contention_t.h",
@@ -302,6 +303,7 @@ if (current_toolchain == default_toolchain) {
"__atomic/is_always_lock_free.h",
"__atomic/kill_dependency.h",
"__atomic/memory_order.h",
+ "__atomic/to_gcc_order.h",
"__availability",
"__bit/bit_cast.h",
"__bit/bit_ceil.h",
diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
index 80a9150..e93130e 100644
--- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
@@ -10,7 +10,6 @@ import("//llvm/utils/gn/build/buildflags.gni")
import("//llvm/utils/gn/build/libs/curl/enable.gni")
import("//llvm/utils/gn/build/libs/edit/enable.gni")
import("//llvm/utils/gn/build/libs/pthread/enable.gni")
-import("//llvm/utils/gn/build/libs/terminfo/enable.gni")
import("//llvm/utils/gn/build/libs/xar/enable.gni")
import("//llvm/utils/gn/build/libs/xml/enable.gni")
import("//llvm/utils/gn/build/libs/zlib/enable.gni")
@@ -294,12 +293,6 @@ write_cmake_config("config") {
values += [ "HAVE_LIBEDIT=" ]
}
- if (llvm_enable_terminfo) {
- values += [ "LLVM_ENABLE_TERMINFO=1" ]
- } else {
- values += [ "LLVM_ENABLE_TERMINFO=" ]
- }
-
if (llvm_enable_libxml2) {
values += [ "LLVM_ENABLE_LIBXML2=1" ]
} else {
diff --git a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn
index 941d448..7728455 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn
@@ -6,7 +6,6 @@ static_library("Support") {
"//llvm/include/llvm/Support:write_vcsrevision",
"//llvm/lib/Demangle",
"//llvm/utils/gn/build/libs/pthread",
- "//llvm/utils/gn/build/libs/terminfo",
"//llvm/utils/gn/build/libs/zlib",
]
diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-config/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-config/BUILD.gn
index bf50cd0f..711e4e3 100644
--- a/llvm/utils/gn/secondary/llvm/tools/llvm-config/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/tools/llvm-config/BUILD.gn
@@ -1,7 +1,6 @@
import("//llvm/lib/Target/targets_string.gni")
import("//llvm/utils/gn/build/buildflags.gni")
import("//llvm/utils/gn/build/libs/pthread/enable.gni")
-import("//llvm/utils/gn/build/libs/terminfo/enable.gni")
import("//llvm/utils/gn/build/libs/xml/enable.gni")
import("//llvm/utils/gn/build/libs/zlib/enable.gni")
import("//llvm/utils/gn/build/write_cmake_config.gni")
@@ -36,7 +35,7 @@ write_cmake_config("BuildVariables.inc") {
lib = ""
}
- # Windows doesn't use any of libxml2, terminfo, zlib by default.
+  # Windows doesn't use libxml2 or zlib by default.
# Make GN not warn about these variables being unused.
not_needed([
"l",
@@ -63,9 +62,6 @@ write_cmake_config("BuildVariables.inc") {
if (llvm_enable_libxml2) {
system_libs += " ${l}xml2${lib}"
}
- if (llvm_enable_terminfo) {
- system_libs += " ${l}ncurses${lib}"
- }
if (llvm_enable_zlib) {
system_libs += " ${l}z${lib}"
}
diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt
index 4c0ef83..9f0b0d6 100644
--- a/mlir/CMakeLists.txt
+++ b/mlir/CMakeLists.txt
@@ -110,14 +110,6 @@ else()
set(MLIR_ENABLE_EXECUTION_ENGINE 0)
endif()
-# Build the CUDA conversions and run according tests if the NVPTX backend
-# is available
-if ("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD)
- set(MLIR_ENABLE_CUDA_CONVERSIONS 1)
-else()
- set(MLIR_ENABLE_CUDA_CONVERSIONS 0)
-endif()
-
# Build the ROCm conversions and run according tests if the AMDGPU backend
# is available.
if ("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD)
diff --git a/mlir/docs/PassManagement.md b/mlir/docs/PassManagement.md
index e9ecb99..7b19a7b 100644
--- a/mlir/docs/PassManagement.md
+++ b/mlir/docs/PassManagement.md
@@ -1359,6 +1359,45 @@ func.func @simple_constant() -> (i32, i32) {
}
```
+* `mlir-print-ir-tree-dir=(directory path)`
+ * Without setting this option, the IR printed by the instrumentation will
+ be printed to `stderr`. If you provide a directory using this option,
+ the output corresponding to each pass will be printed to a file in the
+ directory tree rooted at `(directory path)`. The path created for each
+ pass reflects the nesting structure of the IR and the pass pipeline.
+ * The below example illustrates the file tree created by running a pass
+ pipeline on IR that has two `func.func` located within two nested
+ `builtin.module` ops.
+ * The subdirectories are given names that reflect the parent op names and
+ the symbol names for those ops (if present).
+ * The printer keeps a counter associated with ops that are targeted by
+ passes and their isolated-from-above parents. Each filename is given a
+ numeric prefix using the counter value for the op that the pass is
+ targeting. The counter values for each parent are then prepended. This
+ gives a naming where it is easy to distinguish which passes may have run
+ concurrently versus which have a clear ordering. In the example below, for
+ both `1_1_pass4.mlir` files, the first 1 refers to the counter for the
+ parent op, and the second refers to the counter for the respective
+ function.
+
+```
+$ pipeline="builtin.module(pass1,pass2,func.func(pass3,pass4),pass5)"
+$ mlir-opt foo.mlir -pass-pipeline="$pipeline" -mlir-print-ir-tree-dir=/tmp/pipeline_output
+$ tree /tmp/pipeline_output
+
+/tmp/pipeline_output
+├── builtin_module_the_symbol_name
+│ ├── 0_pass1.mlir
+│ ├── 1_pass2.mlir
+│ ├── 2_pass5.mlir
+│ ├── func_func_my_func_name
+│ │ ├── 1_0_pass3.mlir
+│ │ ├── 1_1_pass4.mlir
+│ ├── func_func_my_other_func_name
+│ │ ├── 1_0_pass3.mlir
+│ │ ├── 1_1_pass4.mlir
+```
+
## Crash and Failure Reproduction
The [pass manager](#pass-manager) in MLIR contains a builtin mechanism to
diff --git a/mlir/include/mlir-c/Debug.h b/mlir/include/mlir-c/Debug.h
index 2502f2f..7dad735 100644
--- a/mlir/include/mlir-c/Debug.h
+++ b/mlir/include/mlir-c/Debug.h
@@ -21,6 +21,19 @@ MLIR_CAPI_EXPORTED void mlirEnableGlobalDebug(bool enable);
/// Returns `true` if the global debugging flag is set, false otherwise.
MLIR_CAPI_EXPORTED bool mlirIsGlobalDebugEnabled();
+/// Sets the current debug type, similarly to `-debug-only=type` in the
+/// command-line tools. Note that global debug should be enabled for any output
+/// to be produced.
+MLIR_CAPI_EXPORTED void mlirSetGlobalDebugType(const char *type);
+
+/// Sets multiple current debug types, similarly to `-debug-only=type1,type2` in
+/// the command-line tools. Note that global debug should be enabled for any
+/// output to be produced.
+MLIR_CAPI_EXPORTED void mlirSetGlobalDebugTypes(const char **types, intptr_t n);
+
+/// Checks if `type` is set as the current debug type.
+MLIR_CAPI_EXPORTED bool mlirIsCurrentDebugType(const char *type);
+
#ifdef __cplusplus
}
#endif
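
A minimal usage sketch for the debug-type C API declared above; it is illustrative only and assumes a translation unit linking MLIR's C API, with the debug-type strings being arbitrary example names rather than anything defined by this change:

// Sketch: exercising the new mlirSetGlobalDebugTypes / mlirIsCurrentDebugType API.
#include "mlir-c/Debug.h"

int main() {
  // Global debug must be enabled for any -debug-only style output to appear.
  mlirEnableGlobalDebug(true);
  const char *types[] = {"dialect-conversion", "greedy-rewriter"};
  mlirSetGlobalDebugTypes(types, 2);
  if (mlirIsCurrentDebugType("dialect-conversion")) {
    // Debug output tagged with this type will now be produced.
  }
  return 0;
}
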
diff --git a/mlir/include/mlir/Analysis/SliceAnalysis.h b/mlir/include/mlir/Analysis/SliceAnalysis.h
index d5cdf72..99279fd 100644
--- a/mlir/include/mlir/Analysis/SliceAnalysis.h
+++ b/mlir/include/mlir/Analysis/SliceAnalysis.h
@@ -223,11 +223,6 @@ SetVector<Operation *>
getSlice(Operation *op, const BackwardSliceOptions &backwardSliceOptions = {},
const ForwardSliceOptions &forwardSliceOptions = {});
-/// Multi-root DAG topological sort.
-/// Performs a topological sort of the Operation in the `toSort` SetVector.
-/// Returns a topologically sorted SetVector.
-SetVector<Operation *> topologicalSort(const SetVector<Operation *> &toSort);
-
/// Utility to match a generic reduction given a list of iteration-carried
/// arguments, `iterCarriedArgs` and the position of the potential reduction
/// argument within the list, `redPos`. If a reduction is matched, returns the
diff --git a/mlir/include/mlir/Transforms/TopologicalSortUtils.h b/mlir/include/mlir/Analysis/TopologicalSortUtils.h
index 74e44b1..ee98cd8 100644
--- a/mlir/include/mlir/Transforms/TopologicalSortUtils.h
+++ b/mlir/include/mlir/Analysis/TopologicalSortUtils.h
@@ -6,8 +6,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef MLIR_TRANSFORMS_TOPOLOGICALSORTUTILS_H
-#define MLIR_TRANSFORMS_TOPOLOGICALSORTUTILS_H
+#ifndef MLIR_ANALYSIS_TOPOLOGICALSORTUTILS_H
+#define MLIR_ANALYSIS_TOPOLOGICALSORTUTILS_H
#include "mlir/IR/Block.h"
@@ -104,6 +104,14 @@ bool computeTopologicalSorting(
MutableArrayRef<Operation *> ops,
function_ref<bool(Value, Operation *)> isOperandReady = nullptr);
+/// Gets a list of blocks that is sorted according to dominance. This sort is
+/// stable.
+SetVector<Block *> getBlocksSortedByDominance(Region &region);
+
+/// Sorts all operations in `toSort` topologically while also considering region
+/// semantics. Does not support multi-sets.
+SetVector<Operation *> topologicalSort(const SetVector<Operation *> &toSort);
+
} // end namespace mlir
-#endif // MLIR_TRANSFORMS_TOPOLOGICALSORTUTILS_H
+#endif // MLIR_ANALYSIS_TOPOLOGICALSORTUTILS_H
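
A rough sketch of calling the relocated topologicalSort utility from its new home under mlir/Analysis; the enclosing helper and the op-collection strategy are assumptions made for illustration, not part of this change:

// Sketch: topologically order all ops nested under `root`.
#include "mlir/Analysis/TopologicalSortUtils.h"
#include "mlir/IR/Operation.h"
#include "llvm/ADT/SetVector.h"

llvm::SetVector<mlir::Operation *> sortNestedOps(mlir::Operation *root) {
  llvm::SetVector<mlir::Operation *> toSort;
  // Collect the ops to order; a multi-root DAG is acceptable input.
  root->walk([&](mlir::Operation *op) { toSort.insert(op); });
  // The result orders producers before their consumers, taking region
  // semantics into account as documented above.
  return mlir::topologicalSort(toSort);
}
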
diff --git a/mlir/include/mlir/Config/mlir-config.h.cmake b/mlir/include/mlir/Config/mlir-config.h.cmake
index 9339ce0..abd6f41 100644
--- a/mlir/include/mlir/Config/mlir-config.h.cmake
+++ b/mlir/include/mlir/Config/mlir-config.h.cmake
@@ -39,10 +39,6 @@
/* If set, enables PDL usage. */
#cmakedefine01 MLIR_ENABLE_PDL_IN_PATTERNMATCH
-/* If set, enables CUDA-related features in CUDA-related transforms, pipelines,
- and targets. */
-#cmakedefine01 MLIR_ENABLE_CUDA_CONVERSIONS
-
/* If set, enables features that depend on the NVIDIA's PTX compiler. */
#cmakedefine01 MLIR_ENABLE_NVPTXCOMPILER
diff --git a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
index 4e4c6fd..ead5233 100644
--- a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
+++ b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
@@ -1540,6 +1540,18 @@ def Arith_CmpFOp : Arith_CompareOp<"cmpf",
// SelectOp
//===----------------------------------------------------------------------===//
+class AnyBooleanTypeMatch<list<string> names> :
+ AnyMatchOperatorTrait<names, "$_self.getType().isSignlessInteger(1)",
+ "scalar type">;
+
+class ScalarConditionOrMatchingShape<list<string> names> :
+ PredOpTrait<
+ !head(names) # " is scalar or has matching shape",
+ Or<[AnyBooleanTypeMatch<[!head(names)]>.predicate,
+ AllShapesMatch<names>.predicate]>> {
+ list<string> values = names;
+}
+
def SelectOp : Arith_Op<"select", [Pure,
AllTypesMatch<["true_value", "false_value", "result"]>,
ScalarConditionOrMatchingShape<["condition", "result"]>,
@@ -1548,16 +1560,16 @@ def SelectOp : Arith_Op<"select", [Pure,
let summary = "select operation";
let description = [{
The `arith.select` operation chooses one value based on a binary condition
- supplied as its first operand.
-
- If the value of the first operand (the condition) is `1`, then the second
- operand is returned, and the third operand is ignored, even if it was poison.
-
- If the value of the first operand (the condition) is `0`, then the third
- operand is returned, and the second operand is ignored, even if it was poison.
-
- If the value of the first operand (the condition) is poison, then the
- operation returns poison.
+ supplied as its first operand.
+
+ If the value of the first operand (the condition) is `1`, then the second
+ operand is returned, and the third operand is ignored, even if it was poison.
+
+ If the value of the first operand (the condition) is `0`, then the third
+ operand is returned, and the second operand is ignored, even if it was poison.
+
+ If the value of the first operand (the condition) is poison, then the
+ operation returns poison.
The operation applies to vectors and tensors elementwise given the _shape_
of all operands is identical. The choice is made for each element
diff --git a/mlir/include/mlir/Dialect/CommonFolders.h b/mlir/include/mlir/Dialect/CommonFolders.h
index 7dabc78..6f497a2 100644
--- a/mlir/include/mlir/Dialect/CommonFolders.h
+++ b/mlir/include/mlir/Dialect/CommonFolders.h
@@ -298,7 +298,10 @@ Attribute constFoldCastOp(ArrayRef<Attribute> operands, Type resType,
calculate(op.getSplatValue<ElementValueT>(), castStatus);
if (!castStatus)
return {};
- return DenseElementsAttr::get(cast<ShapedType>(resType), elementResult);
+ auto shapedResType = cast<ShapedType>(resType);
+ if (!shapedResType.hasStaticShape())
+ return {};
+ return DenseElementsAttr::get(shapedResType, elementResult);
}
if (auto op = dyn_cast<ElementsAttr>(operands[0])) {
// Operand is ElementsAttr-derived; perform an element-wise fold by
diff --git a/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.h b/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.h
index 13e10b2..a7bf879 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.h
@@ -13,7 +13,6 @@
#include "mlir/Dialect/LLVMIR/Transforms/LegalizeForExport.h"
#include "mlir/Dialect/LLVMIR/Transforms/OptimizeForNVVM.h"
#include "mlir/Dialect/LLVMIR/Transforms/RequestCWrappers.h"
-#include "mlir/Dialect/LLVMIR/Transforms/TypeConsistency.h"
#include "mlir/Pass/Pass.h"
namespace mlir {
diff --git a/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td b/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td
index 0242cfd..11d1b94 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td
@@ -43,24 +43,6 @@ def LLVMRequestCWrappers
let constructor = "::mlir::LLVM::createRequestCWrappersPass()";
}
-def LLVMTypeConsistency
- : Pass<"llvm-type-consistency", "::mlir::LLVM::LLVMFuncOp"> {
- let summary = "Rewrites to improve type consistency";
- let description = [{
- Set of rewrites to improve the coherency of types within an LLVM dialect
- program. This will adjust operations operating on pointers so they interpret
- their associated pointee type as consistently as possible.
- }];
- let constructor = "::mlir::LLVM::createTypeConsistencyPass()";
-
- let options = [
- Option<"maxVectorSplitSize", "max-vector-split-size", "unsigned",
- /*default=*/"512",
- "Maximum size in bits of a vector value in a load or store operation"
- " operating on multiple elements that should still be split">,
- ];
-}
-
def NVVMOptimizeForTarget : Pass<"llvm-optimize-for-nvvm-target"> {
let summary = "Optimize NVVM IR";
let constructor = "::mlir::NVVM::createOptimizeForTargetPass()";
diff --git a/mlir/include/mlir/Dialect/LLVMIR/Transforms/TypeConsistency.h b/mlir/include/mlir/Dialect/LLVMIR/Transforms/TypeConsistency.h
deleted file mode 100644
index a4bb380..0000000
--- a/mlir/include/mlir/Dialect/LLVMIR/Transforms/TypeConsistency.h
+++ /dev/null
@@ -1,73 +0,0 @@
-//===- TypeConsistency.h - Rewrites to improve type consistency -----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Set of rewrites to improve the coherency of types within an LLVM dialect
-// program. This will adjust operations around a given pointer so they interpret
-// its pointee type as consistently as possible.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MLIR_DIALECT_LLVMIR_TRANSFORMS_TYPECONSISTENCY_H
-#define MLIR_DIALECT_LLVMIR_TRANSFORMS_TYPECONSISTENCY_H
-
-#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
-#include "mlir/IR/PatternMatch.h"
-#include "mlir/Pass/Pass.h"
-
-namespace mlir {
-namespace LLVM {
-
-#define GEN_PASS_DECL_LLVMTYPECONSISTENCY
-#include "mlir/Dialect/LLVMIR/Transforms/Passes.h.inc"
-
-/// Creates a pass that adjusts operations operating on pointers so they
-/// interpret pointee types as consistently as possible.
-std::unique_ptr<Pass> createTypeConsistencyPass();
-
-/// Canonicalizes GEPs of which the base type and the pointer's type hint do not
-/// match. This is done by replacing the original GEP into a GEP with the type
-/// hint as a base type when an element of the hinted type aligns with the
-/// original GEP.
-class CanonicalizeAlignedGep : public OpRewritePattern<GEPOp> {
-public:
- using OpRewritePattern::OpRewritePattern;
-
- LogicalResult matchAndRewrite(GEPOp gep,
- PatternRewriter &rewriter) const override;
-};
-
-/// Splits stores which write into multiple adjacent elements of an aggregate
-/// through a pointer. Currently, integers and vector are split and stores
-/// are generated for every element being stored to in a type-consistent manner.
-/// This is done on a best-effort basis.
-class SplitStores : public OpRewritePattern<StoreOp> {
- unsigned maxVectorSplitSize;
-
-public:
- SplitStores(MLIRContext *context, unsigned maxVectorSplitSize)
- : OpRewritePattern(context), maxVectorSplitSize(maxVectorSplitSize) {}
-
- LogicalResult matchAndRewrite(StoreOp store,
- PatternRewriter &rewrite) const override;
-};
-
-/// Splits GEPs with more than two indices into multiple GEPs with exactly
-/// two indices. The created GEPs are then guaranteed to index into only
-/// one aggregate at a time.
-class SplitGEP : public OpRewritePattern<GEPOp> {
-public:
- using OpRewritePattern::OpRewritePattern;
-
- LogicalResult matchAndRewrite(GEPOp gepOp,
- PatternRewriter &rewriter) const override;
-};
-
-} // namespace LLVM
-} // namespace mlir
-
-#endif // MLIR_DIALECT_LLVMIR_TRANSFORMS_TYPECONSISTENCY_H
diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
index 5585ba2..93e2c2db 100644
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -1681,7 +1681,7 @@ def TileReductionUsingForOp : Op<Transform_Dialect, "structured.tile_reduction_u
// TODO: support mixed static-dynamic (see TileUsingForallOp).
let arguments = (ins TransformHandleTypeInterface:$target,
DefaultValuedAttr<DenseI64ArrayAttr, "{}">:$tile_sizes);
- let results = (outs TransformHandleTypeInterface:$fill_op,
+ let results = (outs Variadic<TransformHandleTypeInterface>:$fill_op,
TransformHandleTypeInterface:$split_linalg_op,
TransformHandleTypeInterface:$combining_linalg_op,
TransformHandleTypeInterface:$for_op);
@@ -1787,7 +1787,7 @@ def TileReductionUsingForallOp :
DefaultValuedAttr<DenseI64ArrayAttr, "{}">:$num_threads,
DefaultValuedAttr<DenseI64ArrayAttr, "{}">:$tile_sizes,
OptionalAttr<DeviceMappingArrayAttr>:$mapping);
- let results = (outs TransformHandleTypeInterface:$fill_op,
+ let results = (outs Variadic<TransformHandleTypeInterface>:$fill_op,
TransformHandleTypeInterface:$split_linalg_op,
TransformHandleTypeInterface:$combining_linalg_op,
TransformHandleTypeInterface:$forall_op);
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index f77c19e..308ce92 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -876,8 +876,8 @@ struct ForallReductionTilingResult {
Operation *parallelTiledOp;
/// The final reduction operation merging all the partial reductions.
Operation *mergeOp;
- /// The op initializing the tensor used for partial reductions.
- Operation *initialOp;
+ /// Initial values used for partial reductions.
+ SmallVector<Value> initialValues;
/// The `scf.forall` operation that iterate over the tiles.
scf::ForallOp loops;
};
diff --git a/mlir/include/mlir/Dialect/Mesh/IR/MeshBase.td b/mlir/include/mlir/Dialect/Mesh/IR/MeshBase.td
index 9d9b589..3a85bf2 100644
--- a/mlir/include/mlir/Dialect/Mesh/IR/MeshBase.td
+++ b/mlir/include/mlir/Dialect/Mesh/IR/MeshBase.td
@@ -151,7 +151,9 @@ def MeshSharding : AttrDef<Mesh_Dialect, "MeshSharding"> {
let extraClassDeclaration = [{
bool operator==(::mlir::Attribute rhs) const;
+ bool operator!=(::mlir::Attribute rhs) const;
bool operator==(::mlir::mesh::MeshShardingAttr rhs) const;
+ bool operator!=(::mlir::mesh::MeshShardingAttr rhs) const;
}];
let genVerifyDecl = 1;
diff --git a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h
index 4569b77..7a24c20 100644
--- a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h
+++ b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h
@@ -51,15 +51,26 @@ void removeTrailingEmptySubArray(SmallVector<SmallVector<T>> &array) {
// Is the same tensor replicated on all processes.
inline bool isFullReplication(MeshShardingAttr attr) {
- return attr.getPartialAxes().empty() && attr.getSplitAxes().empty();
+ return attr.getPartialAxes().empty() &&
+ llvm::all_of(attr.getSplitAxes(), [](MeshAxesAttr axes) {
+ return axes.asArrayRef().empty();
+ });
}
-inline mesh::MeshOp getMesh(Operation *op, FlatSymbolRefAttr meshSymbol,
- SymbolTableCollection &symbolTableCollection) {
+inline mesh::MeshOp
+getMeshOrNull(Operation *op, FlatSymbolRefAttr meshSymbol,
+ SymbolTableCollection &symbolTableCollection) {
return symbolTableCollection.lookupNearestSymbolFrom<mesh::MeshOp>(
op, meshSymbol);
}
+inline mesh::MeshOp getMesh(Operation *op, FlatSymbolRefAttr meshSymbol,
+ SymbolTableCollection &symbolTableCollection) {
+ mesh::MeshOp meshOp = getMeshOrNull(op, meshSymbol, symbolTableCollection);
+ assert(meshOp);
+ return meshOp;
+}
+
// Get the corresponding mesh op using the standard attribute nomenclature.
template <typename Op>
mesh::MeshOp getMesh(Op op, SymbolTableCollection &symbolTableCollection) {
@@ -128,6 +139,17 @@ ShapedType shardShapedType(ShapedType shape, MeshOp mesh,
// `sharding` in that case must be null.
Type shardType(Type type, MeshOp mesh, MeshShardingAttr sharding);
+// Insert a shard op if one with the same sharding is not already present.
+// May insert resharding if required.
+void maybeInsertTargetShardingAnnotation(MeshShardingAttr sharding,
+ OpOperand &operand,
+ OpBuilder &builder);
+void maybeInsertTargetShardingAnnotation(MeshShardingAttr sharding,
+ OpResult result, OpBuilder &builder);
+void maybeInsertSourceShardingAnnotation(MeshShardingAttr sharding,
+ OpOperand &operand,
+ OpBuilder &builder);
+
} // namespace mesh
} // namespace mlir
diff --git a/mlir/include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.h b/mlir/include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.h
index c47a7dd..216d7e1 100644
--- a/mlir/include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.h
+++ b/mlir/include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.h
@@ -37,6 +37,11 @@ struct ShardingOption {
ShardingOption() = default;
ShardingOption(ShardingArray shardingArray, FlatSymbolRefAttr mesh)
: shardingArray(std::move(shardingArray)), mesh(mesh) {}
+ static ShardingOption makeEmpty() {
+ auto res = ShardingOption();
+ res.empty = true;
+ return res;
+ }
};
// This method retrieves the 'MeshShardingAttr' attribute from a given operation
@@ -56,6 +61,10 @@ defaultGetShardingOption(Operation *op,
ArrayRef<MeshShardingAttr> operandShardings,
ArrayRef<MeshShardingAttr> resultShardings);
+FailureOr<SmallVector<MeshShardingAttr>>
+defaultGetShardingAnnotations(Operation *op,
+ const ShardingOption &shardingOption);
+
LogicalResult
defaultAddShardingAnnotations(Operation *op, OpBuilder &b,
const ShardingOption &shardingOption);
diff --git a/mlir/include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.td b/mlir/include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.td
index 1f75135..47a74f6 100644
--- a/mlir/include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.td
+++ b/mlir/include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.td
@@ -75,8 +75,11 @@ def ShardingInterface : OpInterface<"ShardingInterface"> {
InterfaceMethod<
/*desc=*/[{
Given that certain operands or results of the operation may have
- sharding annotations, this method leverages this information to deduce
- how the operation should be sharded.
+ sharding annotations, this method leverages this information to
+ deduce how the operation should be sharded.
+          The passed sharding may be incomplete; this gives the op the freedom
+          to select the most appropriate shardings for all of its operands and
+          results, as well as for the op itself.
}],
/*retTy=*/"FailureOr<ShardingOption>",
/*methodName=*/"getShardingOption",
@@ -92,6 +95,24 @@ def ShardingInterface : OpInterface<"ShardingInterface"> {
>,
InterfaceMethod<
/*desc=*/[{
+        Based on a given ShardingOption, compute the sharding annotations
+        for the operands and results, i.e. the shardings the operands and
+        results need to have in order to shard the op according to
+        shardingOption.
+ }],
+ /*retTy=*/"FailureOr<SmallVector<MeshShardingAttr>>",
+ /*methodName=*/"getShardingAnnotations",
+ /*args=*/(ins
+ "const ShardingOption &":$shardingOption
+ ),
+ /*methodBody=*/"",
+ /*defaultImplementation=*/[{
+ return detail::defaultGetShardingAnnotations(
+ $_op.getOperation(), shardingOption);
+ }]
+ >,
+ InterfaceMethod<
+ /*desc=*/[{
Based on a given ShardingOption, this method adds `mesh.shard`
operations for the operands and results that previously lacked
sharding annotations.
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index 122abbe..dc9ac2b 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -152,13 +152,9 @@ def ParallelOp : OpenMP_Op<"parallel", [
variable should be passed into the reduction region by value or by reference
in `reduction_vars_byref`. Each reduction is identified by the accumulator
it uses and accumulators must not be repeated in the same reduction. The
- `omp.reduction` operation accepts the accumulator and a partial value which
- is considered to be produced by the thread for the given reduction. If
- multiple values are produced for the same accumulator, i.e. there are
- multiple `omp.reduction`s, the last value is taken. The reduction
- declaration specifies how to combine the values from each thread into the
- final value, which is available in the accumulator after all the threads
- complete.
+ reduction declaration specifies how to combine the values from each thread
+ into the final value, which is available in the accumulator after all the
+ threads complete.
The optional $proc_bind_val attribute controls the thread affinity for the execution
of the parallel region.
@@ -307,13 +303,9 @@ def SectionsOp : OpenMP_Op<"sections", [AttrSizedOperandSegments,
accumulator variables in `reduction_vars` and symbols referring to reduction
declarations in the `reductions` attribute. Each reduction is identified
by the accumulator it uses and accumulators must not be repeated in the same
- reduction. The `omp.reduction` operation accepts the accumulator and a
- partial value which is considered to be produced by the section for the
- given reduction. If multiple values are produced for the same accumulator,
- i.e. there are multiple `omp.reduction`s, the last value is taken. The
- reduction declaration specifies how to combine the values from each section
- into the final value, which is available in the accumulator after all the
- sections complete.
+ reduction. The reduction declaration specifies how to combine the values
+ from each section into the final value, which is available in the
+ accumulator after all the sections complete.
The $allocators_vars and $allocate_vars parameters are a variadic list of values
that specify the memory allocator to be used to obtain storage for private values.
@@ -912,11 +904,7 @@ def TaskloopOp : OpenMP_Op<"taskloop", [AttrSizedOperandSegments,
variables in `reduction_vars` or `in_reduction_vars` and symbols referring
to reduction declarations in the `reductions` or `in_reductions` attribute.
Each reduction is identified by the accumulator it uses and accumulators
- must not be repeated in the same reduction. The `omp.reduction` operation
- accepts the accumulator and a partial value which is considered to be
- produced by the current loop iteration for the given reduction. If multiple
- values are produced for the same accumulator, i.e. there are multiple
- `omp.reduction`s, the last value is taken. The reduction declaration
+ must not be repeated in the same reduction. The reduction declaration
specifies how to combine the values from each iteration into the final
value, which is available in the accumulator after the loop completes.
@@ -2159,24 +2147,4 @@ def DeclareReductionOp : OpenMP_Op<"declare_reduction", [Symbol,
let hasRegionVerifier = 1;
}
-//===----------------------------------------------------------------------===//
-// 2.19.5.4 reduction clause
-//===----------------------------------------------------------------------===//
-
-def ReductionOp : OpenMP_Op<"reduction"> {
- let summary = "reduction construct";
- let description = [{
- Indicates the value that is produced by the current reduction-participating
- entity for a reduction requested in some ancestor. The reduction is
- identified by the accumulator, but the value of the accumulator may not be
- updated immediately.
- }];
-
- let arguments= (ins AnyType:$operand, OpenMP_PointerLikeType:$accumulator);
- let assemblyFormat = [{
- $operand `,` $accumulator attr-dict `:` type($operand) `,` type($accumulator)
- }];
- let hasVerifier = 1;
-}
-
#endif // OPENMP_OPS
diff --git a/mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.td b/mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.td
index 3ef899d..f99cbcc 100644
--- a/mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.td
+++ b/mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.td
@@ -52,8 +52,8 @@ def Polynomial_AddOp : Polynomial_BinaryOp<"add", [Commutative]> {
// add two polynomials modulo x^1024 - 1
#poly = #polynomial.int_polynomial<x**1024 - 1>
#ring = #polynomial.ring<coefficientType=i32, coefficientModulus=65536:i32, polynomialModulus=#poly>
- %0 = polynomial.constant #polynomial.int_polynomial<1 + x**2> : !polynomial.polynomial<#ring>
- %1 = polynomial.constant #polynomial.int_polynomial<x**5 - x + 1> : !polynomial.polynomial<#ring>
+ %0 = polynomial.constant int<1 + x**2> : !polynomial.polynomial<#ring>
+ %1 = polynomial.constant int<x**5 - x + 1> : !polynomial.polynomial<#ring>
%2 = polynomial.add %0, %1 : !polynomial.polynomial<#ring>
```
}];
@@ -76,8 +76,8 @@ def Polynomial_SubOp : Polynomial_BinaryOp<"sub"> {
// subtract two polynomials modulo x^1024 - 1
#poly = #polynomial.int_polynomial<x**1024 - 1>
#ring = #polynomial.ring<coefficientType=i32, coefficientModulus=65536:i32, polynomialModulus=#poly>
- %0 = polynomial.constant #polynomial.int_polynomial<1 + x**2> : !polynomial.polynomial<#ring>
- %1 = polynomial.constant #polynomial.int_polynomial<x**5 - x + 1> : !polynomial.polynomial<#ring>
+ %0 = polynomial.constant int<1 + x**2> : !polynomial.polynomial<#ring>
+ %1 = polynomial.constant int<x**5 - x + 1> : !polynomial.polynomial<#ring>
%2 = polynomial.sub %0, %1 : !polynomial.polynomial<#ring>
```
}];
@@ -101,8 +101,8 @@ def Polynomial_MulOp : Polynomial_BinaryOp<"mul", [Commutative]> {
// multiply two polynomials modulo x^1024 - 1
#poly = #polynomial.int_polynomial<x**1024 - 1>
#ring = #polynomial.ring<coefficientType=i32, coefficientModulus=65536:i32, polynomialModulus=#poly>
- %0 = polynomial.constant #polynomial.int_polynomial<1 + x**2> : !polynomial.polynomial<#ring>
- %1 = polynomial.constant #polynomial.int_polynomial<x**5 - x + 1> : !polynomial.polynomial<#ring>
+ %0 = polynomial.constant int<1 + x**2> : !polynomial.polynomial<#ring>
+ %1 = polynomial.constant int<x**5 - x + 1> : !polynomial.polynomial<#ring>
%2 = polynomial.mul %0, %1 : !polynomial.polynomial<#ring>
```
}];
@@ -126,7 +126,7 @@ def Polynomial_MulScalarOp : Polynomial_Op<"mul_scalar", [
// multiply two polynomials modulo x^1024 - 1
#poly = #polynomial.int_polynomial<x**1024 - 1>
#ring = #polynomial.ring<coefficientType=i32, coefficientModulus=65536:i32, polynomialModulus=#poly>
- %0 = polynomial.constant #polynomial.int_polynomial<1 + x**2> : !polynomial.polynomial<#ring>
+ %0 = polynomial.constant int<1 + x**2> : !polynomial.polynomial<#ring>
%1 = arith.constant 3 : i32
%2 = polynomial.mul_scalar %0, %1 : !polynomial.polynomial<#ring>, i32
```
@@ -157,7 +157,7 @@ def Polynomial_LeadingTermOp: Polynomial_Op<"leading_term"> {
```mlir
#poly = #polynomial.int_polynomial<x**1024 - 1>
#ring = #polynomial.ring<coefficientType=i32, coefficientModulus=65536:i32, polynomialModulus=#poly>
- %0 = polynomial.constant #polynomial.int_polynomial<1 + x**2> : !polynomial.polynomial<#ring>
+ %0 = polynomial.constant int<1 + x**2> : !polynomial.polynomial<#ring>
%1, %2 = polynomial.leading_term %0 : !polynomial.polynomial<#ring> -> (index, i32)
```
}];
@@ -272,29 +272,29 @@ def Polynomial_ToTensorOp : Polynomial_Op<"to_tensor", [Pure]> {
let hasVerifier = 1;
}
-def Polynomial_AnyPolynomialAttr : AnyAttrOf<[
- Polynomial_FloatPolynomialAttr,
- Polynomial_IntPolynomialAttr
+def Polynomial_AnyTypedPolynomialAttr : AnyAttrOf<[
+ Polynomial_TypedFloatPolynomialAttr,
+ Polynomial_TypedIntPolynomialAttr
]>;
// Not deriving from Polynomial_Op due to need for custom assembly format
-def Polynomial_ConstantOp : Op<Polynomial_Dialect, "constant", [Pure]> {
+def Polynomial_ConstantOp : Op<Polynomial_Dialect, "constant",
+ [Pure, InferTypeOpAdaptor]> {
let summary = "Define a constant polynomial via an attribute.";
let description = [{
Example:
```mlir
- #poly = #polynomial.int_polynomial<x**1024 - 1>
- #ring = #polynomial.ring<coefficientType=i32, coefficientModulus=65536:i32, polynomialModulus=#poly>
- %0 = polynomial.constant #polynomial.int_polynomial<1 + x**2> : !polynomial.polynomial<#ring>
+ !int_poly_ty = !polynomial.polynomial<ring=<coefficientType=i32>>
+ %0 = polynomial.constant int<1 + x**2> : !int_poly_ty
- #float_ring = #polynomial.ring<coefficientType=f32>
- %0 = polynomial.constant #polynomial.float_polynomial<0.5 + 1.3e06 x**2> : !polynomial.polynomial<#float_ring>
+ !float_poly_ty = !polynomial.polynomial<ring=<coefficientType=f32>>
+ %1 = polynomial.constant float<0.5 + 1.3e06 x**2> : !float_poly_ty
```
}];
- let arguments = (ins Polynomial_AnyPolynomialAttr:$value);
+ let arguments = (ins Polynomial_AnyTypedPolynomialAttr:$value);
let results = (outs Polynomial_PolynomialType:$output);
- let assemblyFormat = "attr-dict `:` type($output)";
+ let hasCustomAssemblyFormat = 1;
}
def Polynomial_NTTOp : Polynomial_Op<"ntt", [Pure]> {
diff --git a/mlir/include/mlir/Dialect/Polynomial/IR/PolynomialAttributes.td b/mlir/include/mlir/Dialect/Polynomial/IR/PolynomialAttributes.td
index e5dbfa7..655020a 100644
--- a/mlir/include/mlir/Dialect/Polynomial/IR/PolynomialAttributes.td
+++ b/mlir/include/mlir/Dialect/Polynomial/IR/PolynomialAttributes.td
@@ -18,7 +18,7 @@ class Polynomial_Attr<string name, string attrMnemonic, list<Trait> traits = []>
}
def Polynomial_IntPolynomialAttr : Polynomial_Attr<"IntPolynomial", "int_polynomial"> {
- let summary = "An attribute containing a single-variable polynomial with integer coefficients.";
+ let summary = "an attribute containing a single-variable polynomial with integer coefficients";
let description = [{
A polynomial attribute represents a single-variable polynomial with integer
coefficients, which is used to define the modulus of a `RingAttr`, as well
@@ -41,7 +41,7 @@ def Polynomial_IntPolynomialAttr : Polynomial_Attr<"IntPolynomial", "int_polynom
}
def Polynomial_FloatPolynomialAttr : Polynomial_Attr<"FloatPolynomial", "float_polynomial"> {
- let summary = "An attribute containing a single-variable polynomial with double precision floating point coefficients.";
+ let summary = "an attribute containing a single-variable polynomial with double precision floating point coefficients";
let description = [{
A polynomial attribute represents a single-variable polynomial with double
precision floating point coefficients.
@@ -62,8 +62,72 @@ def Polynomial_FloatPolynomialAttr : Polynomial_Attr<"FloatPolynomial", "float_p
let hasCustomAssemblyFormat = 1;
}
+def Polynomial_TypedIntPolynomialAttr : Polynomial_Attr<
+ "TypedIntPolynomial", "typed_int_polynomial", [TypedAttrInterface]> {
+ let summary = "a typed int_polynomial";
+ let description = [{
+ Example:
+
+ ```mlir
+ !poly_ty = !polynomial.polynomial<ring=<coefficientType=i32>>
+ #poly = int<1 x**7 + 4> : !poly_ty
+ #poly_verbose = #polynomial.typed_int_polynomial<1 x**7 + 4> : !poly_ty
+ ```
+ }];
+ let parameters = (ins "::mlir::Type":$type, "::mlir::polynomial::IntPolynomialAttr":$value);
+ let assemblyFormat = "$value `:` $type";
+ let builders = [
+ AttrBuilderWithInferredContext<(ins "Type":$type,
+ "const IntPolynomial &":$value), [{
+ return $_get(
+ type.getContext(),
+ type,
+ IntPolynomialAttr::get(type.getContext(), value));
+ }]>,
+ AttrBuilderWithInferredContext<(ins "Type":$type,
+ "const Attribute &":$value), [{
+ return $_get(type.getContext(), type, ::llvm::cast<IntPolynomialAttr>(value));
+ }]>
+ ];
+ let extraClassDeclaration = [{
+ using ValueType = ::mlir::Attribute;
+ }];
+}
+
+def Polynomial_TypedFloatPolynomialAttr : Polynomial_Attr<
+ "TypedFloatPolynomial", "typed_float_polynomial", [TypedAttrInterface]> {
+ let summary = "a typed float_polynomial";
+ let description = [{
+ Example:
+
+ ```mlir
+ !poly_ty = !polynomial.polynomial<ring=<coefficientType=f32>>
+ #poly = float<1.4 x**7 + 4.5> : !poly_ty
+ #poly_verbose = #polynomial.typed_float_polynomial<1.4 x**7 + 4.5> : !poly_ty
+ ```
+ }];
+ let parameters = (ins "::mlir::Type":$type, "::mlir::polynomial::FloatPolynomialAttr":$value);
+ let assemblyFormat = "$value `:` $type";
+ let builders = [
+ AttrBuilderWithInferredContext<(ins "Type":$type,
+ "const FloatPolynomial &":$value), [{
+ return $_get(
+ type.getContext(),
+ type,
+ FloatPolynomialAttr::get(type.getContext(), value));
+ }]>,
+ AttrBuilderWithInferredContext<(ins "Type":$type,
+ "const Attribute &":$value), [{
+ return $_get(type.getContext(), type, ::llvm::cast<FloatPolynomialAttr>(value));
+ }]>
+ ];
+ let extraClassDeclaration = [{
+ using ValueType = ::mlir::Attribute;
+ }];
+}
+
def Polynomial_RingAttr : Polynomial_Attr<"Ring", "ring"> {
- let summary = "An attribute specifying a polynomial ring.";
+ let summary = "an attribute specifying a polynomial ring";
let description = [{
A ring describes the domain in which polynomial arithmetic occurs. The ring
attribute in `polynomial` represents the more specific case of polynomials
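
As a minimal sketch of how the new typed attribute might be built from C++ (the helper name and include path are assumed, not taken from this patch), using only the Attribute-based builder declared above:

```cpp
// Hedged sketch, not part of this patch: wrap an existing IntPolynomialAttr
// and a polynomial type into the new TypedIntPolynomialAttr. The builder
// declared above infers the MLIRContext from the type.
#include "mlir/Dialect/Polynomial/IR/PolynomialAttributes.h" // include path assumed

mlir::polynomial::TypedIntPolynomialAttr
makeTypedPoly(mlir::Type polyType, mlir::polynomial::IntPolynomialAttr poly) {
  return mlir::polynomial::TypedIntPolynomialAttr::get(polyType, poly);
}
```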
diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h b/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h
index 965ef9e..6d56717 100644
--- a/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h
+++ b/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h
@@ -250,8 +250,8 @@ struct SCFReductionTilingResult {
Operation *parallelTiledOp;
/// The final reduction operation merging all the partial reductions.
Operation *mergeOp;
- /// Initial op
- Operation *initialOp;
+ /// Initial values used for reduction.
+ SmallVector<Value> initialValues;
/// The loop operations that iterate over the tiles.
SmallVector<LoopLikeOpInterface> loops;
};
diff --git a/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h
index e8a09c4..dd6b0e8 100644
--- a/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h
@@ -59,8 +59,8 @@ void populateDropRedundantInsertSliceRankExpansionPatterns(
/// `tensor.collapse_shape` into other ops.
void populateReassociativeReshapeFoldingPatterns(RewritePatternSet &patterns);
-/// Populates `patterns` with patterns that fold tensor.empty with
-/// tensor.[extract_slice|expand_shape|collapse_shape].
+/// Populates `patterns` with patterns that fold tensor.empty with its
+/// consumers.
///
/// If `singleUseOnly` is set to "true", only tensor.empty ops with a single
/// use are folded.
diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
index 332b5ad..2bb7540 100644
--- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
+++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
@@ -543,6 +543,86 @@ def Vector_InterleaveOp :
}];
}
+class ResultIsHalfSourceVectorType<string result> : TypesMatchWith<
+ "the trailing dimension of the results is half the width of source trailing dimension",
+ "source", result,
+ [{
+ [&]() -> ::mlir::VectorType {
+ auto vectorType = ::llvm::cast<mlir::VectorType>($_self);
+ ::mlir::VectorType::Builder builder(vectorType);
+ auto lastDim = vectorType.getRank() - 1;
+ auto newDimSize = vectorType.getDimSize(lastDim) / 2;
+ if (newDimSize <= 0)
+ return vectorType; // (invalid input type)
+ return builder.setDim(lastDim, newDimSize);
+ }()
+ }]
+>;
+
+def SourceVectorEvenElementCount : PredOpTrait<
+ "the trailing dimension of the source vector has an even number of elements",
+ CPred<[{
+ [&](){
+ auto srcVec = getSourceVectorType();
+ return srcVec.getDimSize(srcVec.getRank() - 1) % 2 == 0;
+ }()
+ }]>
+>;
+
+def Vector_DeinterleaveOp :
+ Vector_Op<"deinterleave", [Pure,
+ SourceVectorEvenElementCount,
+ ResultIsHalfSourceVectorType<"res1">,
+ AllTypesMatch<["res1", "res2"]>
+ ]> {
+ let summary = "constructs two vectors by deinterleaving an input vector";
+ let description = [{
+ The deinterleave operation constructs two vectors from a single input
+ vector. The first result vector contains the elements from even indexes
+ of the input, and the second contains elements from odd indexes. This is
+ the inverse of a `vector.interleave` operation.
+
+ Each output's trailing dimension is half of the size of the input
+ vector's trailing dimension. This operation requires the input vector
+ to have a rank > 0 and an even number of elements in its trailing
+ dimension.
+
+ The operation supports scalable vectors.
+
+ Example:
+ ```mlir
+ %0, %1 = vector.deinterleave %a
+ : vector<8xi8> -> vector<4xi8>
+ %2, %3 = vector.deinterleave %b
+ : vector<2x8xi8> -> vector<2x4xi8>
+ %4, %5 = vector.deinterleave %c
+ : vector<2x8x4xi8> -> vector<2x8x2xi8>
+ %6, %7 = vector.deinterleave %d
+ : vector<[8]xf32> -> vector<[4]xf32>
+ %8, %9 = vector.deinterleave %e
+ : vector<2x[6]xf64> -> vector<2x[3]xf64>
+ %10, %11 = vector.deinterleave %f
+ : vector<2x4x[6]xf64> -> vector<2x4x[3]xf64>
+ ```
+ }];
+
+ let arguments = (ins AnyVector:$source);
+ let results = (outs AnyVector:$res1, AnyVector:$res2);
+
+ let assemblyFormat = [{
+ $source attr-dict `:` type($source) `->` type($res1)
+ }];
+
+ let extraClassDeclaration = [{
+ VectorType getSourceVectorType() {
+ return ::llvm::cast<VectorType>(getSource().getType());
+ }
+ VectorType getResultVectorType() {
+ return ::llvm::cast<VectorType>(getRes1().getType());
+ }
+ }];
+ }
+
def Vector_ExtractElementOp :
Vector_Op<"extractelement", [Pure,
TypesMatchWith<"result type matches element type of vector operand",
diff --git a/mlir/include/mlir/IR/OpBase.td b/mlir/include/mlir/IR/OpBase.td
index 7866ac2..4481e56 100644
--- a/mlir/include/mlir/IR/OpBase.td
+++ b/mlir/include/mlir/IR/OpBase.td
@@ -670,16 +670,4 @@ class TCopVTEtAreSameAt<list<int> indices> : CPred<
"[this](unsigned i) { return getElementTypeOrSelf(this->getOperand(i)); "
"}))">;
-class AnyScalarTypeMatch<list<string> names> :
- AnyMatchOperatorTrait<names, "$_self.getType().isSignlessInteger(1)",
- "scalar type">;
-
-class ScalarConditionOrMatchingShape<list<string> names> :
- PredOpTrait<
- !head(names) # " is scalar or has matching shape",
- Or<[AnyScalarTypeMatch<[!head(names)]>.predicate,
- AllShapesMatch<names>.predicate]>> {
- list<string> values = names;
-}
-
#endif // OP_BASE
diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h
index 90406f5..fedd773 100644
--- a/mlir/include/mlir/InitAllPasses.h
+++ b/mlir/include/mlir/InitAllPasses.h
@@ -14,7 +14,6 @@
#ifndef MLIR_INITALLPASSES_H_
#define MLIR_INITALLPASSES_H_
-#include "mlir/Config/mlir-config.h"
#include "mlir/Conversion/Passes.h"
#include "mlir/Dialect/AMDGPU/Transforms/Passes.h"
#include "mlir/Dialect/Affine/Passes.h"
@@ -99,7 +98,7 @@ inline void registerAllPasses() {
bufferization::registerBufferizationPipelines();
sparse_tensor::registerSparseTensorPipelines();
tosa::registerTosaToLinalgPipelines();
-#if MLIR_ENABLE_CUDA_CONVERSIONS
+#if LLVM_HAS_NVPTX_TARGET
gpu::registerGPUToNVVMPipeline();
#endif
}
diff --git a/mlir/include/mlir/Interfaces/TilingInterface.td b/mlir/include/mlir/Interfaces/TilingInterface.td
index 66382f2..14d775d 100644
--- a/mlir/include/mlir/Interfaces/TilingInterface.td
+++ b/mlir/include/mlir/Interfaces/TilingInterface.td
@@ -170,11 +170,11 @@ def PartialReductionOpInterface : OpInterface<"PartialReductionOpInterface"> {
operation reduction. The tensor shape is equal to operation result
shape with new dimension for each non zero tile size.
}],
- /*retType=*/"FailureOr<Operation*>",
+ /*retType=*/"FailureOr<SmallVector<Value>>",
/*methodName=*/"generateInitialTensorForPartialReduction",
/*args=*/(ins
"OpBuilder &":$b,
- "Location ":$loc,
+ "Location":$loc,
"ArrayRef<OpFoldResult>":$sizes,
"ArrayRef<int>":$reductionDim),
/*methodBody=*/"",
diff --git a/mlir/include/mlir/Interfaces/Utils/InferIntRangeCommon.h b/mlir/include/mlir/Interfaces/Utils/InferIntRangeCommon.h
index 97c97c23..851bb53 100644
--- a/mlir/include/mlir/Interfaces/Utils/InferIntRangeCommon.h
+++ b/mlir/include/mlir/Interfaces/Utils/InferIntRangeCommon.h
@@ -16,6 +16,7 @@
#include "mlir/Interfaces/InferIntRangeInterface.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitmaskEnum.h"
#include <optional>
namespace mlir {
@@ -31,6 +32,18 @@ static constexpr unsigned indexMaxWidth = 64;
enum class CmpMode : uint32_t { Both, Signed, Unsigned };
+enum class OverflowFlags : uint32_t {
+ None = 0,
+ Nsw = 1,
+ Nuw = 2,
+ LLVM_MARK_AS_BITMASK_ENUM(Nuw)
+};
+
+/// Function that performs inference on an array of `ConstantIntRanges` while
+/// taking special overflow behavior into account.
+using InferRangeWithOvfFlagsFn =
+ function_ref<ConstantIntRanges(ArrayRef<ConstantIntRanges>, OverflowFlags)>;
+
/// Compute `inferFn` on `ranges`, whose size should be the index storage
/// bitwidth. Then, compute the function on `argRanges` again after truncating
/// the ranges to 32 bits. Finally, if the truncation of the 64-bit result is
@@ -60,11 +73,14 @@ ConstantIntRanges extSIRange(const ConstantIntRanges &range,
ConstantIntRanges truncRange(const ConstantIntRanges &range,
unsigned destWidth);
-ConstantIntRanges inferAdd(ArrayRef<ConstantIntRanges> argRanges);
+ConstantIntRanges inferAdd(ArrayRef<ConstantIntRanges> argRanges,
+ OverflowFlags ovfFlags = OverflowFlags::None);
-ConstantIntRanges inferSub(ArrayRef<ConstantIntRanges> argRanges);
+ConstantIntRanges inferSub(ArrayRef<ConstantIntRanges> argRanges,
+ OverflowFlags ovfFlags = OverflowFlags::None);
-ConstantIntRanges inferMul(ArrayRef<ConstantIntRanges> argRanges);
+ConstantIntRanges inferMul(ArrayRef<ConstantIntRanges> argRanges,
+ OverflowFlags ovfFlags = OverflowFlags::None);
ConstantIntRanges inferDivS(ArrayRef<ConstantIntRanges> argRanges);
@@ -94,7 +110,8 @@ ConstantIntRanges inferOr(ArrayRef<ConstantIntRanges> argRanges);
ConstantIntRanges inferXor(ArrayRef<ConstantIntRanges> argRanges);
-ConstantIntRanges inferShl(ArrayRef<ConstantIntRanges> argRanges);
+ConstantIntRanges inferShl(ArrayRef<ConstantIntRanges> argRanges,
+ OverflowFlags ovfFlags = OverflowFlags::None);
ConstantIntRanges inferShrS(ArrayRef<ConstantIntRanges> argRanges);
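
A brief, hedged sketch of how a caller might pass the new flags (the function name is illustrative; the `|=` usage mirrors the conversion helper added later in this patch):

```cpp
// Hedged sketch, not part of this patch: OverflowFlags combines with bitwise
// operators (enabled by LLVM_MARK_AS_BITMASK_ENUM above), and the defaulted
// parameter keeps existing callers of inferAdd/inferSub/inferMul/inferShl valid.
#include "mlir/Interfaces/Utils/InferIntRangeCommon.h"

mlir::ConstantIntRanges
inferAddNoWrap(llvm::ArrayRef<mlir::ConstantIntRanges> argRanges) {
  mlir::intrange::OverflowFlags flags = mlir::intrange::OverflowFlags::None;
  flags |= mlir::intrange::OverflowFlags::Nsw;
  flags |= mlir::intrange::OverflowFlags::Nuw;
  return mlir::intrange::inferAdd(argRanges, flags);
}
```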
diff --git a/mlir/include/mlir/Pass/PassManager.h b/mlir/include/mlir/Pass/PassManager.h
index 1b2e6a3..b3e4275 100644
--- a/mlir/include/mlir/Pass/PassManager.h
+++ b/mlir/include/mlir/Pass/PassManager.h
@@ -18,8 +18,8 @@
#include "llvm/Support/raw_ostream.h"
#include <functional>
-#include <vector>
#include <optional>
+#include <vector>
namespace mlir {
class AnalysisManager;
@@ -387,6 +387,43 @@ public:
bool printAfterOnlyOnFailure = false, raw_ostream &out = llvm::errs(),
OpPrintingFlags opPrintingFlags = OpPrintingFlags());
+ /// Similar to `enableIRPrinting` above, except that instead of printing
+ /// the IR to a single output stream, the instrumentation will print the
+ /// output of each pass to a separate file. The files will be organized into a
+ /// directory tree rooted at `printTreeDir`. The directories mirror the
+ /// nesting structure of the IR. For example, if the IR is congruent to the
+ /// pass-pipeline "builtin.module(passA,passB,func.func(passC,passD),passE)",
+ /// and `printTreeDir=/tmp/pipeline_output`, then the file tree created
+ /// will look like:
+ ///
+ /// ```
+ /// /tmp/pipeline_output
+ /// ├── builtin_module_the_symbol_name
+ /// │ ├── 0_passA.mlir
+ /// │ ├── 1_passB.mlir
+ /// │ ├── 2_passE.mlir
+ /// │ ├── func_func_my_func_name
+ /// │ │ ├── 1_0_passC.mlir
+ /// │ │ ├── 1_1_passD.mlir
+ /// │ ├── func_func_my_other_func_name
+ /// │ │ ├── 1_0_passC.mlir
+ /// │ │ ├── 1_1_passD.mlir
+ /// ```
+ ///
+ /// The subdirectories are given names that reflect the parent operation name
+ /// and symbol name (if present). The output MLIR files are prefixed using an
+ /// atomic counter to indicate the order the passes were printed in and to
+ /// prevent any potential name collisions.
+ void enableIRPrintingToFileTree(
+ std::function<bool(Pass *, Operation *)> shouldPrintBeforePass =
+ [](Pass *, Operation *) { return true; },
+ std::function<bool(Pass *, Operation *)> shouldPrintAfterPass =
+ [](Pass *, Operation *) { return true; },
+ bool printModuleScope = true, bool printAfterOnlyOnChange = true,
+ bool printAfterOnlyOnFailure = false,
+ llvm::StringRef printTreeDir = ".pass_manager_output",
+ OpPrintingFlags opPrintingFlags = OpPrintingFlags());
+
//===--------------------------------------------------------------------===//
// Pass Timing
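
A hedged usage sketch of the new entry point declared above (the directory path and callbacks are illustrative, and match the declared defaults in shape):

```cpp
// Hedged sketch, not part of this patch: dump the IR before/after every pass
// into a directory tree instead of interleaving it on a single stream.
#include "mlir/Pass/PassManager.h"

void enableTreePrinting(mlir::PassManager &pm) {
  pm.enableIRPrintingToFileTree(
      /*shouldPrintBeforePass=*/[](mlir::Pass *, mlir::Operation *) { return true; },
      /*shouldPrintAfterPass=*/[](mlir::Pass *, mlir::Operation *) { return true; },
      /*printModuleScope=*/true,
      /*printAfterOnlyOnChange=*/true,
      /*printAfterOnlyOnFailure=*/false,
      /*printTreeDir=*/"/tmp/pipeline_output");
}
```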
diff --git a/mlir/include/mlir/Transforms/RegionUtils.h b/mlir/include/mlir/Transforms/RegionUtils.h
index f65d0d44..06eebff 100644
--- a/mlir/include/mlir/Transforms/RegionUtils.h
+++ b/mlir/include/mlir/Transforms/RegionUtils.h
@@ -87,10 +87,6 @@ LogicalResult eraseUnreachableBlocks(RewriterBase &rewriter,
LogicalResult runRegionDCE(RewriterBase &rewriter,
MutableArrayRef<Region> regions);
-/// Get a list of blocks that is sorted according to dominance. This sort is
-/// stable.
-SetVector<Block *> getBlocksSortedByDominance(Region &region);
-
} // namespace mlir
#endif // MLIR_TRANSFORMS_REGIONUTILS_H_
diff --git a/mlir/lib/Analysis/CMakeLists.txt b/mlir/lib/Analysis/CMakeLists.txt
index 005814d..38d8415 100644
--- a/mlir/lib/Analysis/CMakeLists.txt
+++ b/mlir/lib/Analysis/CMakeLists.txt
@@ -6,6 +6,7 @@ set(LLVM_OPTIONAL_SOURCES
Liveness.cpp
CFGLoopInfo.cpp
SliceAnalysis.cpp
+ TopologicalSortUtils.cpp
AliasAnalysis/LocalAliasAnalysis.cpp
@@ -28,6 +29,7 @@ add_mlir_library(MLIRAnalysis
Liveness.cpp
CFGLoopInfo.cpp
SliceAnalysis.cpp
+ TopologicalSortUtils.cpp
AliasAnalysis/LocalAliasAnalysis.cpp
diff --git a/mlir/lib/Analysis/Liveness.cpp b/mlir/lib/Analysis/Liveness.cpp
index a8e0dae..e3245d6 100644
--- a/mlir/lib/Analysis/Liveness.cpp
+++ b/mlir/lib/Analysis/Liveness.cpp
@@ -72,6 +72,10 @@ struct BlockInfoBuilder {
defValues.insert(result);
for (Value operand : op->getOperands())
useValues.insert(operand);
+ for (Region &region : op->getRegions())
+ for (Block &child : region.getBlocks())
+ for (BlockArgument arg : child.getArguments())
+ defValues.insert(arg);
});
llvm::set_subtract(useValues, defValues);
}
diff --git a/mlir/lib/Analysis/SliceAnalysis.cpp b/mlir/lib/Analysis/SliceAnalysis.cpp
index 26fe8e3..2b1cf41 100644
--- a/mlir/lib/Analysis/SliceAnalysis.cpp
+++ b/mlir/lib/Analysis/SliceAnalysis.cpp
@@ -11,7 +11,8 @@
//===----------------------------------------------------------------------===//
#include "mlir/Analysis/SliceAnalysis.h"
-#include "mlir/IR/BuiltinOps.h"
+#include "mlir/Analysis/TopologicalSortUtils.h"
+#include "mlir/IR/Block.h"
#include "mlir/IR/Operation.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Support/LLVM.h"
@@ -164,62 +165,6 @@ mlir::getSlice(Operation *op, const BackwardSliceOptions &backwardSliceOptions,
return topologicalSort(slice);
}
-namespace {
-/// DFS post-order implementation that maintains a global count to work across
-/// multiple invocations, to help implement topological sort on multi-root DAGs.
-/// We traverse all operations but only record the ones that appear in
-/// `toSort` for the final result.
-struct DFSState {
- DFSState(const SetVector<Operation *> &set) : toSort(set), seen() {}
- const SetVector<Operation *> &toSort;
- SmallVector<Operation *, 16> topologicalCounts;
- DenseSet<Operation *> seen;
-};
-} // namespace
-
-static void dfsPostorder(Operation *root, DFSState *state) {
- SmallVector<Operation *> queue(1, root);
- std::vector<Operation *> ops;
- while (!queue.empty()) {
- Operation *current = queue.pop_back_val();
- ops.push_back(current);
- for (Operation *op : current->getUsers())
- queue.push_back(op);
- for (Region &region : current->getRegions()) {
- for (Operation &op : region.getOps())
- queue.push_back(&op);
- }
- }
-
- for (Operation *op : llvm::reverse(ops)) {
- if (state->seen.insert(op).second && state->toSort.count(op) > 0)
- state->topologicalCounts.push_back(op);
- }
-}
-
-SetVector<Operation *>
-mlir::topologicalSort(const SetVector<Operation *> &toSort) {
- if (toSort.empty()) {
- return toSort;
- }
-
- // Run from each root with global count and `seen` set.
- DFSState state(toSort);
- for (auto *s : toSort) {
- assert(toSort.count(s) == 1 && "NYI: multi-sets not supported");
- dfsPostorder(s, &state);
- }
-
- // Reorder and return.
- SetVector<Operation *> res;
- for (auto it = state.topologicalCounts.rbegin(),
- eit = state.topologicalCounts.rend();
- it != eit; ++it) {
- res.insert(*it);
- }
- return res;
-}
-
/// Returns true if `value` (transitively) depends on iteration-carried values
/// of the given `ancestorOp`.
static bool dependsOnCarriedVals(Value value,
diff --git a/mlir/lib/Transforms/Utils/TopologicalSortUtils.cpp b/mlir/lib/Analysis/TopologicalSortUtils.cpp
index f3a9d21..c406960 100644
--- a/mlir/lib/Transforms/Utils/TopologicalSortUtils.cpp
+++ b/mlir/lib/Analysis/TopologicalSortUtils.cpp
@@ -1,4 +1,4 @@
-//===- TopologicalSortUtils.h - Topological sort utilities ------*- C++ -*-===//
+//===- TopologicalSortUtils.cpp - Topological sort utilities --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,8 +6,13 @@
//
//===----------------------------------------------------------------------===//
-#include "mlir/Transforms/TopologicalSortUtils.h"
+#include "mlir/Analysis/TopologicalSortUtils.h"
+#include "mlir/IR/Block.h"
#include "mlir/IR/OpDefinition.h"
+#include "mlir/IR/RegionGraphTraits.h"
+
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
using namespace mlir;
@@ -146,3 +151,135 @@ bool mlir::computeTopologicalSorting(
return allOpsScheduled;
}
+
+SetVector<Block *> mlir::getBlocksSortedByDominance(Region &region) {
+ // For each block that has not been visited yet (i.e. that has no
+ // predecessors), add it to the list as well as its successors.
+ SetVector<Block *> blocks;
+ for (Block &b : region) {
+ if (blocks.count(&b) == 0) {
+ llvm::ReversePostOrderTraversal<Block *> traversal(&b);
+ blocks.insert(traversal.begin(), traversal.end());
+ }
+ }
+ assert(blocks.size() == region.getBlocks().size() &&
+ "some blocks are not sorted");
+
+ return blocks;
+}
+
+namespace {
+class TopoSortHelper {
+public:
+ explicit TopoSortHelper(const SetVector<Operation *> &toSort)
+ : toSort(toSort) {}
+
+ /// Executes the topological sort of the operations this instance was
+ /// constructed with. This function will destroy the internal state of the
+ /// instance.
+ SetVector<Operation *> sort() {
+ if (toSort.size() <= 1) {
+ // Note: Creates a copy on purpose.
+ return toSort;
+ }
+
+ // First, find the root region to start the traversal through the IR. This
+ // additionally enriches the internal caches with all relevant ancestor
+ // regions and blocks.
+ Region *rootRegion = findCommonAncestorRegion();
+ assert(rootRegion && "expected all ops to have a common ancestor");
+
+ // Sort all elements in `toSort` by traversing the IR in the appropriate
+ // order.
+ SetVector<Operation *> result = topoSortRegion(*rootRegion);
+ assert(result.size() == toSort.size() &&
+ "expected all operations to be present in the result");
+ return result;
+ }
+
+private:
+ /// Computes the closest common ancestor region of all operations in `toSort`.
+ Region *findCommonAncestorRegion() {
+ // Map to count the number of times a region was encountered.
+ DenseMap<Region *, size_t> regionCounts;
+ size_t expectedCount = toSort.size();
+
+ // Walk the region tree for each operation towards the root and add to the
+ // region count.
+ Region *res = nullptr;
+ for (Operation *op : toSort) {
+ Region *current = op->getParentRegion();
+ // Store the block as an ancestor block.
+ ancestorBlocks.insert(op->getBlock());
+ while (current) {
+ // Insert or update the count and compare it.
+ if (++regionCounts[current] == expectedCount) {
+ res = current;
+ break;
+ }
+ ancestorBlocks.insert(current->getParentOp()->getBlock());
+ current = current->getParentRegion();
+ }
+ }
+ auto firstRange = llvm::make_first_range(regionCounts);
+ ancestorRegions.insert(firstRange.begin(), firstRange.end());
+ return res;
+ }
+
+ /// Performs the dominance-respecting IR walk to collect the topological
+ /// order of the operations to sort.
+ SetVector<Operation *> topoSortRegion(Region &rootRegion) {
+ using StackT = PointerUnion<Region *, Block *, Operation *>;
+
+ SetVector<Operation *> result;
+ // Stack that stores the different IR constructs to traverse.
+ SmallVector<StackT> stack;
+ stack.push_back(&rootRegion);
+
+ // Traverse the IR in a dominance respecting pre-order walk.
+ while (!stack.empty()) {
+ StackT current = stack.pop_back_val();
+ if (auto *region = dyn_cast<Region *>(current)) {
+ // A region's blocks need to be traversed in dominance order.
+ SetVector<Block *> sortedBlocks = getBlocksSortedByDominance(*region);
+ for (Block *block : llvm::reverse(sortedBlocks)) {
+ // Only add blocks to the stack that are ancestors of the operations
+ // to sort.
+ if (ancestorBlocks.contains(block))
+ stack.push_back(block);
+ }
+ continue;
+ }
+
+ if (auto *block = dyn_cast<Block *>(current)) {
+ // Add all of the block's operations to the stack.
+ for (Operation &op : llvm::reverse(*block))
+ stack.push_back(&op);
+ continue;
+ }
+
+ auto *op = cast<Operation *>(current);
+ if (toSort.contains(op))
+ result.insert(op);
+
+ // Add all the subregions that are ancestors of the operations to sort.
+ for (Region &subRegion : op->getRegions())
+ if (ancestorRegions.contains(&subRegion))
+ stack.push_back(&subRegion);
+ }
+ return result;
+ }
+
+ /// Operations to sort.
+ const SetVector<Operation *> &toSort;
+ /// Set containing all the ancestor regions of the operations to sort.
+ DenseSet<Region *> ancestorRegions;
+ /// Set containing all the ancestor blocks of the operations to sort.
+ DenseSet<Block *> ancestorBlocks;
+};
+} // namespace
+
+SetVector<Operation *>
+mlir::topologicalSort(const SetVector<Operation *> &toSort) {
+ return TopoSortHelper(toSort).sort();
+}
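
A short, hedged usage sketch of the relocated entry point (the wrapper name is illustrative):

```cpp
// Hedged sketch, not part of this patch: sort an arbitrary set of ops (e.g. a
// backward slice) so that each op appears after the values it depends on,
// using the header location introduced by this change.
#include "mlir/Analysis/TopologicalSortUtils.h"
#include "llvm/ADT/SetVector.h"

llvm::SetVector<mlir::Operation *>
sortOps(const llvm::SetVector<mlir::Operation *> &ops) {
  return mlir::topologicalSort(ops);
}
```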
diff --git a/mlir/lib/Bindings/Python/IRAttributes.cpp b/mlir/lib/Bindings/Python/IRAttributes.cpp
index dda2003..b5f31aa 100644
--- a/mlir/lib/Bindings/Python/IRAttributes.cpp
+++ b/mlir/lib/Bindings/Python/IRAttributes.cpp
@@ -15,6 +15,7 @@
#include "PybindUtils.h"
#include "llvm/ADT/ScopeExit.h"
+#include "llvm/Support/raw_ostream.h"
#include "mlir-c/BuiltinAttributes.h"
#include "mlir-c/BuiltinTypes.h"
@@ -72,6 +73,27 @@ Raises:
type or if the buffer does not meet expectations.
)";
+static const char kDenseElementsAttrGetFromListDocstring[] =
+ R"(Gets a DenseElementsAttr from a Python list of attributes.
+
+Note that it can be expensive to construct attributes individually.
+For a large number of elements, consider using a Python buffer or array instead.
+
+Args:
+ attrs: A list of attributes.
+ type: The desired shape and type of the resulting DenseElementsAttr.
+ If not provided, the element type is determined based on the type
+ of the 0th attribute and the shape is `[len(attrs)]`.
+ context: Explicit context, if not from context manager.
+
+Returns:
+ DenseElementsAttr on success.
+
+Raises:
+ ValueError: If the type of the attributes does not match the type
+ specified by `type`.
+)";
+
static const char kDenseResourceElementsAttrGetFromBufferDocstring[] =
R"(Gets a DenseResourceElementsAttr from a Python buffer or array.
@@ -648,6 +670,57 @@ public:
using PyConcreteAttribute::PyConcreteAttribute;
static PyDenseElementsAttribute
+ getFromList(py::list attributes, std::optional<PyType> explicitType,
+ DefaultingPyMlirContext contextWrapper) {
+
+ const size_t numAttributes = py::len(attributes);
+ if (numAttributes == 0)
+ throw py::value_error("Attributes list must be non-empty.");
+
+ MlirType shapedType;
+ if (explicitType) {
+ if ((!mlirTypeIsAShaped(*explicitType) ||
+ !mlirShapedTypeHasStaticShape(*explicitType))) {
+
+ std::string message;
+ llvm::raw_string_ostream os(message);
+ os << "Expected a static ShapedType for the shaped_type parameter: "
+ << py::repr(py::cast(*explicitType));
+ throw py::value_error(os.str());
+ }
+ shapedType = *explicitType;
+ } else {
+ SmallVector<int64_t> shape{static_cast<int64_t>(numAttributes)};
+ shapedType = mlirRankedTensorTypeGet(
+ shape.size(), shape.data(),
+ mlirAttributeGetType(pyTryCast<PyAttribute>(attributes[0])),
+ mlirAttributeGetNull());
+ }
+
+ SmallVector<MlirAttribute> mlirAttributes;
+ mlirAttributes.reserve(numAttributes);
+ for (const py::handle &attribute : attributes) {
+ MlirAttribute mlirAttribute = pyTryCast<PyAttribute>(attribute);
+ MlirType attrType = mlirAttributeGetType(mlirAttribute);
+ mlirAttributes.push_back(mlirAttribute);
+
+ if (!mlirTypeEqual(mlirShapedTypeGetElementType(shapedType), attrType)) {
+ std::string message;
+ llvm::raw_string_ostream os(message);
+ os << "All attributes must be of the same type and match "
+ << "the type parameter: expected=" << py::repr(py::cast(shapedType))
+ << ", but got=" << py::repr(py::cast(attrType));
+ throw py::value_error(os.str());
+ }
+ }
+
+ MlirAttribute elements = mlirDenseElementsAttrGet(
+ shapedType, mlirAttributes.size(), mlirAttributes.data());
+
+ return PyDenseElementsAttribute(contextWrapper->getRef(), elements);
+ }
+
+ static PyDenseElementsAttribute
getFromBuffer(py::buffer array, bool signless,
std::optional<PyType> explicitType,
std::optional<std::vector<int64_t>> explicitShape,
@@ -883,6 +956,10 @@ public:
py::arg("type") = py::none(), py::arg("shape") = py::none(),
py::arg("context") = py::none(),
kDenseElementsAttrGetDocstring)
+ .def_static("get", PyDenseElementsAttribute::getFromList,
+ py::arg("attrs"), py::arg("type") = py::none(),
+ py::arg("context") = py::none(),
+ kDenseElementsAttrGetFromListDocstring)
.def_static("get_splat", PyDenseElementsAttribute::getSplat,
py::arg("shaped_type"), py::arg("element_attr"),
"Gets a DenseElementsAttr where all values are the same")
diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp
index 01678a9..2b2792e 100644
--- a/mlir/lib/Bindings/Python/IRCore.cpp
+++ b/mlir/lib/Bindings/Python/IRCore.cpp
@@ -240,7 +240,20 @@ struct PyGlobalDebugFlag {
// Debug flags.
py::class_<PyGlobalDebugFlag>(m, "_GlobalDebug", py::module_local())
.def_property_static("flag", &PyGlobalDebugFlag::get,
- &PyGlobalDebugFlag::set, "LLVM-wide debug flag");
+ &PyGlobalDebugFlag::set, "LLVM-wide debug flag")
+ .def_static(
+ "set_types",
+ [](const std::string &type) {
+ mlirSetGlobalDebugType(type.c_str());
+ },
+ "types"_a, "Sets specific debug types to be produced by LLVM")
+ .def_static("set_types", [](const std::vector<std::string> &types) {
+ std::vector<const char *> pointers;
+ pointers.reserve(types.size());
+ for (const std::string &str : types)
+ pointers.push_back(str.c_str());
+ mlirSetGlobalDebugTypes(pointers.data(), pointers.size());
+ });
}
};
diff --git a/mlir/lib/CAPI/Debug/Debug.cpp b/mlir/lib/CAPI/Debug/Debug.cpp
index 288ecd6..320ece4 100644
--- a/mlir/lib/CAPI/Debug/Debug.cpp
+++ b/mlir/lib/CAPI/Debug/Debug.cpp
@@ -16,3 +16,21 @@
void mlirEnableGlobalDebug(bool enable) { llvm::DebugFlag = enable; }
bool mlirIsGlobalDebugEnabled() { return llvm::DebugFlag; }
+
+void mlirSetGlobalDebugType(const char *type) {
+ // Depending on the NDEBUG flag, this name can be either a function or a macro
+ // that expands to something that isn't a function call, so we cannot
+ // explicitly prefix it with `llvm::` or add a `using` declaration for it.
+ using namespace llvm;
+ setCurrentDebugType(type);
+}
+
+void mlirSetGlobalDebugTypes(const char **types, intptr_t n) {
+ using namespace llvm;
+ setCurrentDebugTypes(types, n);
+}
+
+bool mlirIsCurrentDebugType(const char *type) {
+ using namespace llvm;
+ return isCurrentDebugType(type);
+}
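
A hedged usage sketch of the new C API (the debug-type string is illustrative; the declarations are assumed to sit next to `mlirEnableGlobalDebug` in `mlir-c/Debug.h`):

```cpp
// Hedged sketch, not part of this patch: restrict LLVM debug output to one
// DEBUG_TYPE, the C API analogue of `-debug-only=<type>`.
#include "mlir-c/Debug.h"

void enableScopedDebugOutput() {
  mlirEnableGlobalDebug(true);
  mlirSetGlobalDebugType("dialect-conversion"); // illustrative debug type
}
```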
diff --git a/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp b/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp
index 1447b18..0be3d76 100644
--- a/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp
+++ b/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp
@@ -15,6 +15,7 @@
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/EmitC/IR/EmitC.h"
+#include "mlir/Tools/PDLL/AST/Types.h"
#include "mlir/Transforms/DialectConversion.h"
using namespace mlir;
@@ -112,6 +113,93 @@ public:
}
};
+template <typename ArithOp, bool castToUnsigned>
+class CastConversion : public OpConversionPattern<ArithOp> {
+public:
+ using OpConversionPattern<ArithOp>::OpConversionPattern;
+
+ LogicalResult
+ matchAndRewrite(ArithOp op, typename ArithOp::Adaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+
+ Type opReturnType = this->getTypeConverter()->convertType(op.getType());
+ if (!isa_and_nonnull<IntegerType>(opReturnType))
+ return rewriter.notifyMatchFailure(op, "expected integer result type");
+
+ if (adaptor.getOperands().size() != 1) {
+ return rewriter.notifyMatchFailure(
+ op, "CastConversion only supports unary ops");
+ }
+
+ Type operandType = adaptor.getIn().getType();
+ if (!isa_and_nonnull<IntegerType>(operandType))
+ return rewriter.notifyMatchFailure(op, "expected integer operand type");
+
+ // Signed (sign-extending) casts from i1 are not supported.
+ if (operandType.isInteger(1) && !castToUnsigned)
+ return rewriter.notifyMatchFailure(op,
+ "operation not supported on i1 type");
+
+ // to-i1 conversions: arith semantics want truncation, whereas (bool)(v) is
+ // equivalent to (v != 0). Implementing as (bool)(v & 0x01) gives
+ // truncation.
+ if (opReturnType.isInteger(1)) {
+ auto constOne = rewriter.create<emitc::ConstantOp>(
+ op.getLoc(), operandType, rewriter.getIntegerAttr(operandType, 1));
+ auto oneAndOperand = rewriter.create<emitc::BitwiseAndOp>(
+ op.getLoc(), operandType, adaptor.getIn(), constOne);
+ rewriter.replaceOpWithNewOp<emitc::CastOp>(op, opReturnType,
+ oneAndOperand);
+ return success();
+ }
+
+ bool isTruncation = operandType.getIntOrFloatBitWidth() >
+ opReturnType.getIntOrFloatBitWidth();
+ bool doUnsigned = castToUnsigned || isTruncation;
+
+ Type castType = opReturnType;
+ // If the op is an unsigned variant but the requested return type is not
+ // unsigned, the conversion has to go through an explicitly unsigned type.
+ if (castType.isUnsignedInteger() != doUnsigned) {
+ castType = rewriter.getIntegerType(opReturnType.getIntOrFloatBitWidth(),
+ /*isSigned=*/!doUnsigned);
+ }
+
+ Value actualOp = adaptor.getIn();
+ // Adapt the signedness of the operand if necessary
+ if (operandType.isUnsignedInteger() != doUnsigned) {
+ Type correctSignednessType =
+ rewriter.getIntegerType(operandType.getIntOrFloatBitWidth(),
+ /*isSigned=*/!doUnsigned);
+ actualOp = rewriter.template create<emitc::CastOp>(
+ op.getLoc(), correctSignednessType, actualOp);
+ }
+
+ auto result = rewriter.template create<emitc::CastOp>(op.getLoc(), castType,
+ actualOp);
+
+ // Cast to the expected output type
+ if (castType != opReturnType) {
+ result = rewriter.template create<emitc::CastOp>(op.getLoc(),
+ opReturnType, result);
+ }
+
+ rewriter.replaceOp(op, result);
+ return success();
+ }
+};
+
+template <typename ArithOp>
+class UnsignedCastConversion : public CastConversion<ArithOp, true> {
+ using CastConversion<ArithOp, true>::CastConversion;
+};
+
+template <typename ArithOp>
+class SignedCastConversion : public CastConversion<ArithOp, false> {
+ using CastConversion<ArithOp, false>::CastConversion;
+};
+
template <typename ArithOp, typename EmitCOp>
class ArithOpConversion final : public OpConversionPattern<ArithOp> {
public:
@@ -313,6 +401,10 @@ void mlir::populateArithToEmitCPatterns(TypeConverter &typeConverter,
IntegerOpConversion<arith::SubIOp, emitc::SubOp>,
CmpIOpConversion,
SelectOpConversion,
+ // Truncation is guaranteed for unsigned types.
+ UnsignedCastConversion<arith::TruncIOp>,
+ SignedCastConversion<arith::ExtSIOp>,
+ UnsignedCastConversion<arith::ExtUIOp>,
ItoFCastOpConversion<arith::SIToFPOp>,
ItoFCastOpConversion<arith::UIToFPOp>,
FtoICastOpConversion<arith::FPToSIOp>,
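
For context on the cast patterns registered above, a hedged C++-level illustration of the masking trick described in the `to-i1 conversions` comment:

```cpp
// Hedged illustration, not part of this patch: a plain (bool) cast is (v != 0),
// so the lowering masks with 1 first to reproduce arith.trunci's
// keep-the-low-bit semantics when converting to i1.
static bool truncToI1(int v) {
  return static_cast<bool>(v & 0x01); // low bit of v, not "v != 0"
}
```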
diff --git a/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp b/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp
index 53b44aa..94b7c8d 100644
--- a/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp
+++ b/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp
@@ -449,61 +449,47 @@ mlir::convertFuncOpToLLVMFuncOp(FunctionOpInterface funcOp,
"region types conversion failed");
}
+ if (!shouldUseBarePtrCallConv(funcOp, &converter)) {
+ if (funcOp->getAttrOfType<UnitAttr>(
+ LLVM::LLVMDialect::getEmitCWrapperAttrName())) {
+ if (newFuncOp.isVarArg())
+ return funcOp.emitError("C interface for variadic functions is not "
+ "supported yet.");
+
+ if (newFuncOp.isExternal())
+ wrapExternalFunction(rewriter, funcOp->getLoc(), converter, funcOp,
+ newFuncOp);
+ else
+ wrapForExternalCallers(rewriter, funcOp->getLoc(), converter, funcOp,
+ newFuncOp);
+ }
+ } else {
+ modifyFuncOpToUseBarePtrCallingConv(
+ rewriter, funcOp->getLoc(), converter, newFuncOp,
+ llvm::cast<FunctionType>(funcOp.getFunctionType()).getInputs());
+ }
+
return newFuncOp;
}
namespace {
-struct FuncOpConversionBase : public ConvertOpToLLVMPattern<func::FuncOp> {
-protected:
- using ConvertOpToLLVMPattern<func::FuncOp>::ConvertOpToLLVMPattern;
-
- // Convert input FuncOp to LLVMFuncOp by using the LLVMTypeConverter provided
- // to this legalization pattern.
- FailureOr<LLVM::LLVMFuncOp>
- convertFuncOpToLLVMFuncOp(func::FuncOp funcOp,
- ConversionPatternRewriter &rewriter) const {
- return mlir::convertFuncOpToLLVMFuncOp(
- cast<FunctionOpInterface>(funcOp.getOperation()), rewriter,
- *getTypeConverter());
- }
-};
-
/// FuncOp legalization pattern that converts MemRef arguments to pointers to
/// MemRef descriptors (LLVM struct data types) containing all the MemRef type
/// information.
-struct FuncOpConversion : public FuncOpConversionBase {
+struct FuncOpConversion : public ConvertOpToLLVMPattern<func::FuncOp> {
FuncOpConversion(const LLVMTypeConverter &converter)
- : FuncOpConversionBase(converter) {}
+ : ConvertOpToLLVMPattern(converter) {}
LogicalResult
matchAndRewrite(func::FuncOp funcOp, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
- FailureOr<LLVM::LLVMFuncOp> newFuncOp =
- convertFuncOpToLLVMFuncOp(funcOp, rewriter);
+ FailureOr<LLVM::LLVMFuncOp> newFuncOp = mlir::convertFuncOpToLLVMFuncOp(
+ cast<FunctionOpInterface>(funcOp.getOperation()), rewriter,
+ *getTypeConverter());
if (failed(newFuncOp))
return rewriter.notifyMatchFailure(funcOp, "Could not convert funcop");
- if (!shouldUseBarePtrCallConv(funcOp, this->getTypeConverter())) {
- if (funcOp->getAttrOfType<UnitAttr>(
- LLVM::LLVMDialect::getEmitCWrapperAttrName())) {
- if (newFuncOp->isVarArg())
- return funcOp->emitError("C interface for variadic functions is not "
- "supported yet.");
-
- if (newFuncOp->isExternal())
- wrapExternalFunction(rewriter, funcOp->getLoc(), *getTypeConverter(),
- funcOp, *newFuncOp);
- else
- wrapForExternalCallers(rewriter, funcOp->getLoc(),
- *getTypeConverter(), funcOp, *newFuncOp);
- }
- } else {
- modifyFuncOpToUseBarePtrCallingConv(rewriter, funcOp->getLoc(),
- *getTypeConverter(), *newFuncOp,
- funcOp.getFunctionType().getInputs());
- }
-
rewriter.eraseOp(funcOp);
return success();
}
diff --git a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp
index a206c7b..f6a6d1d 100644
--- a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp
+++ b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp
@@ -185,21 +185,6 @@ struct MapInfoOpConversion : public ConvertOpToLLVMPattern<omp::MapInfoOp> {
}
};
-struct ReductionOpConversion : public ConvertOpToLLVMPattern<omp::ReductionOp> {
- using ConvertOpToLLVMPattern<omp::ReductionOp>::ConvertOpToLLVMPattern;
- LogicalResult
- matchAndRewrite(omp::ReductionOp curOp, OpAdaptor adaptor,
- ConversionPatternRewriter &rewriter) const override {
- if (isa<MemRefType>(curOp.getAccumulator().getType())) {
- // TODO: Support memref type in variable operands
- return rewriter.notifyMatchFailure(curOp, "memref is not supported yet");
- }
- rewriter.replaceOpWithNewOp<omp::ReductionOp>(
- curOp, TypeRange(), adaptor.getOperands(), curOp->getAttrs());
- return success();
- }
-};
-
template <typename OpType>
struct MultiRegionOpConversion : public ConvertOpToLLVMPattern<OpType> {
using ConvertOpToLLVMPattern<OpType>::ConvertOpToLLVMPattern;
@@ -246,9 +231,6 @@ void mlir::configureOpenMPToLLVMConversionLegality(
return typeConverter.isLegal(op->getOperandTypes()) &&
typeConverter.isLegal(op->getResultTypes());
});
- target.addDynamicallyLegalOp<mlir::omp::ReductionOp>([&](Operation *op) {
- return typeConverter.isLegal(op->getOperandTypes());
- });
target.addDynamicallyLegalOp<
mlir::omp::AtomicUpdateOp, mlir::omp::CriticalOp, mlir::omp::TargetOp,
mlir::omp::TargetDataOp, mlir::omp::LoopNestOp,
@@ -275,11 +257,11 @@ void mlir::populateOpenMPToLLVMConversionPatterns(LLVMTypeConverter &converter,
[&](omp::MapBoundsType type) -> Type { return type; });
patterns.add<
- AtomicReadOpConversion, MapInfoOpConversion, ReductionOpConversion,
+ AtomicReadOpConversion, MapInfoOpConversion,
MultiRegionOpConversion<omp::DeclareReductionOp>,
MultiRegionOpConversion<omp::PrivateClauseOp>,
RegionOpConversion<omp::CriticalOp>, RegionOpConversion<omp::LoopNestOp>,
- RegionOpConversion<omp::MasterOp>, ReductionOpConversion,
+ RegionOpConversion<omp::MasterOp>,
RegionOpConversion<omp::OrderedRegionOp>,
RegionOpConversion<omp::ParallelOp>, RegionOpConversion<omp::WsloopOp>,
RegionOpConversion<omp::SectionsOp>, RegionOpConversion<omp::SectionOp>,
diff --git a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
index 332f0a2..4496c2b 100644
--- a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
+++ b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
@@ -15,6 +15,7 @@
#include <type_traits>
#include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/Analysis/TopologicalSortUtils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
diff --git a/mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp
index 84ae4b5..7f3e43d 100644
--- a/mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp
@@ -12,6 +12,7 @@
#include "mlir/Dialect/Affine/LoopFusionUtils.h"
#include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/Analysis/TopologicalSortUtils.h"
#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/Utils.h"
diff --git a/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp b/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp
index 71eb36b..fbe2eca 100644
--- a/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp
+++ b/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp
@@ -19,6 +19,16 @@ using namespace mlir;
using namespace mlir::arith;
using namespace mlir::intrange;
+static intrange::OverflowFlags
+convertArithOverflowFlags(arith::IntegerOverflowFlags flags) {
+ intrange::OverflowFlags retFlags = intrange::OverflowFlags::None;
+ if (bitEnumContainsAny(flags, arith::IntegerOverflowFlags::nsw))
+ retFlags |= intrange::OverflowFlags::Nsw;
+ if (bitEnumContainsAny(flags, arith::IntegerOverflowFlags::nuw))
+ retFlags |= intrange::OverflowFlags::Nuw;
+ return retFlags;
+}
+
//===----------------------------------------------------------------------===//
// ConstantOp
//===----------------------------------------------------------------------===//
@@ -38,7 +48,8 @@ void arith::ConstantOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
void arith::AddIOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
SetIntRangeFn setResultRange) {
- setResultRange(getResult(), inferAdd(argRanges));
+ setResultRange(getResult(), inferAdd(argRanges, convertArithOverflowFlags(
+ getOverflowFlags())));
}
//===----------------------------------------------------------------------===//
@@ -47,7 +58,8 @@ void arith::AddIOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
void arith::SubIOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
SetIntRangeFn setResultRange) {
- setResultRange(getResult(), inferSub(argRanges));
+ setResultRange(getResult(), inferSub(argRanges, convertArithOverflowFlags(
+ getOverflowFlags())));
}
//===----------------------------------------------------------------------===//
@@ -56,7 +68,8 @@ void arith::SubIOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
void arith::MulIOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
SetIntRangeFn setResultRange) {
- setResultRange(getResult(), inferMul(argRanges));
+ setResultRange(getResult(), inferMul(argRanges, convertArithOverflowFlags(
+ getOverflowFlags())));
}
//===----------------------------------------------------------------------===//
@@ -302,7 +315,8 @@ void arith::SelectOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
void arith::ShLIOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
SetIntRangeFn setResultRange) {
- setResultRange(getResult(), inferShl(argRanges));
+ setResultRange(getResult(), inferShl(argRanges, convertArithOverflowFlags(
+ getOverflowFlags())));
}
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp b/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp
index acbbbe9..733e758 100644
--- a/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp
+++ b/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp
@@ -46,6 +46,7 @@
//===----------------------------------------------------------------------===//
#include "mlir/Analysis/Liveness.h"
+#include "mlir/Analysis/TopologicalSortUtils.h"
#include "mlir/Dialect/ArmSME/IR/ArmSME.h"
#include "mlir/Dialect/ArmSME/Transforms/Passes.h"
#include "mlir/Dialect/ArmSME/Transforms/Transforms.h"
diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp
index db1974d..f457303 100644
--- a/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp
+++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp
@@ -11,7 +11,6 @@
//
//===----------------------------------------------------------------------===//
-#include "mlir/Config/mlir-config.h"
#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h"
@@ -39,7 +38,7 @@
using namespace mlir;
-#if MLIR_ENABLE_CUDA_CONVERSIONS
+#if LLVM_HAS_NVPTX_TARGET
namespace {
//===----------------------------------------------------------------------===//
@@ -128,4 +127,4 @@ void mlir::gpu::registerGPUToNVVMPipeline() {
buildLowerToNVVMPassPipeline);
}
-#endif // MLIR_ENABLE_CUDA_CONVERSIONS
+#endif // LLVM_HAS_NVPTX_TARGET
diff --git a/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp b/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
index 836e939..1e7596e 100644
--- a/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
@@ -13,7 +13,6 @@
#include "mlir/Dialect/GPU/Transforms/Passes.h"
-#include "mlir/Config/mlir-config.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
@@ -49,7 +48,7 @@ void GpuModuleToBinaryPass::getDependentDialects(
// Register all GPU related translations.
registry.insert<gpu::GPUDialect>();
registry.insert<LLVM::LLVMDialect>();
-#if MLIR_ENABLE_CUDA_CONVERSIONS
+#if LLVM_HAS_NVPTX_TARGET
registry.insert<NVVM::NVVMDialect>();
#endif
#if MLIR_ENABLE_ROCM_CONVERSIONS
diff --git a/mlir/lib/Dialect/Index/IR/InferIntRangeInterfaceImpls.cpp b/mlir/lib/Dialect/Index/IR/InferIntRangeInterfaceImpls.cpp
index b6b8a13..64adb6b 100644
--- a/mlir/lib/Dialect/Index/IR/InferIntRangeInterfaceImpls.cpp
+++ b/mlir/lib/Dialect/Index/IR/InferIntRangeInterfaceImpls.cpp
@@ -44,19 +44,32 @@ void BoolConstantOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
// we take the 64-bit result).
//===----------------------------------------------------------------------===//
+// Some arithmetic inference functions allow specifying special overflow / wrap
+// behavior. We do not require this for the IndexOps and use this helper to call
+// the inference function without any `OverflowFlags`.
+static std::function<ConstantIntRanges(ArrayRef<ConstantIntRanges>)>
+inferWithoutOverflowFlags(InferRangeWithOvfFlagsFn inferWithOvfFn) {
+ return [inferWithOvfFn](ArrayRef<ConstantIntRanges> argRanges) {
+ return inferWithOvfFn(argRanges, OverflowFlags::None);
+ };
+}
+
void AddOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
SetIntRangeFn setResultRange) {
- setResultRange(getResult(), inferIndexOp(inferAdd, argRanges, CmpMode::Both));
+ setResultRange(getResult(), inferIndexOp(inferWithoutOverflowFlags(inferAdd),
+ argRanges, CmpMode::Both));
}
void SubOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
SetIntRangeFn setResultRange) {
- setResultRange(getResult(), inferIndexOp(inferSub, argRanges, CmpMode::Both));
+ setResultRange(getResult(), inferIndexOp(inferWithoutOverflowFlags(inferSub),
+ argRanges, CmpMode::Both));
}
void MulOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
SetIntRangeFn setResultRange) {
- setResultRange(getResult(), inferIndexOp(inferMul, argRanges, CmpMode::Both));
+ setResultRange(getResult(), inferIndexOp(inferWithoutOverflowFlags(inferMul),
+ argRanges, CmpMode::Both));
}
void DivUOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
@@ -127,7 +140,8 @@ void MinUOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
void ShlOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
SetIntRangeFn setResultRange) {
- setResultRange(getResult(), inferIndexOp(inferShl, argRanges, CmpMode::Both));
+ setResultRange(getResult(), inferIndexOp(inferWithoutOverflowFlags(inferShl),
+ argRanges, CmpMode::Both));
}
void ShrSOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt b/mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt
index c80494a4..728885f 100644
--- a/mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt
@@ -6,7 +6,6 @@ add_mlir_dialect_library(MLIRLLVMIRTransforms
LegalizeForExport.cpp
OptimizeForNVVM.cpp
RequestCWrappers.cpp
- TypeConsistency.cpp
DEPENDS
MLIRLLVMPassIncGen
diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/TypeConsistency.cpp b/mlir/lib/Dialect/LLVMIR/Transforms/TypeConsistency.cpp
deleted file mode 100644
index 0a372ad..0000000
--- a/mlir/lib/Dialect/LLVMIR/Transforms/TypeConsistency.cpp
+++ /dev/null
@@ -1,575 +0,0 @@
-//===- TypeConsistency.cpp - Rewrites to improve type consistency ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Dialect/LLVMIR/Transforms/TypeConsistency.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-#include "llvm/ADT/TypeSwitch.h"
-
-namespace mlir {
-namespace LLVM {
-#define GEN_PASS_DEF_LLVMTYPECONSISTENCY
-#include "mlir/Dialect/LLVMIR/Transforms/Passes.h.inc"
-} // namespace LLVM
-} // namespace mlir
-
-using namespace mlir;
-using namespace LLVM;
-
-//===----------------------------------------------------------------------===//
-// Utils
-//===----------------------------------------------------------------------===//
-
-/// Checks that a pointer value has a pointee type hint consistent with the
-/// expected type. Returns the type it actually hints to if it differs, or
-/// nullptr if the type is consistent or impossible to analyze.
-static Type isElementTypeInconsistent(Value addr, Type expectedType) {
- auto defOp = dyn_cast_or_null<GetResultPtrElementType>(addr.getDefiningOp());
- if (!defOp)
- return nullptr;
-
- Type elemType = defOp.getResultPtrElementType();
- if (!elemType)
- return nullptr;
-
- if (elemType == expectedType)
- return nullptr;
-
- return elemType;
-}
-
-//===----------------------------------------------------------------------===//
-// CanonicalizeAlignedGep
-//===----------------------------------------------------------------------===//
-
-/// Returns the amount of bytes the provided GEP elements will offset the
-/// pointer by. Returns nullopt if the offset could not be computed.
-static std::optional<uint64_t> gepToByteOffset(DataLayout &layout, GEPOp gep) {
-
- SmallVector<uint32_t> indices;
- // Ensures all indices are static and fetches them.
- for (auto index : gep.getIndices()) {
- IntegerAttr indexInt = llvm::dyn_cast_if_present<IntegerAttr>(index);
- if (!indexInt)
- return std::nullopt;
- int32_t gepIndex = indexInt.getInt();
- if (gepIndex < 0)
- return std::nullopt;
- indices.push_back(static_cast<uint32_t>(gepIndex));
- }
-
- uint64_t offset = indices[0] * layout.getTypeSize(gep.getElemType());
-
- Type currentType = gep.getElemType();
- for (uint32_t index : llvm::drop_begin(indices)) {
- bool shouldCancel =
- TypeSwitch<Type, bool>(currentType)
- .Case([&](LLVMArrayType arrayType) {
- if (arrayType.getNumElements() <= index)
- return true;
- offset += index * layout.getTypeSize(arrayType.getElementType());
- currentType = arrayType.getElementType();
- return false;
- })
- .Case([&](LLVMStructType structType) {
- ArrayRef<Type> body = structType.getBody();
- if (body.size() <= index)
- return true;
- for (uint32_t i = 0; i < index; i++) {
- if (!structType.isPacked())
- offset = llvm::alignTo(offset,
- layout.getTypeABIAlignment(body[i]));
- offset += layout.getTypeSize(body[i]);
- }
- currentType = body[index];
- return false;
- })
- .Default([](Type) { return true; });
-
- if (shouldCancel)
- return std::nullopt;
- }
-
- return offset;
-}
-
-/// Fills in `equivalentIndicesOut` with GEP indices that would be equivalent to
-/// offsetting a pointer by `offset` bytes, assuming the GEP has `base` as base
-/// type.
-static LogicalResult
-findIndicesForOffset(DataLayout &layout, Type base, uint64_t offset,
- SmallVectorImpl<GEPArg> &equivalentIndicesOut) {
-
- uint64_t baseSize = layout.getTypeSize(base);
- uint64_t rootIndex = offset / baseSize;
- if (rootIndex > std::numeric_limits<uint32_t>::max())
- return failure();
- equivalentIndicesOut.push_back(rootIndex);
-
- uint64_t distanceToStart = rootIndex * baseSize;
-
-#ifndef NDEBUG
- auto isWithinCurrentType = [&](Type currentType) {
- return offset < distanceToStart + layout.getTypeSize(currentType);
- };
-#endif
-
- Type currentType = base;
- while (distanceToStart < offset) {
- // While an index that does not perfectly align with offset has not been
- // reached...
-
- assert(isWithinCurrentType(currentType));
-
- bool shouldCancel =
- TypeSwitch<Type, bool>(currentType)
- .Case([&](LLVMArrayType arrayType) {
- // Find which element of the array contains the offset.
- uint64_t elemSize =
- layout.getTypeSize(arrayType.getElementType());
- uint64_t index = (offset - distanceToStart) / elemSize;
- equivalentIndicesOut.push_back(index);
- distanceToStart += index * elemSize;
-
- // Then, try to find where in the element the offset is. If the
- // offset is exactly the beginning of the element, the loop is
- // complete.
- currentType = arrayType.getElementType();
-
- // Only continue if the element in question can be indexed using
- // an i32.
- return index > std::numeric_limits<uint32_t>::max();
- })
- .Case([&](LLVMStructType structType) {
- ArrayRef<Type> body = structType.getBody();
- uint32_t index = 0;
-
- // Walk over the elements of the struct to find in which of them
- // the offset is.
- for (Type elem : body) {
- uint64_t elemSize = layout.getTypeSize(elem);
- if (!structType.isPacked()) {
- distanceToStart = llvm::alignTo(
- distanceToStart, layout.getTypeABIAlignment(elem));
- // If the offset is in padding, cancel the rewrite.
- if (offset < distanceToStart)
- return true;
- }
-
- if (offset < distanceToStart + elemSize) {
- // The offset is within this element, stop iterating the
- // struct and look within the current element.
- equivalentIndicesOut.push_back(index);
- currentType = elem;
- return false;
- }
-
- // The offset is not within this element, continue walking over
- // the struct.
- distanceToStart += elemSize;
- index++;
- }
-
- // The offset was supposed to be within this struct but is not.
- // This can happen if the offset points into final padding.
- // Anyway, nothing can be done.
- return true;
- })
- .Default([](Type) {
- // If the offset is within a type that cannot be split, no indices
- // will yield this offset. This can happen if the offset is not
- // perfectly aligned with a leaf type.
- // TODO: support vectors.
- return true;
- });
-
- if (shouldCancel)
- return failure();
- }
-
- return success();
-}
-
-/// Returns the consistent type for the GEP if the GEP is not type-consistent.
-/// Returns failure if the GEP is already consistent.
-static FailureOr<Type> getRequiredConsistentGEPType(GEPOp gep) {
- // GEP of typed pointers are not supported.
- if (!gep.getElemType())
- return failure();
-
- std::optional<Type> maybeBaseType = gep.getElemType();
- if (!maybeBaseType)
- return failure();
- Type baseType = *maybeBaseType;
-
- Type typeHint = isElementTypeInconsistent(gep.getBase(), baseType);
- if (!typeHint)
- return failure();
- return typeHint;
-}
-
-LogicalResult
-CanonicalizeAlignedGep::matchAndRewrite(GEPOp gep,
- PatternRewriter &rewriter) const {
- FailureOr<Type> typeHint = getRequiredConsistentGEPType(gep);
- if (failed(typeHint)) {
- // GEP is already canonical, nothing to do here.
- return failure();
- }
-
- DataLayout layout = DataLayout::closest(gep);
- std::optional<uint64_t> desiredOffset = gepToByteOffset(layout, gep);
- if (!desiredOffset)
- return failure();
-
- SmallVector<GEPArg> newIndices;
- if (failed(
- findIndicesForOffset(layout, *typeHint, *desiredOffset, newIndices)))
- return failure();
-
- rewriter.replaceOpWithNewOp<GEPOp>(
- gep, LLVM::LLVMPointerType::get(getContext()), *typeHint, gep.getBase(),
- newIndices, gep.getInbounds());
-
- return success();
-}
-
-namespace {
-/// Class abstracting over both array and struct types, turning each into ranges
-/// of their sub-types.
-class DestructurableTypeRange
- : public llvm::indexed_accessor_range<DestructurableTypeRange,
- DestructurableTypeInterface, Type,
- Type *, Type> {
-
- using Base = llvm::indexed_accessor_range<
- DestructurableTypeRange, DestructurableTypeInterface, Type, Type *, Type>;
-
-public:
- using Base::Base;
-
- /// Constructs a DestructurableTypeRange from either a LLVMStructType or
- /// LLVMArrayType.
- explicit DestructurableTypeRange(DestructurableTypeInterface base)
- : Base(base, 0, [&]() -> ptrdiff_t {
- return TypeSwitch<DestructurableTypeInterface, ptrdiff_t>(base)
- .Case([](LLVMStructType structType) {
- return structType.getBody().size();
- })
- .Case([](LLVMArrayType arrayType) {
- return arrayType.getNumElements();
- })
- .Default([](auto) -> ptrdiff_t {
- llvm_unreachable(
- "Only LLVMStructType or LLVMArrayType supported");
- });
- }()) {}
-
- /// Returns true if this is a range over a packed struct.
- bool isPacked() const {
- if (auto structType = dyn_cast<LLVMStructType>(getBase()))
- return structType.isPacked();
- return false;
- }
-
-private:
- static Type dereference(DestructurableTypeInterface base, ptrdiff_t index) {
- // i32 chosen because the implementations of ArrayType and StructType
- // specifically expect it to be 32 bit. They will fail otherwise.
- Type result = base.getTypeAtIndex(
- IntegerAttr::get(IntegerType::get(base.getContext(), 32), index));
- assert(result && "Should always succeed");
- return result;
- }
-
- friend Base;
-};
-} // namespace
-
-/// Returns the list of elements of `destructurableType` that are written to by
-/// a store operation writing `storeSize` bytes at `storeOffset`.
-/// `storeOffset` is required to cleanly point to an immediate element within
-/// the type. If the write operation were to write to any padding, write beyond
-/// the aggregate or partially write to a non-aggregate, failure is returned.
-static FailureOr<DestructurableTypeRange>
-getWrittenToFields(const DataLayout &dataLayout,
- DestructurableTypeInterface destructurableType,
- unsigned storeSize, unsigned storeOffset) {
- DestructurableTypeRange destructurableTypeRange(destructurableType);
-
- unsigned currentOffset = 0;
- for (; !destructurableTypeRange.empty();
- destructurableTypeRange = destructurableTypeRange.drop_front()) {
- Type type = destructurableTypeRange.front();
- if (!destructurableTypeRange.isPacked()) {
- unsigned alignment = dataLayout.getTypeABIAlignment(type);
- currentOffset = llvm::alignTo(currentOffset, alignment);
- }
-
- // currentOffset is guaranteed to be equal to offset since offset is either
- // 0 or stems from a type-consistent GEP indexing into just a single
- // aggregate.
- if (currentOffset == storeOffset)
- break;
-
- assert(currentOffset < storeOffset &&
- "storeOffset should cleanly point into an immediate field");
-
- currentOffset += dataLayout.getTypeSize(type);
- }
-
- size_t exclusiveEnd = 0;
- for (; exclusiveEnd < destructurableTypeRange.size() && storeSize > 0;
- exclusiveEnd++) {
- if (!destructurableTypeRange.isPacked()) {
- unsigned alignment =
- dataLayout.getTypeABIAlignment(destructurableTypeRange[exclusiveEnd]);
-      // No padding allowed in between fields at this point in time.
- if (!llvm::isAligned(llvm::Align(alignment), currentOffset))
- return failure();
- }
-
- unsigned fieldSize =
- dataLayout.getTypeSize(destructurableTypeRange[exclusiveEnd]);
- if (fieldSize > storeSize) {
- // Partial writes into an aggregate are okay since subsequent pattern
- // applications can further split these up into writes into the
- // sub-elements.
- auto subAggregate = dyn_cast<DestructurableTypeInterface>(
- destructurableTypeRange[exclusiveEnd]);
- if (!subAggregate)
- return failure();
-
- // Avoid splitting redundantly by making sure the store into the
- // aggregate can actually be split.
- if (failed(getWrittenToFields(dataLayout, subAggregate, storeSize,
- /*storeOffset=*/0)))
- return failure();
-
- return destructurableTypeRange.take_front(exclusiveEnd + 1);
- }
- currentOffset += fieldSize;
- storeSize -= fieldSize;
- }
-
- // If the storeSize is not 0 at this point we are writing past the aggregate
- // as a whole. Abort.
- if (storeSize > 0)
- return failure();
- return destructurableTypeRange.take_front(exclusiveEnd);
-}
-
-/// Splits a store of the vector `value` into `address` at `storeOffset` into
-/// multiple stores of each element with the goal of each generated store
-/// becoming type-consistent through subsequent pattern applications.
-static void splitVectorStore(const DataLayout &dataLayout, Location loc,
- RewriterBase &rewriter, Value address,
- TypedValue<VectorType> value,
- unsigned storeOffset) {
- VectorType vectorType = value.getType();
- unsigned elementSize = dataLayout.getTypeSize(vectorType.getElementType());
-
- // Extract every element in the vector and store it in the given address.
- for (size_t index : llvm::seq<size_t>(0, vectorType.getNumElements())) {
- auto pos =
- rewriter.create<ConstantOp>(loc, rewriter.getI32IntegerAttr(index));
- auto extractOp = rewriter.create<ExtractElementOp>(loc, value, pos);
-
- // For convenience, we do indexing by calculating the final byte offset.
- // Other patterns will turn this into a type-consistent GEP.
- auto gepOp = rewriter.create<GEPOp>(
- loc, address.getType(), rewriter.getI8Type(), address,
- ArrayRef<GEPArg>{
- static_cast<int32_t>(storeOffset + index * elementSize)});
-
- rewriter.create<StoreOp>(loc, extractOp, gepOp);
- }
-}
-
-/// Splits a store of the integer `value` into `address` at `storeOffset` into
-/// multiple stores to each 'writtenToFields', making each store operation
-/// type-consistent.
-static void splitIntegerStore(const DataLayout &dataLayout, Location loc,
- RewriterBase &rewriter, Value address,
- Value value, unsigned storeSize,
- unsigned storeOffset,
- DestructurableTypeRange writtenToFields) {
- unsigned currentOffset = storeOffset;
- for (Type type : writtenToFields) {
- unsigned fieldSize = dataLayout.getTypeSize(type);
-
- // Extract the data out of the integer by first shifting right and then
- // truncating it.
- auto pos = rewriter.create<ConstantOp>(
- loc, rewriter.getIntegerAttr(value.getType(),
- (currentOffset - storeOffset) * 8));
-
- auto shrOp = rewriter.create<LShrOp>(loc, value, pos);
-
- // If we are doing a partial write into a direct field the remaining
- // `storeSize` will be less than the size of the field. We have to truncate
- // to the `storeSize` to avoid creating a store that wasn't in the original
- // code.
- IntegerType fieldIntType =
- rewriter.getIntegerType(std::min(fieldSize, storeSize) * 8);
- Value valueToStore = rewriter.create<TruncOp>(loc, fieldIntType, shrOp);
-
- // We create an `i8` indexed GEP here as that is the easiest (offset is
- // already known). Other patterns turn this into a type-consistent GEP.
- auto gepOp = rewriter.create<GEPOp>(
- loc, address.getType(), rewriter.getI8Type(), address,
- ArrayRef<GEPArg>{static_cast<int32_t>(currentOffset)});
- rewriter.create<StoreOp>(loc, valueToStore, gepOp);
-
- // No need to care about padding here since we already checked previously
- // that no padding exists in this range.
- currentOffset += fieldSize;
- storeSize -= fieldSize;
- }
-}
-
-LogicalResult SplitStores::matchAndRewrite(StoreOp store,
- PatternRewriter &rewriter) const {
- Type sourceType = store.getValue().getType();
- if (!isa<IntegerType, VectorType>(sourceType)) {
- // We currently only support integer and vector sources.
- return failure();
- }
-
- Type typeHint = isElementTypeInconsistent(store.getAddr(), sourceType);
- if (!typeHint) {
- // Nothing to do, since it is already consistent.
- return failure();
- }
-
- auto dataLayout = DataLayout::closest(store);
-
- unsigned storeSize = dataLayout.getTypeSize(sourceType);
- unsigned offset = 0;
- Value address = store.getAddr();
- if (auto gepOp = address.getDefiningOp<GEPOp>()) {
- // Currently only handle canonical GEPs with exactly two indices,
- // indexing a single aggregate deep.
- // If the GEP is not canonical we have to fail, otherwise we would not
- // create type-consistent IR.
- if (gepOp.getIndices().size() != 2 ||
- succeeded(getRequiredConsistentGEPType(gepOp)))
- return failure();
-
- // If the size of the element indexed by the GEP is smaller than the store
- // size, it is pointing into the middle of an aggregate with the store
- // storing into multiple adjacent elements. Destructure into the base
- // address of the aggregate with a store offset.
- if (storeSize > dataLayout.getTypeSize(gepOp.getResultPtrElementType())) {
- std::optional<uint64_t> byteOffset = gepToByteOffset(dataLayout, gepOp);
- if (!byteOffset)
- return failure();
-
- offset = *byteOffset;
- typeHint = gepOp.getElemType();
- address = gepOp.getBase();
- }
- }
-
- auto destructurableType = dyn_cast<DestructurableTypeInterface>(typeHint);
- if (!destructurableType)
- return failure();
-
- FailureOr<DestructurableTypeRange> writtenToElements =
- getWrittenToFields(dataLayout, destructurableType, storeSize, offset);
- if (failed(writtenToElements))
- return failure();
-
- if (writtenToElements->size() <= 1) {
- // Other patterns should take care of this case, we are only interested in
- // splitting element stores.
- return failure();
- }
-
- if (isa<IntegerType>(sourceType)) {
- splitIntegerStore(dataLayout, store.getLoc(), rewriter, address,
- store.getValue(), storeSize, offset, *writtenToElements);
- rewriter.eraseOp(store);
- return success();
- }
-
- // Add a reasonable bound to not split very large vectors that would end up
- // generating lots of code.
- if (dataLayout.getTypeSizeInBits(sourceType) > maxVectorSplitSize)
- return failure();
-
-  // Vector types are simply split into their elements and new stores generated
- // with those. Subsequent pattern applications will split these stores further
- // if required.
- splitVectorStore(dataLayout, store.getLoc(), rewriter, address,
- cast<TypedValue<VectorType>>(store.getValue()), offset);
- rewriter.eraseOp(store);
- return success();
-}
-
-LogicalResult SplitGEP::matchAndRewrite(GEPOp gepOp,
- PatternRewriter &rewriter) const {
- FailureOr<Type> typeHint = getRequiredConsistentGEPType(gepOp);
- if (succeeded(typeHint) || gepOp.getIndices().size() <= 2) {
- // GEP is not canonical or a single aggregate deep, nothing to do here.
- return failure();
- }
-
- auto indexToGEPArg =
- [](GEPIndicesAdaptor<ValueRange>::value_type index) -> GEPArg {
- if (auto integerAttr = dyn_cast<IntegerAttr>(index))
- return integerAttr.getValue().getSExtValue();
- return cast<Value>(index);
- };
-
- GEPIndicesAdaptor<ValueRange> indices = gepOp.getIndices();
-
- auto splitIter = std::next(indices.begin(), 2);
-
-  // Split off the first GEP using the first two indices.
- auto subGepOp = rewriter.create<GEPOp>(
- gepOp.getLoc(), gepOp.getType(), gepOp.getElemType(), gepOp.getBase(),
- llvm::map_to_vector(llvm::make_range(indices.begin(), splitIter),
- indexToGEPArg),
- gepOp.getInbounds());
-
- // The second GEP indexes on the result pointer element type of the previous
- // with all the remaining indices and a zero upfront. If this GEP has more
- // than two indices remaining it'll be further split in subsequent pattern
- // applications.
- SmallVector<GEPArg> newIndices = {0};
- llvm::transform(llvm::make_range(splitIter, indices.end()),
- std::back_inserter(newIndices), indexToGEPArg);
- rewriter.replaceOpWithNewOp<GEPOp>(gepOp, gepOp.getType(),
- subGepOp.getResultPtrElementType(),
- subGepOp, newIndices, gepOp.getInbounds());
- return success();
-}
-
-//===----------------------------------------------------------------------===//
-// Type consistency pass
-//===----------------------------------------------------------------------===//
-
-namespace {
-struct LLVMTypeConsistencyPass
- : public LLVM::impl::LLVMTypeConsistencyBase<LLVMTypeConsistencyPass> {
- void runOnOperation() override {
- RewritePatternSet rewritePatterns(&getContext());
- rewritePatterns.add<CanonicalizeAlignedGep>(&getContext());
- rewritePatterns.add<SplitStores>(&getContext(), maxVectorSplitSize);
- rewritePatterns.add<SplitGEP>(&getContext());
- FrozenRewritePatternSet frozen(std::move(rewritePatterns));
-
- if (failed(applyPatternsAndFoldGreedily(getOperation(), frozen)))
- signalPassFailure();
- }
-};
-} // namespace
-
-std::unique_ptr<Pass> LLVM::createTypeConsistencyPass() {
- return std::make_unique<LLVMTypeConsistencyPass>();
-}
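The splitIntegerStore helper above decomposes a wide store with a shift followed by a truncation per field. Below is a self-contained C++ sketch of the same arithmetic, with a made-up packed field layout of {4, 2, 2} bytes and a byte-offset-based (little-endian style) decomposition assumed purely for illustration:

  #include <cstdint>
  #include <cstdio>
  #include <vector>

  int main() {
    uint64_t storeValue = 0x1122334455667788ULL; // value of the original store
    std::vector<unsigned> fieldSizes = {4, 2, 2}; // bytes; hypothetical layout

    unsigned currentOffset = 0;
    for (unsigned size : fieldSizes) {
      // Shift right by the field offset in bits (the LShrOp above) ...
      uint64_t shifted = storeValue >> (currentOffset * 8);
      // ... then keep only this field's bytes (the TruncOp above).
      uint64_t mask = (size >= 8) ? ~0ULL : ((1ULL << (size * 8)) - 1);
      uint64_t field = shifted & mask;
      std::printf("store i%u 0x%llx at byte offset %u\n", size * 8,
                  (unsigned long long)field, currentOffset);
      currentOffset += size;
    }
  }

Each printed line corresponds to one of the per-field StoreOps the pattern emits through an i8 GEP at the matching byte offset.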
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index 13582a1..9b31217 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -2523,7 +2523,8 @@ DiagnosedSilenceableFailure transform::TileReductionUsingForOp::applyToOne(
if (failed(result))
return emitDefaultSilenceableFailure(target);
- results.push_back(result->initialOp);
+ for (Value initValue : result->initialValues)
+ results.push_back(initValue.getDefiningOp());
results.push_back(result->parallelTiledOp);
results.push_back(result->mergeOp);
results.push_back(result->loops.front());
@@ -2574,7 +2575,8 @@ DiagnosedSilenceableFailure transform::TileReductionUsingForallOp::applyToOne(
diag.attachNote(target.getLoc()) << "target operation";
return diag;
}
- results.push_back(result->initialOp);
+ for (Value initValue : result->initialValues)
+ results.push_back(initValue.getDefiningOp());
results.push_back(result->parallelTiledOp);
results.push_back(result->mergeOp);
results.push_back(result->loops);
diff --git a/mlir/lib/Dialect/Linalg/Transforms/BlockPackMatmul.cpp b/mlir/lib/Dialect/Linalg/Transforms/BlockPackMatmul.cpp
index c07d138..91d4efa 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/BlockPackMatmul.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/BlockPackMatmul.cpp
@@ -244,8 +244,7 @@ struct BlockPackMatmul<linalg::GenericOp>
LogicalResult matchAndRewrite(linalg::GenericOp linalgOp,
PatternRewriter &rewriter) const override {
// Match suitable generics.
- if (failed(linalg::detail::verifyContractionInterface(
- linalgOp.getOperation()))) {
+ if (!linalg::isaContractionOpInterface(linalgOp)) {
return rewriter.notifyMatchFailure(linalgOp, "not a contraction");
}
diff --git a/mlir/lib/Dialect/Linalg/Transforms/MeshShardingInterfaceImpl.cpp b/mlir/lib/Dialect/Linalg/Transforms/MeshShardingInterfaceImpl.cpp
index 146e880..24001c5 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/MeshShardingInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/MeshShardingInterfaceImpl.cpp
@@ -36,6 +36,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include <iterator>
+#include <numeric>
#include <optional>
#include <utility>
@@ -155,12 +156,12 @@ static Value createDestinationPassingStyleInitOperand(
tensor::getMixedSizes(builder, builder.getLoc(), spmdizedOperand);
PartialReductionOpInterface partialReductionIface =
llvm::cast<PartialReductionOpInterface>(op.getOperation());
- FailureOr<Operation *> reductionNeutralTensorOp =
+ assert(op->getNumResults() == 1 && "Multiple results not supported.");
+ FailureOr<SmallVector<Value>> reductionNeutralTensor =
partialReductionIface.generateInitialTensorForPartialReduction(
builder, builder.getLoc(), shape, {});
- assert(succeeded(reductionNeutralTensorOp));
- builder.create<scf::YieldOp>(
- reductionNeutralTensorOp.value()->getResult(0));
+ assert(succeeded(reductionNeutralTensor));
+ builder.create<scf::YieldOp>(reductionNeutralTensor.value());
}
return ifOp.getResult(0);
}
@@ -173,8 +174,7 @@ static SmallVector<Value> createDestinationPassingStyleInitOperands(
ImplicitLocOpBuilder &builder) {
// TODO: add support for multiple destination passing style initial value
// operands.
- // PartialReductionOpInterface::generateInitialTensorForPartialReduction
- // needs to also support multiple DPS initial operands.
+ assert(op.getNumDpsInits() == 1 && "Multiple initial values not supported.");
SmallVector<Value> newOperands = llvm::to_vector(spmdizedOperands);
auto operandIdx = op.getDpsInitOperand(0)->getOperandNumber();
Value spmdizedInitOperand =
@@ -279,6 +279,20 @@ struct StructuredOpShardingInterface
return res;
}
+ SmallVector<ReductionKind>
+ getReductionLoopIteratorKinds(Operation *op) const {
+ LinalgOp linalgOp = llvm::cast<LinalgOp>(op);
+ SmallVector<utils::IteratorType> iteratorTypes =
+ linalgOp.getIteratorTypesArray();
+ unsigned reductionItersCount = std::accumulate(
+ iteratorTypes.begin(), iteratorTypes.end(), 0,
+ [](unsigned count, utils::IteratorType iter) {
+ return count + (iter == utils::IteratorType::reduction);
+ });
+ mesh::ReductionKind reductionKind = getReductionKindOfLinalgOp(linalgOp);
+ return SmallVector<ReductionKind>(reductionItersCount, reductionKind);
+ }
+
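The accumulate call in getReductionLoopIteratorKinds above simply counts how many loops are reductions and repeats the op's reduction kind that many times. A trivial standalone version of the counting step, with iterator kinds stubbed as strings since this sketch does not link against MLIR:

  #include <iostream>
  #include <numeric>
  #include <string>
  #include <vector>

  int main() {
    // Stand-ins for the utils::IteratorType values of a linalg op.
    std::vector<std::string> iteratorTypes = {"parallel", "reduction",
                                              "parallel", "reduction"};
    // Count the reduction iterators, mirroring the std::accumulate above.
    unsigned reductionItersCount = std::accumulate(
        iteratorTypes.begin(), iteratorTypes.end(), 0u,
        [](unsigned count, const std::string &iter) {
          return count + (iter == "reduction");
        });
    // One ReductionKind entry would then be emitted per reduction iterator.
    std::cout << "reduction iterators: " << reductionItersCount << "\n"; // 2
  }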
LogicalResult spmdize(Operation *op, ArrayRef<Value> spmdizedOperands,
ArrayRef<MeshShardingAttr> operandShardings,
ArrayRef<MeshShardingAttr> resultShardings,
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
index df4089d6..fd314ef 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
@@ -692,12 +692,13 @@ FailureOr<linalg::ForallReductionTilingResult> linalg::tileReductionUsingForall(
op, "reduction dimension must be mapped to threads");
   // 1. Create the initial tensor value.
- FailureOr<Operation *> identityTensor =
+ FailureOr<SmallVector<Value>> maybeInitTensors =
op.generateInitialTensorForPartialReduction(b, loc, numThreads,
reductionDim);
- if (failed(identityTensor))
- return b.notifyMatchFailure(op,
- "cannot create a tensor of identity value.");
+ if (failed(maybeInitTensors))
+ return b.notifyMatchFailure(
+ op, "Failed to create inital tensors for partial reduction");
+ SmallVector<Value> &initTensors = maybeInitTensors.value();
// Gather destination tensors.
SmallVector<Value> dest;
@@ -715,8 +716,8 @@ FailureOr<linalg::ForallReductionTilingResult> linalg::tileReductionUsingForall(
// 2. Create the ForallOp with an empty region.
scf::ForallOp forallOp = b.create<scf::ForallOp>(
- loc, getAsOpFoldResult(materializedNonZeroNumThreads),
- (*identityTensor)->getResults(), mapping);
+ loc, getAsOpFoldResult(materializedNonZeroNumThreads), initTensors,
+ mapping);
// 3. Calculate the tile offsets and sizes for the subsequent loop that will
// be nested under `forallOp`.
@@ -726,7 +727,7 @@ FailureOr<linalg::ForallReductionTilingResult> linalg::tileReductionUsingForall(
/*nominalTileSizes=*/std::nullopt, tiledOffsets,
tiledSizes);
- // 4. Clone the tileable op and update its destination operands to use the
+ // 4b. Clone the tileable op and update its destination operands to use the
// output bbArgs of the ForallOp.
SmallVector<Value> tilingResults;
ArrayRef<BlockArgument> destBbArgs = forallOp.getRegionIterArgs();
@@ -838,7 +839,7 @@ FailureOr<linalg::ForallReductionTilingResult> linalg::tileReductionUsingForall(
// 8. Return.
ForallReductionTilingResult results;
- results.initialOp = *identityTensor;
+ results.initialValues = initTensors;
results.loops = forallOp;
results.parallelTiledOp = tiledOp;
results.mergeOp = mergeOp;
diff --git a/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp b/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp
index bd870d4..f512be4 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp
@@ -250,7 +250,7 @@ template <typename LinalgOpTy>
struct LinalgOpPartialReductionInterface
: public PartialReductionOpInterface::ExternalModel<
LinalgOpPartialReductionInterface<LinalgOpTy>, LinalgOpTy> {
- FailureOr<Operation *> generateInitialTensorForPartialReduction(
+ FailureOr<SmallVector<Value>> generateInitialTensorForPartialReduction(
Operation *op, OpBuilder &b, Location loc, ArrayRef<OpFoldResult> sizes,
ArrayRef<int> reductionDims) const {
auto linalgOp = cast<LinalgOp>(op);
@@ -258,50 +258,58 @@ struct LinalgOpPartialReductionInterface
if (linalgOp.hasPureBufferSemantics())
return op->emitOpError("expected operation to have tensor semantics");
- // Insert the new parallel dimension based on the index of the reduction
- // loops. This could be controlled by user for more flexibility.
- SmallVector<Operation *, 4> combinerOps;
- if (!matchReduction(linalgOp.getRegionOutputArgs(), 0, combinerOps) ||
- combinerOps.size() != 1)
- return op->emitOpError("Failed to anaysis the reduction operation.");
-
- Operation *reductionOp = combinerOps[0];
- std::optional<TypedAttr> identity = arith::getNeutralElement(reductionOp);
- if (!identity.has_value())
- return op->emitOpError(
- "Failed to get an identity value for the reduction operation.");
-
- ArrayRef<int64_t> oldShape =
- linalgOp.getShape(linalgOp.getDpsInitOperand(0));
-
- // Calculate the new shape, we insert the new dimensions based on the index
- // of the reduction dimensions.
- SmallVector<int64_t> newOutputShape;
- SmallVector<Value> dynamicDims;
- int64_t currReductionDims = 0;
- DenseSet<int> reductionDimsSet(reductionDims.begin(), reductionDims.end());
- for (int64_t idx :
- llvm::seq<int64_t>(0, oldShape.size() + reductionDims.size())) {
- if (reductionDimsSet.contains(idx)) {
- dispatchIndexOpFoldResults(sizes[idx], dynamicDims, newOutputShape);
- currReductionDims++;
- continue;
+ SmallVector<Value> inits;
+ for (int initIdx = 0, e = linalgOp.getNumDpsInits(); initIdx < e;
+ ++initIdx) {
+      // Insert the new parallel dimension based on the index of the reduction
+      // loops. This could be controlled by the user for more flexibility.
+ SmallVector<Operation *, 4> combinerOps;
+ if (!matchReduction(linalgOp.getRegionOutputArgs(), initIdx,
+ combinerOps) ||
+ combinerOps.size() != 1)
+ return op->emitOpError("Failed to anaysis the reduction operation.");
+
+ Operation *reductionOp = combinerOps[0];
+ std::optional<TypedAttr> identity = arith::getNeutralElement(reductionOp);
+ if (!identity.has_value())
+ return op->emitOpError(
+ "Failed to get an identity value for the reduction operation.");
+
+ ArrayRef<int64_t> oldShape =
+ linalgOp.getShape(linalgOp.getDpsInitOperand(initIdx));
+
+      // Calculate the new shape; the new dimensions are inserted based on the
+      // index of the reduction dimensions.
+ SmallVector<int64_t> newOutputShape;
+ SmallVector<Value> dynamicDims;
+ int64_t currReductionDims = 0;
+ DenseSet<int> reductionDimsSet(reductionDims.begin(),
+ reductionDims.end());
+ for (int64_t idx :
+ llvm::seq<int64_t>(0, oldShape.size() + reductionDims.size())) {
+ if (reductionDimsSet.contains(idx)) {
+ dispatchIndexOpFoldResults(sizes[idx], dynamicDims, newOutputShape);
+ currReductionDims++;
+ continue;
+ }
+ int64_t oldIdx = idx - currReductionDims;
+ int64_t dim = oldShape[oldIdx];
+ newOutputShape.push_back(dim);
+ if (ShapedType::isDynamic(dim))
+ dynamicDims.push_back(b.create<tensor::DimOp>(
+ loc, linalgOp.getDpsInitOperand(initIdx)->get(), oldIdx));
}
- int64_t oldIdx = idx - currReductionDims;
- int64_t dim = oldShape[oldIdx];
- newOutputShape.push_back(dim);
- if (ShapedType::isDynamic(dim))
- dynamicDims.push_back(b.create<tensor::DimOp>(
- loc, linalgOp.getDpsInitOperand(0)->get(), oldIdx));
+ Value emptyTensor = b.create<tensor::EmptyOp>(
+ loc, newOutputShape,
+ linalgOp.getRegionOutputArgs()[initIdx].getType(), dynamicDims);
+ Value constantOp = b.create<arith::ConstantOp>(loc, *identity);
+ auto identityTensor =
+ b.create<linalg::FillOp>(loc, constantOp, emptyTensor);
+ inits.push_back(identityTensor.getResult(0));
}
- Value emptyTensor = b.create<tensor::EmptyOp>(
- loc, newOutputShape, linalgOp.getRegionOutputArgs()[0].getType(),
- dynamicDims);
- Value constantOp = b.create<arith::ConstantOp>(loc, *identity);
- auto identityTensor =
- b.create<linalg::FillOp>(loc, constantOp, emptyTensor);
- return identityTensor.getOperation();
+
+ return inits;
}
Operation *tileToPartialReduction(Operation *op, OpBuilder &b, Location loc,
@@ -312,44 +320,64 @@ struct LinalgOpPartialReductionInterface
OpBuilder::InsertionGuard guard(b);
auto linalgOp = cast<LinalgOp>(op);
- AffineMap oldOutputMap =
- linalgOp.getMatchingIndexingMap(linalgOp.getDpsInitOperand(0));
- SmallVector<AffineExpr> outputExpr(oldOutputMap.getNumResults() +
- reductionDims.size());
-
- for (int idx : reductionDims)
- outputExpr[idx] = b.getAffineDimExpr(idx);
- int currExpr = 0;
- for (int idx : llvm::seq<int>(0, outputExpr.size())) {
- if (outputExpr[idx])
- continue;
- outputExpr[idx] = oldOutputMap.getResult(currExpr++);
+    // Step 1. Extend the init maps with the reduction dims, since we
+    // are converting them to parallel dimensions.
+ SmallVector<AffineMap> newInitMaps;
+ newInitMaps.reserve(linalgOp.getNumDpsInits());
+ for (int idx : llvm::seq<int>(0, linalgOp.getNumDpsInits())) {
+ // TODO: linalg::Generic doesn't have getDpsInitOperands. Can replace
+ // this with a for range loop when we have it.
+ AffineMap newMap =
+ linalgOp.getMatchingIndexingMap(linalgOp.getDpsInitOperand(idx));
+ for (int redPos : reductionDims) {
+ newMap = newMap.insertResult(b.getAffineDimExpr(redPos),
+ newMap.getNumResults());
+ }
+ newInitMaps.push_back(newMap);
}
- // Step 1: Extract a slice of the input operands.
- SmallVector<Value> valuesToTile = linalgOp.getDpsInputs();
- SmallVector<Value, 4> tiledOperands = makeTiledShapes(
- b, loc, linalgOp, valuesToTile, offsets, sizes, {}, true);
+ // Step 2a: Extract a slice of the input operands.
+ SmallVector<Value, 4> tiledInputs = makeTiledShapes(
+ b, loc, linalgOp, linalgOp.getDpsInputs(), offsets, sizes, {}, true);
+
+ // Step 2b: Extract a slice of the init operands.
+ SmallVector<Value, 1> tiledInits;
+ for (auto [valueMap, valueToTile] : llvm::zip_equal(newInitMaps, init)) {
+ int64_t initRank = valueMap.getNumResults();
+ SmallVector<OpFoldResult> initOffset(initRank, b.getIndexAttr(0));
+ SmallVector<OpFoldResult> initStride(initRank, b.getIndexAttr(1));
+ SmallVector<OpFoldResult> initSizes;
+ for (AffineExpr dimExpr : valueMap.getResults()) {
+ auto dim = cast<AffineDimExpr>(dimExpr);
+ initSizes.push_back(sizes[dim.getPosition()]);
+ }
+ // TODO: Use SubsetExtractOpInterface here once available.
+ auto extractSlice = b.create<tensor::ExtractSliceOp>(
+ loc, valueToTile, initOffset, initSizes, initStride);
+ tiledInits.push_back(extractSlice);
+ }
- // Step 2: Extract the accumulator operands
- SmallVector<OpFoldResult> strides(offsets.size(), b.getIndexAttr(1));
- SmallVector<OpFoldResult> outOffsets(offsets.size(), b.getIndexAttr(0));
- // TODO: use SubsetExtractOpInterface once it is available.
- Value out = b.create<tensor::ExtractSliceOp>(loc, init[0], outOffsets,
- sizes, strides);
+ // Update the indexing maps.
+ SmallVector<AffineMap> newMaps = linalgOp.getIndexingMapsArray();
+ // Change the init maps.
+ for (int idx : llvm::seq<int>(0, linalgOp.getNumDpsInits())) {
+ // TODO: linalg::Generic doesn't have getDpsInitOperands. Can replace
+ // this with a for range loop when we have it.
+ OpOperand *initOperand = linalgOp.getDpsInitOperand(idx);
+ int64_t mapIdx = linalgOp.getIndexingMapIndex(initOperand);
+ newMaps[mapIdx] = newInitMaps[idx];
+ }
- // Step3. Create a generic op where the reduction dimensions are replaced
- // by a parallel dimension of the size of reduction.
+ // Step 3. Change the reduction dim iterator types.
SmallVector<utils::IteratorType> newIteratorTypes =
linalgOp.getIteratorTypesArray();
for (int dim : reductionDims)
newIteratorTypes[dim] = utils::IteratorType::parallel;
- SmallVector<AffineMap> newMaps = linalgOp.getIndexingMapsArray();
- newMaps.back() = AffineMap::get(newMaps.back().getNumDims(), 0, outputExpr,
- linalgOp.getContext());
+
+ // Step 4. Create the new generic op.
auto genericOp =
- b.create<GenericOp>(loc, TypeRange({out.getType()}), tiledOperands,
- ValueRange({out}), newMaps, newIteratorTypes);
+ b.create<GenericOp>(loc, ValueRange(tiledInits).getTypes(), tiledInputs,
+ tiledInits, newMaps, newIteratorTypes);
IRMapping mapping;
op->getRegion(0).cloneInto(&genericOp.getRegion(),
genericOp.getRegion().begin(), mapping);
@@ -361,40 +389,53 @@ struct LinalgOpPartialReductionInterface
ArrayRef<int> reductionDims) const {
auto linalgOp = cast<LinalgOp>(op);
- DenseSet<int> reductionDimsSet(reductionDims.begin(), reductionDims.end());
-
- // Then create a new reduction that only reduce the newly added dimensions
- // from the previous op.
- int64_t intermRank = cast<ShapedType>(partialReduce[0].getType()).getRank();
- AffineMap inputMap = b.getMultiDimIdentityMap(intermRank);
- SmallVector<utils::IteratorType> reductionIteratorTypes;
- SmallVector<AffineExpr> exprs;
-
- for (int64_t i : llvm::seq<int64_t>(0, intermRank)) {
- if (reductionDimsSet.contains(i)) {
- reductionIteratorTypes.push_back(utils::IteratorType::reduction);
- } else {
- exprs.push_back(b.getAffineDimExpr(i));
- reductionIteratorTypes.push_back(utils::IteratorType::parallel);
+ // Step 1. Recover the dims that actually need to be merged from the
+ // original operation. We can classify the original iterators as follows:
+ //
+ // parallel --> parallel
+ // reduction + not in reductionDims --> parallel (already reduced)
+ // reduction + in reductionDims --> reduction (will reduce now)
+ SmallVector<utils::IteratorType> iterators(linalgOp.getNumLoops(),
+ utils::IteratorType::parallel);
+ for (int redIdx : reductionDims)
+ iterators[redIdx] = utils::IteratorType::reduction;
+
+ // Step 2. For each partial result, create a map to index it. This map
+ // is simply the indexing map for the original result with reductionDims
+ // appended (as produced in tileToPartialReduction).
+ int64_t numInits = linalgOp.getNumDpsInits();
+ SmallVector<AffineMap> indexingMaps(numInits * 2);
+ for (int idx : llvm::seq<int>(0, numInits)) {
+ AffineMap &inputMap = indexingMaps[idx];
+ AffineMap &outputMap = indexingMaps[numInits + idx];
+
+ outputMap =
+ linalgOp.getMatchingIndexingMap(linalgOp.getDpsInitOperand(idx));
+ inputMap = outputMap;
+ for (int redPos : reductionDims) {
+ inputMap = inputMap.insertResult(b.getAffineDimExpr(redPos),
+ inputMap.getNumResults());
}
}
- AffineMap outputMap =
- AffineMap::get(intermRank, 0, exprs, op->getContext());
- SmallVector<AffineMap> reductionMaps = {inputMap, outputMap};
-
- SmallVector<Operation *, 4> combinerOps;
- matchReduction(linalgOp.getRegionOutputArgs(), 0, combinerOps);
- Operation *reductionOp = combinerOps[0];
-
auto reduction = b.create<GenericOp>(
- loc, op->getResultTypes(), ValueRange({partialReduce[0]}),
- linalgOp.getDpsInits(), reductionMaps, reductionIteratorTypes,
- [reductionOp](OpBuilder &b, Location loc, ValueRange inputs) {
- Operation *clonedReductionOp = b.clone(*reductionOp);
- clonedReductionOp->setOperand(0, inputs[0]);
- clonedReductionOp->setOperand(1, inputs[1]);
- b.create<linalg::YieldOp>(loc, clonedReductionOp->getResult(0));
+ loc, op->getResultTypes(), partialReduce, linalgOp.getDpsInits(),
+ indexingMaps, iterators,
+ [&linalgOp](OpBuilder &b, Location loc, ValueRange inputs) {
+ int64_t numInits = linalgOp.getNumDpsInits();
+ SmallVector<Value> yieldedValues;
+ for (int idx : llvm::seq<int>(0, numInits)) {
+ // Get the combiner op.
+ SmallVector<Operation *, 4> combinerOps;
+ matchReduction(linalgOp.getRegionOutputArgs(), idx, combinerOps);
+ Operation *clonedReductionOp = b.clone(*combinerOps[0]);
+ // Combine the input at idx and output at numInits + idx.
+ clonedReductionOp->setOperand(0, inputs[idx]);
+ clonedReductionOp->setOperand(1, inputs[numInits + idx]);
+ // Yield.
+ yieldedValues.push_back(clonedReductionOp->getResult(0));
+ }
+ b.create<linalg::YieldOp>(loc, yieldedValues);
});
return reduction.getOperation();
}
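The reworked PartialReductionOpInterface implementation above follows the usual three-step partial-reduction scheme: fill an init tensor (now one per DPS init) with the neutral element, compute per-tile partial results with the reduction dimension turned parallel, then merge along the added dimension. A plain C++ sketch of that data flow with invented sizes, shown only to illustrate the scheme rather than the MLIR implementation:

  #include <cstdio>
  #include <numeric>
  #include <vector>

  int main() {
    std::vector<int> data(16);
    std::iota(data.begin(), data.end(), 1); // 1..16, full sum = 136

    const int numTiles = 4; // size of the newly added parallel dimension
    const int tileSize = static_cast<int>(data.size()) / numTiles;

    // Step 1: init tensor of partial results, filled with the neutral element.
    std::vector<int> partial(numTiles, 0);

    // Step 2: each tile reduces its slice independently (parallelizable).
    for (int t = 0; t < numTiles; ++t)
      for (int i = 0; i < tileSize; ++i)
        partial[t] += data[t * tileSize + i];

    // Step 3: merge the partial results along the added dimension.
    int result = 0;
    for (int p : partial)
      result += p;

    std::printf("sum = %d\n", result); // 136
  }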
diff --git a/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp b/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp
index d4329b4..ec1acbb 100644
--- a/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp
+++ b/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp
@@ -20,6 +20,7 @@
#include "mlir/IR/Location.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/TypeUtilities.h"
+#include "mlir/IR/Value.h"
#include "mlir/Interfaces/ViewLikeInterface.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Support/LogicalResult.h"
@@ -28,6 +29,7 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Support/Casting.h"
#include <algorithm>
#include <functional>
#include <iterator>
@@ -99,7 +101,7 @@ Operation *MeshDialect::materializeConstant(OpBuilder &builder, Attribute value,
static FailureOr<MeshOp> getMeshAndVerify(Operation *op,
FlatSymbolRefAttr meshSymbol,
SymbolTableCollection &symbolTable) {
- mesh::MeshOp mesh = getMesh(op, meshSymbol, symbolTable);
+ mesh::MeshOp mesh = getMeshOrNull(op, meshSymbol, symbolTable);
if (!mesh) {
return op->emitError() << "Undefined required mesh symbol \""
<< meshSymbol.getValue() << "\".";
@@ -178,6 +180,88 @@ Type mesh::shardType(Type type, MeshOp mesh, MeshShardingAttr sharding) {
return type;
}
+void mlir::mesh::maybeInsertTargetShardingAnnotation(MeshShardingAttr sharding,
+ OpOperand &operand,
+ OpBuilder &builder) {
+ OpBuilder::InsertionGuard insertionGuard(builder);
+ Value operandValue = operand.get();
+ Operation *operandOp = operand.getOwner();
+ builder.setInsertionPointAfterValue(operandValue);
+ ShardOp shardOp = dyn_cast<ShardOp>(operandOp);
+ if (shardOp && shardOp.getShard() == sharding &&
+ !shardOp.getAnnotateForUsers()) {
+    // No need to do anything; the correct sharding is already set.
+ return;
+ }
+
+ auto newShardOp =
+ builder.create<ShardOp>(operandValue.getLoc(), operandValue, sharding,
+ /*annotate_for_users*/ false);
+ IRRewriter rewriter(builder);
+ rewriter.replaceUsesWithIf(
+ operandValue, newShardOp, [operandOp, operandValue](OpOperand &use) {
+ return use.getOwner() == operandOp && use.get() == operandValue;
+ });
+
+ if (!shardOp || shardOp.getAnnotateForUsers()) {
+ return;
+ }
+
+ auto newShardOp2 = builder.create<ShardOp>(
+ operandValue.getLoc(), newShardOp, sharding, /*annotate_for_users*/ true);
+ rewriter.replaceAllUsesExcept(newShardOp, newShardOp2, newShardOp2);
+}
+
+void mlir::mesh::maybeInsertTargetShardingAnnotation(MeshShardingAttr sharding,
+ OpResult result,
+ OpBuilder &builder) {
+ for (auto &use : llvm::make_early_inc_range(result.getUses())) {
+ maybeInsertTargetShardingAnnotation(sharding, use, builder);
+ }
+}
+
+void mlir::mesh::maybeInsertSourceShardingAnnotation(MeshShardingAttr sharding,
+ OpOperand &operand,
+ OpBuilder &builder) {
+ OpBuilder::InsertionGuard insertionGuard(builder);
+ Value operandValue = operand.get();
+ Operation *operandOp = operand.getOwner();
+ Operation *operandSrcOp = operandValue.getDefiningOp();
+ bool isBlockArg = !operandSrcOp;
+ ShardOp shardOp = dyn_cast_or_null<ShardOp>(operandSrcOp);
+
+ if (shardOp && shardOp.getShard() == sharding &&
+ shardOp.getAnnotateForUsers()) {
+    // No need to do anything; the correct sharding is already set.
+ return;
+ }
+
+ builder.setInsertionPoint(operandOp);
+ auto newShardOp =
+ builder.create<ShardOp>(operandValue.getLoc(), operandValue, sharding,
+ /*annotate_for_users*/ true);
+ IRRewriter rewriter(builder);
+ rewriter.replaceUsesWithIf(
+ operandValue, newShardOp, [operandOp, operandValue](OpOperand &use) {
+ return use.getOwner() == operandOp && use.get() == operandValue;
+ });
+
+ if (isBlockArg || !shardOp || !shardOp.getAnnotateForUsers()) {
+ // No need for resharding.
+ return;
+ }
+
+ builder.setInsertionPoint(newShardOp);
+ auto newPreceedingShardOp =
+ builder.create<ShardOp>(operandValue.getLoc(), operandValue, sharding,
+ /*annotate_for_users*/ false);
+ rewriter.replaceUsesWithIf(newShardOp.getOperand(), newPreceedingShardOp,
+ [&newShardOp](OpOperand &use) {
+ return use.getOwner() ==
+ newShardOp.getOperation();
+ });
+}
+
//===----------------------------------------------------------------------===//
// mesh.mesh op
//===----------------------------------------------------------------------===//
@@ -286,6 +370,10 @@ bool MeshShardingAttr::operator==(Attribute rhs) const {
return rhsAsMeshShardingAttr && *this == rhsAsMeshShardingAttr;
}
+bool MeshShardingAttr::operator!=(Attribute rhs) const {
+ return !(*this == rhs);
+}
+
bool MeshShardingAttr::operator==(MeshShardingAttr rhs) const {
if (getMesh() != rhs.getMesh() || getPartialAxes() != rhs.getPartialAxes()) {
return false;
@@ -311,6 +399,10 @@ bool MeshShardingAttr::operator==(MeshShardingAttr rhs) const {
std::mem_fn(&MeshAxesAttr::empty));
}
+bool MeshShardingAttr::operator!=(MeshShardingAttr rhs) const {
+ return !(*this == rhs);
+}
+
//===----------------------------------------------------------------------===//
// mesh.shard op
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Mesh/Interfaces/ShardingInterface.cpp b/mlir/lib/Dialect/Mesh/Interfaces/ShardingInterface.cpp
index dbb9e66..54fc91c 100644
--- a/mlir/lib/Dialect/Mesh/Interfaces/ShardingInterface.cpp
+++ b/mlir/lib/Dialect/Mesh/Interfaces/ShardingInterface.cpp
@@ -13,6 +13,7 @@
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/Support/LLVM.h"
+#include "mlir/Support/LogicalResult.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
@@ -388,22 +389,11 @@ FailureOr<ShardingOption> mesh::detail::defaultGetShardingOption(
return shardingOption;
}
-//===----------------------------------------------------------------------===//
-// detail::defaultAddShardingAnnotations
-//===----------------------------------------------------------------------===//
-
-// To add a `mesh.shard` op for the given result, based on the details provided
-// in `shardingOption`, `map`, and `loopTypes`.
-static LogicalResult addShardOp(OpBuilder &b, OpResult result,
- const ShardingOption &shardingOption,
- AffineMap map,
- ArrayRef<utils::IteratorType> loopTypes,
- ArrayRef<ReductionKind> reductionLoopKinds) {
- FailureOr<std::pair<bool, MeshShardingAttr>> maybeSharding =
- getMeshShardingAttr(result);
- if (succeeded(maybeSharding) && !maybeSharding->first)
- return success();
-
+// Get the sharding attribute for the given result and sharding option.
+MeshShardingAttr
+getShardingAttribute(OpResult result, const ShardingOption &shardingOption,
+ AffineMap map, ArrayRef<utils::IteratorType> loopTypes,
+ ArrayRef<ReductionKind> reductionLoopKinds) {
auto resultType = cast<RankedTensorType>(result.getType());
SmallVector<SmallVector<MeshAxis>> splitAxes(resultType.getRank());
SmallVector<MeshAxis> partialAxes;
@@ -438,26 +428,15 @@ static LogicalResult addShardOp(OpBuilder &b, OpResult result,
}
removeTrailingEmptySubArray(splitAxes);
- MeshShardingAttr shardAttr = MeshShardingAttr::get(
- b.getContext(), shardingOption.mesh, splitAxes, partialAxes, partialType);
- OpBuilder::InsertionGuard guard(b);
- b.setInsertionPointAfterValue(result);
- auto shardOp = b.create<ShardOp>(result.getLoc(), resultType, result,
- shardAttr, /*annotate_for_users*/ false);
- result.replaceAllUsesExcept(shardOp, shardOp);
- return success();
+ return MeshShardingAttr::get(result.getContext(), shardingOption.mesh,
+ splitAxes, partialAxes, partialType);
}
-// To add a `mesh.shard` op for the given operand, based on the details provided
-// in `shardingOption`, `map`, and `loopTypes`.
-static LogicalResult addShardOp(OpBuilder &b, OpOperand &opOperand,
- const ShardingOption &shardingOption,
- AffineMap map) {
- auto maybeShardingAttr = getMeshShardingAttr(opOperand);
- if (succeeded(maybeShardingAttr) && maybeShardingAttr->first)
- return success();
- Value operand = opOperand.get();
- auto operandType = cast<RankedTensorType>(operand.getType());
+static FailureOr<MeshShardingAttr>
+getShardingAttribute(OpOperand &opOperand, const ShardingOption &shardingOption,
+ AffineMap map) {
+ Value operandValue = opOperand.get();
+ auto operandType = cast<RankedTensorType>(operandValue.getType());
SmallVector<SmallVector<MeshAxis>> splitAxes(operandType.getRank());
unsigned numDims = map.getNumDims();
for (auto it : llvm::enumerate(map.getResults())) {
@@ -483,19 +462,79 @@ static LogicalResult addShardOp(OpBuilder &b, OpOperand &opOperand,
}
removeTrailingEmptySubArray(splitAxes);
- MeshShardingAttr shardAttr =
- MeshShardingAttr::get(b.getContext(), shardingOption.mesh, splitAxes);
+ return MeshShardingAttr::get(opOperand.get().getContext(),
+ shardingOption.mesh, splitAxes);
+}
+
+FailureOr<SmallVector<MeshShardingAttr>>
+mesh::detail::defaultGetShardingAnnotations(
+ Operation *op, const ShardingOption &shardingOption) {
+ SmallVector<MeshShardingAttr> res;
+
+ ShardingInterface shardingOp = llvm::cast<ShardingInterface>(op);
+ SmallVector<utils::IteratorType> loopTypes =
+ shardingOp.getLoopIteratorTypes();
+ SmallVector<ReductionKind> reductionKinds =
+ shardingOp.getReductionLoopIteratorKinds();
+ SmallVector<AffineMap> maps = shardingOp.getIndexingMaps();
+ unsigned numOperands = op->getNumOperands();
+
+ for (OpOperand &opOperand : op->getOpOperands()) {
+ FailureOr<MeshShardingAttr> shardingAttr = getShardingAttribute(
+ opOperand, shardingOption, maps[opOperand.getOperandNumber()]);
+ if (failed(shardingAttr))
+ return failure();
+ res.push_back(*shardingAttr);
+ }
+
+ for (OpResult result : op->getResults()) {
+ res.push_back(getShardingAttribute(
+ result, shardingOption, maps[numOperands + result.getResultNumber()],
+ loopTypes, reductionKinds));
+ }
+
+ return res;
+}
+
+//===----------------------------------------------------------------------===//
+// detail::defaultAddShardingAnnotations
+//===----------------------------------------------------------------------===//
+
+// To add a `mesh.shard` op for the given result, based on the details provided
+// in `shardingOption`, `map`, and `loopTypes`.
+static LogicalResult addShardOp(OpBuilder &b, OpResult result,
+ const ShardingOption &shardingOption,
+ AffineMap map,
+ ArrayRef<utils::IteratorType> loopTypes,
+ ArrayRef<ReductionKind> reductionLoopKinds) {
+ MeshShardingAttr shardAttr = getShardingAttribute(
+ result, shardingOption, map, loopTypes, reductionLoopKinds);
+ maybeInsertTargetShardingAnnotation(shardAttr, result, b);
+
+ return success();
+}
+
+// To add a `mesh.shard` op for the given operand, based on the details provided
+// in `shardingOption`, `map`, and `loopTypes`.
+static LogicalResult addShardOp(OpBuilder &b, OpOperand &opOperand,
+ const ShardingOption &shardingOption,
+ AffineMap map) {
+
+ FailureOr<MeshShardingAttr> shardAttr =
+ getShardingAttribute(opOperand, shardingOption, map);
+ if (failed(shardAttr)) {
+ return failure();
+ }
OpBuilder::InsertionGuard guard(b);
- b.setInsertionPoint(opOperand.getOwner());
- auto shardOp = b.create<ShardOp>(operand.getLoc(), operandType, operand,
- shardAttr, true);
- opOperand.set(shardOp);
+ maybeInsertSourceShardingAnnotation(*shardAttr, opOperand, b);
return success();
}
LogicalResult mesh::detail::defaultAddShardingAnnotations(
Operation *op, OpBuilder &b, const ShardingOption &shardingOption) {
+ assert(!shardingOption.empty && shardingOption.mesh);
+
ShardingInterface shardingOp = llvm::cast<ShardingInterface>(op);
SmallVector<utils::IteratorType> loopTypes =
shardingOp.getLoopIteratorTypes();
diff --git a/mlir/lib/Dialect/Mesh/Transforms/ShardingPropagation.cpp b/mlir/lib/Dialect/Mesh/Transforms/ShardingPropagation.cpp
index 29320f1..870ac4a 100644
--- a/mlir/lib/Dialect/Mesh/Transforms/ShardingPropagation.cpp
+++ b/mlir/lib/Dialect/Mesh/Transforms/ShardingPropagation.cpp
@@ -12,9 +12,16 @@
#include "mlir/Dialect/Mesh/IR/MeshDialect.h"
#include "mlir/Dialect/Mesh/IR/MeshOps.h"
#include "mlir/Dialect/Mesh/Interfaces/ShardingInterface.h"
+#include "mlir/IR/Verifier.h"
#include "mlir/Interfaces/FunctionInterfaces.h"
#include "mlir/Pass/Pass.h"
+#include "mlir/Support/LogicalResult.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
#include <vector>
namespace mlir {
@@ -30,6 +37,70 @@ namespace mesh {
using namespace mlir;
using namespace mlir::mesh;
+enum class ReshardingRquirementKind {
+ NO_RESHARDING = 0,
+ NO_RESHARDING_FOR_EXPLICIT_ANNOTATIONS,
+ RESHARDING_FOR_EXPLICIT_ANNOTATIONS
+};
+
+#ifdef LLVM_DEBUG
+
+template <typename T>
+static llvm::raw_ostream &operator<<(llvm::raw_ostream &stream,
+ const SmallVector<T> &vec);
+template <typename... Ts>
+static llvm::raw_ostream &operator<<(llvm::raw_ostream &stream,
+ const std::tuple<Ts...> &t);
+static llvm::raw_ostream &operator<<(llvm::raw_ostream &stream,
+ ReshardingRquirementKind v);
+
+template <typename Stream, typename Range>
+static Stream &printRange(Stream &stream, Range &&range) {
+ stream << "[";
+ llvm::for_each(range, [&stream](auto &v) {
+ stream << v;
+ stream << ", ";
+ });
+ return stream << "]";
+}
+
+template <typename T>
+static llvm::raw_ostream &operator<<(llvm::raw_ostream &stream,
+ const SmallVector<T> &vec) {
+ return printRange(stream, vec);
+}
+
+[[maybe_unused]] static llvm::raw_ostream &operator<<(llvm::raw_ostream &stream,
+ const ShardingOption &v) {
+ return stream << "{empty = " << v.empty << ", mesh" << v.mesh
+ << ", shardingArray = " << v.shardingArray << "}";
+}
+
+template <typename Stream, typename... Ts, size_t... Is>
+static Stream &printTuple(Stream &stream, std::tuple<Ts...> tuple,
+ std::index_sequence<Is...>) {
+ static_assert(sizeof...(Is) == sizeof...(Ts),
+ "Indices must have same number of elements as tuple types!");
+ static_assert(sizeof...(Ts) > 0, "Cannot insert empty tuple into stream.");
+
+ stream << "{";
+ ((stream << std::get<Is>(tuple) << ", "), ...);
+ return stream << "}";
+}
+
+template <typename... Ts>
+static llvm::raw_ostream &operator<<(llvm::raw_ostream &stream,
+ const std::tuple<Ts...> &t) {
+ return printTuple(stream, t, std::index_sequence_for<Ts...>{});
+}
+
+[[maybe_unused]] static llvm::raw_ostream &
+operator<<(llvm::raw_ostream &stream, ReshardingRquirementKind v) {
+ return stream << static_cast<int>(v);
+}
+
+#endif // LLVM_DEBUG
+
//===----------------------------------------------------------------------===//
// Utilities
//===----------------------------------------------------------------------===//
@@ -77,6 +148,138 @@ getOrderedPossibleShardingAttrs(ArrayRef<MeshShardingAttr> mustShardings,
return allShardingAttrs;
}
+// The order of preference is from highest to lowest:
+// 1. No resharding is required (all existing annotations are compatible).
+// 2. No resharding for operands/results that have an annotation specifically
+//    targeting this operation. This means
+//    * operands that are the result of `mesh.shard` ops marked with
+//      `annotate_for_users`.
+//    * results that are annotated with `mesh.shard` ops without
+//      `annotate_for_users`.
+// 3. All other cases. Resharding is required for operands/results with
+//    annotations explicitly targeting this operation.
+ReshardingRquirementKind getReshardingRquirementKind(
+ Operation *op,
+ const SmallVector<MeshShardingAttr> &operandAndResultShardings) {
+ ReshardingRquirementKind res = ReshardingRquirementKind::NO_RESHARDING;
+
+ size_t operandsCount = op->getOperands().size();
+ auto operandShardings =
+ llvm::make_range(operandAndResultShardings.begin(),
+ operandAndResultShardings.begin() + operandsCount);
+ auto resultShardings =
+ llvm::make_range(operandAndResultShardings.begin() + operandsCount,
+ operandAndResultShardings.end());
+
+ for (auto [operand, sharding] :
+ llvm::zip_equal(op->getOperands(), operandShardings)) {
+ ShardOp shardOp = llvm::dyn_cast_or_null<ShardOp>(operand.getDefiningOp());
+ if (!shardOp) {
+ continue;
+ }
+ bool needsResharding = shardOp.getShardAttr() != sharding;
+ bool isExplicitAnnotationForThisOp = shardOp.getAnnotateForUsers();
+ if (needsResharding) {
+ if (isExplicitAnnotationForThisOp) {
+ // This is the worst case. No need to continue.
+ return ReshardingRquirementKind::RESHARDING_FOR_EXPLICIT_ANNOTATIONS;
+ }
+ res = ReshardingRquirementKind::NO_RESHARDING_FOR_EXPLICIT_ANNOTATIONS;
+ }
+ }
+
+ for (auto [result, sharding] :
+ llvm::zip_equal(op->getResults(), resultShardings)) {
+ for (auto user : result.getUsers()) {
+ ShardOp shardOp = llvm::dyn_cast<ShardOp>(user);
+ if (!shardOp) {
+ continue;
+ }
+ bool needsResharding = shardOp.getShardAttr() != sharding;
+ bool isExplicitAnnotationForThisOp = !shardOp.getAnnotateForUsers();
+ if (needsResharding) {
+ if (isExplicitAnnotationForThisOp) {
+ // This is the worst case. No need to continue.
+ return ReshardingRquirementKind::RESHARDING_FOR_EXPLICIT_ANNOTATIONS;
+ }
+ res = ReshardingRquirementKind::NO_RESHARDING_FOR_EXPLICIT_ANNOTATIONS;
+ }
+ }
+ }
+
+ return res;
+}
+
+// From all the operand and result sharding combinations,
+// return the one that is most desirable.
+// The order of preference is:
+// 1. No resharding with respect to existing sharding annotations.
+// 2. Resharding for values that already have annotations that do not target
+// this op.
+// 3. Resharding of existing explicit sharding annotations for this op.
+static FailureOr<ShardingOption> selectShardingOption(
+ ShardingInterface shardingOp,
+ ArrayRef<SmallVector<MeshShardingAttr>> possibleOperandShardingAttrs,
+ ArrayRef<SmallVector<MeshShardingAttr>> possibleResultShardingAttrs) {
+ SmallVector<std::tuple<ShardingOption, ReshardingRquirementKind>>
+ shardingOptionsAndReshardingRequirements;
+
+ for (ArrayRef<MeshShardingAttr> resultShardings :
+ possibleResultShardingAttrs) {
+ for (ArrayRef<MeshShardingAttr> operandShardings :
+ possibleOperandShardingAttrs) {
+ FailureOr<ShardingOption> shardingOption =
+ shardingOp.getShardingOption(operandShardings, resultShardings);
+ if (failed(shardingOption) || shardingOption->empty) {
+ continue;
+ }
+ // These shardings may not be the same as those in operandShardings and
+ // resultShardings.
+ // They may be missing some annotations.
+ // Whatever is returned by getShardingAnnotations is exactly what the op
+ // needs.
+ FailureOr<SmallVector<MeshShardingAttr>> operandAndResultShardings =
+ shardingOp.getShardingAnnotations(*shardingOption);
+ if (failed(operandAndResultShardings)) {
+ return failure();
+ }
+
+ LLVM_DEBUG(DBGS() << "operandAndResultShardings = "
+ << *operandAndResultShardings << "\n";);
+
+ ReshardingRquirementKind reshardingRquirement =
+ getReshardingRquirementKind(shardingOp, *operandAndResultShardings);
+ if (reshardingRquirement == ReshardingRquirementKind::NO_RESHARDING) {
+ // This is the best case. No need to go on.
+ return *shardingOption;
+ }
+
+ shardingOptionsAndReshardingRequirements.emplace_back(
+ std::move(*shardingOption), reshardingRquirement);
+ }
+ }
+
+ if (shardingOptionsAndReshardingRequirements.empty()) {
+ return ShardingOption::makeEmpty();
+ }
+
+ std::partial_sort(
+ shardingOptionsAndReshardingRequirements.begin(),
+ shardingOptionsAndReshardingRequirements.begin() + 1,
+ shardingOptionsAndReshardingRequirements.end(),
+ [](const std::tuple<ShardingOption, ReshardingRquirementKind> &a,
+ const std::tuple<ShardingOption, ReshardingRquirementKind> &b) {
+ return std::get<ReshardingRquirementKind>(a) <
+ std::get<ReshardingRquirementKind>(b);
+ });
+
+ LLVM_DEBUG(DBGS() << "shardingOptionsAndReshardingRequirements = "
+ << shardingOptionsAndReshardingRequirements << "\n";);
+
+ return std::get<ShardingOption>(
+ shardingOptionsAndReshardingRequirements.front());
+}
+
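A minimal sketch of the ranking performed by selectShardingOption above, with the sharding options stubbed out as strings and a hypothetical requirement enum standing in for ReshardingRquirementKind; only the pick-the-best-ranked-candidate step is reproduced:

  #include <algorithm>
  #include <cstdio>
  #include <string>
  #include <utility>
  #include <vector>

  // Stand-in for ReshardingRquirementKind; lower values are preferred.
  enum class Requirement {
    NoResharding = 0,
    NoReshardingForExplicitAnnotations,
    ReshardingForExplicitAnnotations
  };

  int main() {
    // Hypothetical candidates: (sharding option, resharding requirement).
    std::vector<std::pair<std::string, Requirement>> candidates = {
        {"option A", Requirement::ReshardingForExplicitAnnotations},
        {"option B", Requirement::NoReshardingForExplicitAnnotations},
        {"option C", Requirement::ReshardingForExplicitAnnotations},
    };

    // Only the single best-ranked candidate is needed, hence a partial sort
    // of one element rather than a full sort.
    std::partial_sort(candidates.begin(), candidates.begin() + 1,
                      candidates.end(), [](const auto &a, const auto &b) {
                        return a.second < b.second;
                      });

    std::printf("selected: %s\n", candidates.front().first.c_str()); // option B
  }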
// For each operation that implements the ShardingInterface, infer the sharding
// option of the operation from its operands and/or results using the
// `getShardingOption` method. If the inferred sharding option is not empty, add
@@ -135,32 +338,21 @@ static LogicalResult visitOp(Operation *op, OpBuilder &builder) {
SmallVector<SmallVector<MeshShardingAttr>> possibleResultShardingAttrs =
getOrderedPossibleShardingAttrs(resultMustShardings,
allowConflictsResultShardings);
- FailureOr<ShardingOption> finalShardingOption = failure();
- for (ArrayRef<MeshShardingAttr> resultShardings :
- possibleResultShardingAttrs) {
- if (succeeded(finalShardingOption))
- break;
- for (ArrayRef<MeshShardingAttr> operandShardings :
- possibleOperandShardingAttrs) {
- FailureOr<ShardingOption> shardingOption =
- shardingOp.getShardingOption(operandShardings, resultShardings);
- if (succeeded(shardingOption)) {
- finalShardingOption = shardingOption;
- break;
- }
- }
- }
+ FailureOr<ShardingOption> shardingOption = selectShardingOption(
+ shardingOp, possibleOperandShardingAttrs, possibleResultShardingAttrs);
- if (failed(finalShardingOption)) {
+ if (failed(shardingOption)) {
op->emitOpError() << "fail to get sharding option.";
return failure();
}
+
+ LLVM_DEBUG(DBGS() << "Selected sharding option: " << *shardingOption << "\n");
+
// sharding info is empty, return immediately
- if (finalShardingOption->empty)
+ if (shardingOption->empty)
return success();
- if (failed(
- shardingOp.addShardingAnnotations(builder, *finalShardingOption))) {
+ if (failed(shardingOp.addShardingAnnotations(builder, *shardingOption))) {
op->emitOpError() << "fail to set sharding annotations.";
return failure();
}
@@ -199,6 +391,7 @@ struct ShardingPropagation
LLVM_DEBUG(DBGS() << "After reversed order propagation:\n"
<< funcOp << "\n");
+ LLVM_DEBUG(assert(succeeded(mlir::verify(funcOp))));
// 2. propagate in original order
for (Operation &op : llvm::make_early_inc_range(block))
diff --git a/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp b/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp
index 6b1326d..f3e4b15 100644
--- a/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp
+++ b/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp
@@ -493,8 +493,6 @@ TypedValue<ShapedType> reshard(ImplicitLocOpBuilder &builder, MeshOp mesh,
TypedValue<ShapedType> reshard(OpBuilder &builder, MeshOp mesh, ShardOp source,
ShardOp target,
TypedValue<ShapedType> sourceShardValue) {
- assert(!source.getAnnotateForUsers());
- assert(target.getAnnotateForUsers());
assert(source.getResult() == target.getOperand());
ImplicitLocOpBuilder implicitLocOpBuilder(target->getLoc(), builder);
return reshard(
@@ -628,7 +626,6 @@ spmdizeOperation(ShardOp shardOp, IRMapping &spmdizationMap,
targetSpmdValue = spmdizationMap.lookup(shardOp.getOperand());
} else {
// Insert resharding.
- assert(!srcShardOp.getAnnotateForUsers() && shardOp.getAnnotateForUsers());
TypedValue<ShapedType> srcSpmdValue = cast<TypedValue<ShapedType>>(
spmdizationMap.lookup(srcShardOp.getOperand()));
targetSpmdValue = reshard(builder, srcShardOp, shardOp, srcSpmdValue,
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index 24a6d5b..1108730 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -1789,7 +1789,7 @@ LogicalResult DistributeOp::verify() {
}
//===----------------------------------------------------------------------===//
-// ReductionOp
+// DeclareReductionOp
//===----------------------------------------------------------------------===//
static ParseResult parseAtomicReductionRegion(OpAsmParser &parser,
@@ -1881,21 +1881,6 @@ LogicalResult DeclareReductionOp::verifyRegions() {
return success();
}
-LogicalResult ReductionOp::verify() {
- auto *op = (*this)->getParentWithTrait<ReductionClauseInterface::Trait>();
- if (!op)
- return emitOpError() << "must be used within an operation supporting "
- "reduction clause interface";
- while (op) {
- for (const auto &var :
- cast<ReductionClauseInterface>(op).getAllReductionVars())
- if (var == getAccumulator())
- return success();
- op = op->getParentWithTrait<ReductionClauseInterface::Trait>();
- }
- return emitOpError() << "the accumulator is not used by the parent";
-}
-
//===----------------------------------------------------------------------===//
// TaskOp
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Polynomial/IR/PolynomialAttributes.cpp b/mlir/lib/Dialect/Polynomial/IR/PolynomialAttributes.cpp
index 890ce52..cc7d317 100644
--- a/mlir/lib/Dialect/Polynomial/IR/PolynomialAttributes.cpp
+++ b/mlir/lib/Dialect/Polynomial/IR/PolynomialAttributes.cpp
@@ -101,7 +101,7 @@ parseMonomial(AsmParser &parser, Monomial &monomial, llvm::StringRef &variable,
return success();
}
-template <typename PolynoimalAttrTy, typename Monomial>
+template <typename Monomial>
LogicalResult
parsePolynomialAttr(AsmParser &parser, llvm::SmallVector<Monomial> &monomials,
llvm::StringSet<> &variables,
@@ -155,7 +155,7 @@ Attribute IntPolynomialAttr::parse(AsmParser &parser, Type type) {
llvm::SmallVector<IntMonomial> monomials;
llvm::StringSet<> variables;
- if (failed(parsePolynomialAttr<IntPolynomialAttr, IntMonomial>(
+ if (failed(parsePolynomialAttr<IntMonomial>(
parser, monomials, variables,
[&](IntMonomial &monomial) -> OptionalParseResult {
APInt parsedCoeff(apintBitWidth, 1);
@@ -175,7 +175,6 @@ Attribute IntPolynomialAttr::parse(AsmParser &parser, Type type) {
}
return IntPolynomialAttr::get(parser.getContext(), result.value());
}
-
Attribute FloatPolynomialAttr::parse(AsmParser &parser, Type type) {
if (failed(parser.parseLess()))
return {};
@@ -191,8 +190,8 @@ Attribute FloatPolynomialAttr::parse(AsmParser &parser, Type type) {
return OptionalParseResult(result);
};
- if (failed(parsePolynomialAttr<FloatPolynomialAttr, FloatMonomial>(
- parser, monomials, variables, parseAndStoreCoefficient))) {
+ if (failed(parsePolynomialAttr<FloatMonomial>(parser, monomials, variables,
+ parseAndStoreCoefficient))) {
return {};
}
diff --git a/mlir/lib/Dialect/Polynomial/IR/PolynomialCanonicalization.td b/mlir/lib/Dialect/Polynomial/IR/PolynomialCanonicalization.td
index 9d09799..e37bcf7 100644
--- a/mlir/lib/Dialect/Polynomial/IR/PolynomialCanonicalization.td
+++ b/mlir/lib/Dialect/Polynomial/IR/PolynomialCanonicalization.td
@@ -9,11 +9,14 @@
#ifndef POLYNOMIAL_CANONICALIZATION
#define POLYNOMIAL_CANONICALIZATION
-include "mlir/Dialect/Polynomial/IR/Polynomial.td"
include "mlir/Dialect/Arith/IR/ArithOps.td"
+include "mlir/Dialect/Polynomial/IR/Polynomial.td"
+include "mlir/IR/EnumAttr.td"
include "mlir/IR/OpBase.td"
include "mlir/IR/PatternBase.td"
+defvar DefOverflow = ConstantEnumCase<Arith_IntegerOverflowAttr, "none">;
+
// Get a -1 integer attribute of the same type as the polynomial SSA value's
// ring coefficient type.
def getMinusOne
@@ -39,4 +42,40 @@ def NTTAfterINTT : Pat<
[]
>;
+// NTTs are expensive, while addition costs roughly the same in the
+// coefficient and NTT domains, so reducing the number of NTTs is optimal.
+// ntt(a) + ntt(b) -> ntt(a + b)
+def NTTOfAdd : Pat<
+ (Arith_AddIOp
+ (Polynomial_NTTOp $p1),
+ (Polynomial_NTTOp $p2),
+ $overflow),
+ (Polynomial_NTTOp (Polynomial_AddOp $p1, $p2)),
+ []
+>;
+// intt(a) + intt(b) -> intt(a + b)
+def INTTOfAdd : Pat<
+ (Polynomial_AddOp
+ (Polynomial_INTTOp $t1),
+ (Polynomial_INTTOp $t2)),
+ (Polynomial_INTTOp (Arith_AddIOp $t1, $t2, DefOverflow)),
+ []
+>;
+// The same rewrites, repeated for subtraction.
+def NTTOfSub : Pat<
+ (Arith_SubIOp
+ (Polynomial_NTTOp $p1),
+ (Polynomial_NTTOp $p2),
+ $overflow),
+ (Polynomial_NTTOp (Polynomial_SubOp $p1, $p2)),
+ []
+>;
+def INTTOfSub : Pat<
+ (Polynomial_SubOp
+ (Polynomial_INTTOp $t1),
+ (Polynomial_INTTOp $t2)),
+ (Polynomial_INTTOp (Arith_SubIOp $t1, $t2, DefOverflow)),
+ []
+>;
+
#endif // POLYNOMIAL_CANONICALIZATION
diff --git a/mlir/lib/Dialect/Polynomial/IR/PolynomialOps.cpp b/mlir/lib/Dialect/Polynomial/IR/PolynomialOps.cpp
index 1a2439f..3d30279 100644
--- a/mlir/lib/Dialect/Polynomial/IR/PolynomialOps.cpp
+++ b/mlir/lib/Dialect/Polynomial/IR/PolynomialOps.cpp
@@ -186,6 +186,88 @@ LogicalResult INTTOp::verify() {
return verifyNTTOp(this->getOperation(), ring, tensorType);
}
+ParseResult ConstantOp::parse(OpAsmParser &parser, OperationState &result) {
+  // Using the built-in parser.parseAttribute requires the full
+  // #polynomial.typed_int_polynomial syntax, which is excessive. Instead, we
+  // parse the keyword `int` or `float` to signal the kind of polynomial.
+ Type type;
+ if (succeeded(parser.parseOptionalKeyword("float"))) {
+ Attribute floatPolyAttr = FloatPolynomialAttr::parse(parser, nullptr);
+ if (floatPolyAttr) {
+ if (parser.parseColon() || parser.parseType(type))
+ return failure();
+ result.addAttribute("value",
+ TypedFloatPolynomialAttr::get(type, floatPolyAttr));
+ result.addTypes(type);
+ return success();
+ }
+ }
+
+ if (succeeded(parser.parseOptionalKeyword("int"))) {
+ Attribute intPolyAttr = IntPolynomialAttr::parse(parser, nullptr);
+ if (intPolyAttr) {
+ if (parser.parseColon() || parser.parseType(type))
+ return failure();
+
+ result.addAttribute("value",
+ TypedIntPolynomialAttr::get(type, intPolyAttr));
+ result.addTypes(type);
+ return success();
+ }
+ }
+
+ // In the worst case, still accept the verbose versions.
+ TypedIntPolynomialAttr typedIntPolyAttr;
+ OptionalParseResult res =
+ parser.parseOptionalAttribute<TypedIntPolynomialAttr>(
+ typedIntPolyAttr, "value", result.attributes);
+ if (res.has_value() && succeeded(res.value())) {
+ result.addTypes(typedIntPolyAttr.getType());
+ return success();
+ }
+
+ TypedFloatPolynomialAttr typedFloatPolyAttr;
+ res = parser.parseAttribute<TypedFloatPolynomialAttr>(
+ typedFloatPolyAttr, "value", result.attributes);
+ if (res.has_value() && succeeded(res.value())) {
+ result.addTypes(typedFloatPolyAttr.getType());
+ return success();
+ }
+
+ return failure();
+}
+
+void ConstantOp::print(OpAsmPrinter &p) {
+ p << " ";
+ if (auto intPoly = dyn_cast<TypedIntPolynomialAttr>(getValue())) {
+ p << "int";
+ intPoly.getValue().print(p);
+ } else if (auto floatPoly = dyn_cast<TypedFloatPolynomialAttr>(getValue())) {
+ p << "float";
+ floatPoly.getValue().print(p);
+ } else {
+ assert(false && "unexpected attribute type");
+ }
+ p << " : ";
+ p.printType(getOutput().getType());
+}
+
+LogicalResult ConstantOp::inferReturnTypes(
+ MLIRContext *context, std::optional<mlir::Location> location,
+ ConstantOp::Adaptor adaptor,
+ llvm::SmallVectorImpl<mlir::Type> &inferredReturnTypes) {
+ Attribute operand = adaptor.getValue();
+ if (auto intPoly = dyn_cast<TypedIntPolynomialAttr>(operand)) {
+ inferredReturnTypes.push_back(intPoly.getType());
+ } else if (auto floatPoly = dyn_cast<TypedFloatPolynomialAttr>(operand)) {
+ inferredReturnTypes.push_back(floatPoly.getType());
+ } else {
+ assert(false && "unexpected attribute type");
+ return failure();
+ }
+ return success();
+}
+
//===----------------------------------------------------------------------===//
// TableGen'd canonicalization patterns
//===----------------------------------------------------------------------===//
@@ -201,10 +283,10 @@ void SubOp::getCanonicalizationPatterns(RewritePatternSet &results,
void NTTOp::getCanonicalizationPatterns(RewritePatternSet &results,
MLIRContext *context) {
- results.add<NTTAfterINTT>(context);
+ results.add<NTTAfterINTT, NTTOfAdd, NTTOfSub>(context);
}
void INTTOp::getCanonicalizationPatterns(RewritePatternSet &results,
MLIRContext *context) {
- results.add<INTTAfterNTT>(context);
+ results.add<INTTAfterNTT, INTTOfAdd, INTTOfSub>(context);
}
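For illustration (not part of this patch): the canonicalization hooks above make the new add/sub folds fire under the standard -canonicalize pass, but a standalone driver can also apply just these patterns. A minimal sketch, assuming approximate header paths and a `foldPolynomialNTTs` helper name of our own choosing:

// Minimal sketch (assumed names/paths): apply only the Polynomial
// canonicalization patterns, including the NTT/INTT add/sub folds above.
#include "mlir/Dialect/Polynomial/IR/PolynomialOps.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

static mlir::LogicalResult foldPolynomialNTTs(mlir::ModuleOp module) {
  mlir::MLIRContext *ctx = module.getContext();
  mlir::RewritePatternSet patterns(ctx);
  // Picks up NTTAfterINTT, NTTOfAdd, NTTOfSub and the INTT counterparts.
  mlir::polynomial::NTTOp::getCanonicalizationPatterns(patterns, ctx);
  mlir::polynomial::INTTOp::getCanonicalizationPatterns(patterns, ctx);
  return mlir::applyPatternsAndFoldGreedily(module.getOperation(),
                                            std::move(patterns));
}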
diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
index 1a84a59..a72dafe 100644
--- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
@@ -182,6 +182,9 @@ static LogicalResult generateLoopNestUsingForOp(
if (loops.empty())
return success();
+ assert(tiledResults.size() == destinationTensors.size() &&
+ "Number of results of body should be equal to number of iter args");
+
// 6. Yield all the results of the tiled operation.
SmallVector<Value> yieldedValues;
for (auto [tiledValue, destinationTensor, resultOffset, resultSize] :
@@ -694,9 +697,6 @@ mlir::scf::tileReductionUsingScf(RewriterBase &b,
tileSizesVector.append(iterationDomain.size() - tileSizesVector.size(),
zero);
}
- if (op->getNumResults() != 1)
- return b.notifyMatchFailure(
- op, "don't support ops with multiple results for now");
SmallVector<utils::IteratorType> iterators =
tilingInterfaceOp.getLoopIteratorTypes();
@@ -708,12 +708,13 @@ mlir::scf::tileReductionUsingScf(RewriterBase &b,
}
  // 2. Create the initial tensor value.
- FailureOr<Operation *> identityTensor =
+ FailureOr<SmallVector<Value>> maybeInitTensors =
op.generateInitialTensorForPartialReduction(b, loc, tileSizesVector,
reductionDims);
- if (failed(identityTensor))
- return b.notifyMatchFailure(op,
- "cannot create a tensor of identity value.");
+ if (failed(maybeInitTensors)) {
+ return b.notifyMatchFailure(op, "Failed to create initial tensors.");
+ }
+ SmallVector<Value> &initTensors = maybeInitTensors.value();
// 3. Define the callback to use for generating the inner most tile loop body.
Operation *parallelOp = nullptr;
@@ -753,29 +754,26 @@ mlir::scf::tileReductionUsingScf(RewriterBase &b,
tiledResult.append(parallelOp->result_begin(), parallelOp->result_end());
// 4d. Compute the offsets and sizes needed to insert the result of the
// tiled value back into destination before yielding the destination.
- SmallVector<OpFoldResult> outOffsets(offsets.size(), b.getIndexAttr(0));
- resultOffsets.emplace_back(std::move(outOffsets));
-
- SmallVector<OpFoldResult> outSizes;
- for (size_t i = 0; i < offsets.size(); i++) {
- outSizes.push_back(
- tensor::getMixedSize(b, loc, parallelOp->getResult(0), i));
+ for (int resultIdx : llvm::seq<int>(0, parallelOp->getNumResults())) {
+ SmallVector<OpFoldResult> outOffsets(offsets.size(), b.getIndexAttr(0));
+ resultOffsets.emplace_back(std::move(outOffsets));
+
+ SmallVector<OpFoldResult> outSizes;
+ for (size_t i = 0; i < offsets.size(); i++) {
+ outSizes.push_back(
+ tensor::getMixedSize(b, loc, parallelOp->getResult(resultIdx), i));
+ }
+ resultSizes.emplace_back(std::move(outSizes));
}
- resultSizes.emplace_back(std::move(outSizes));
return success();
};
// 5. Generate the tiled implementation using the destination tensors.
- SmallVector<Value> destinationTensors =
- llvm::map_to_vector(identityTensor.value()->getResults(),
- [](OpResult res) -> Value { return res; });
-
SmallVector<LoopLikeOpInterface> loops;
scf::SCFTilingOptions options;
options.setLoopType(scf::SCFTilingOptions::LoopType::ForOp);
if (failed(generateLoopNest(b, loc, options, iterationDomain, tileSizesVector,
- destinationTensors, innerYieldTiledValuesFn,
- loops)))
+ initTensors, innerYieldTiledValuesFn, loops)))
return b.notifyMatchFailure(op, "failed to tile for parallel reduction");
SmallVector<Value> replacements = llvm::map_to_vector(
@@ -787,7 +785,7 @@ mlir::scf::tileReductionUsingScf(RewriterBase &b,
b.replaceOp(op, mergeOp->getResults());
SCFReductionTilingResult results;
- results.initialOp = *identityTensor;
+ results.initialValues = initTensors;
results.loops = loops;
results.parallelTiledOp = parallelOp;
results.mergeOp = mergeOp;
diff --git a/mlir/lib/Dialect/Tensor/Transforms/EmptyOpPatterns.cpp b/mlir/lib/Dialect/Tensor/Transforms/EmptyOpPatterns.cpp
index 7a707e7..43ad0ac 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/EmptyOpPatterns.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/EmptyOpPatterns.cpp
@@ -93,6 +93,49 @@ private:
bool foldSingleUseOnly = false;
};
+/// tensor.empty does not define any tensor contents, so an unpadded pack
+/// can be folded away.
+struct FoldEmptyTensorWithPackOp : public OpRewritePattern<PackOp> {
+ using OpRewritePattern<PackOp>::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(PackOp packOp,
+ PatternRewriter &rewriter) const override {
+ // Check for tensor.empty source.
+ auto emptyOp = packOp.getSource().getDefiningOp<EmptyOp>();
+ if (!emptyOp)
+ return failure();
+
+ // Check for padding.
+ // Packing with padding cannot be simply removed.
+ if (packOp.getPaddingValue())
+ return rewriter.notifyMatchFailure(packOp, "expects no padding value");
+
+ // Replace the pack directly with its destination.
+ rewriter.replaceOp(packOp, packOp.getDest());
+
+ return success();
+ }
+};
+
+/// tensor.empty does not define any tensor contents, so an unpack
+/// can be folded away.
+struct FoldEmptyTensorWithUnPackOp : public OpRewritePattern<UnPackOp> {
+ using OpRewritePattern<UnPackOp>::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(UnPackOp unPackOp,
+ PatternRewriter &rewriter) const override {
+ // Check for tensor.empty source.
+ auto emptyOp = unPackOp.getSource().getDefiningOp<EmptyOp>();
+ if (!emptyOp)
+ return failure();
+
+ // Replace the unpack directly with its destination.
+ rewriter.replaceOp(unPackOp, unPackOp.getDest());
+
+ return success();
+ }
+};
+
} // namespace
void mlir::tensor::populateFoldTensorEmptyPatterns(RewritePatternSet &patterns,
@@ -101,4 +144,6 @@ void mlir::tensor::populateFoldTensorEmptyPatterns(RewritePatternSet &patterns,
FoldEmptyTensorWithReshapeOp<tensor::ExpandShapeOp>,
FoldEmptyTensorWithReshapeOp<tensor::CollapseShapeOp>>(
patterns.getContext(), /*benefit=*/1, foldSingleUseOnly);
+ patterns.add<FoldEmptyTensorWithPackOp, FoldEmptyTensorWithUnPackOp>(
+ patterns.getContext(), /*benefit=*/1);
}
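For illustration (not part of this patch), a hypothetical test pass that exercises these folds; the header paths, the pass name, and the `foldSingleUseOnly` value are assumptions layered on top of the populateFoldTensorEmptyPatterns entry point shown above:

// Hypothetical test pass exercising the tensor.empty folds, now including the
// unpadded pack/unpack cases added above.
#include "mlir/Dialect/Tensor/Transforms/Transforms.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

namespace {
struct FoldTensorEmptyTestPass
    : public mlir::PassWrapper<FoldTensorEmptyTestPass, mlir::OperationPass<>> {
  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(FoldTensorEmptyTestPass)

  void runOnOperation() override {
    mlir::RewritePatternSet patterns(&getContext());
    // foldSingleUseOnly gates only the pre-existing slice/reshape folds; the
    // new pack/unpack folds are registered unconditionally with benefit 1.
    mlir::tensor::populateFoldTensorEmptyPatterns(patterns,
                                                  /*foldSingleUseOnly=*/false);
    if (mlir::failed(mlir::applyPatternsAndFoldGreedily(getOperation(),
                                                        std::move(patterns))))
      signalPassFailure();
  }
};
} // namespace

With this registration, a pack or unpack whose source is tensor.empty is replaced by its destination operand, while packs with a padding value are left untouched, as checked in FoldEmptyTensorWithPackOp.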
diff --git a/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp b/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp
index ebcb34e..5d6e3ec 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp
@@ -91,7 +91,8 @@ struct SimplifyPackToExpandShape : public OpRewritePattern<PackOp> {
RankedTensorType sourceType = packOp.getSourceType();
if (failed(isPackOnInnerMostDim(rewriter, packOp)) &&
failed(isPackOn1D(rewriter, packOp, sourceType.getShape(),
- packOp.getStaticTiles()))) {
+ packOp.getStaticTiles())) &&
+ !packOp.isLikePad()) {
return failure();
}
@@ -152,7 +153,8 @@ struct SimplifyUnPackToCollapseShape : public OpRewritePattern<UnPackOp> {
RankedTensorType destType = unpackOp.getDestType();
if (failed(isUnpackOnInnerMostDim(rewriter, unpackOp)) &&
failed(isPackOn1D(rewriter, unpackOp, destType.getShape(),
- unpackOp.getStaticTiles()))) {
+ unpackOp.getStaticTiles())) &&
+ !unpackOp.isLikeUnPad()) {
return failure();
}
diff --git a/mlir/lib/Dialect/Tensor/Transforms/ReshapePatterns.cpp b/mlir/lib/Dialect/Tensor/Transforms/ReshapePatterns.cpp
index d40e5f3..6cf0f84 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/ReshapePatterns.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/ReshapePatterns.cpp
@@ -79,12 +79,42 @@ struct FoldInsertOfRankReducingInsert : public OpRewritePattern<OpTy> {
return success();
}
};
+
+/// Fold expand_shape which only adds static dimensions of size `1`
+/// into insert_slice.
+template <typename OpTy>
+struct FoldPaddingExpandIntoInsert : public OpRewritePattern<OpTy> {
+ using OpRewritePattern<OpTy>::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(OpTy insertSliceOp,
+ PatternRewriter &rewriter) const override {
+ auto expandShapeOp = insertSliceOp.getSource()
+ .template getDefiningOp<tensor::ExpandShapeOp>();
+ if (!expandShapeOp)
+ return failure();
+
+ // Only fold away simple expansion where all added dimensions have static
+ // size `1`.
+ SliceVerificationResult res = isRankReducedType(
+ expandShapeOp.getResultType(), expandShapeOp.getSrcType());
+ if (res != SliceVerificationResult::Success)
+ return rewriter.notifyMatchFailure(insertSliceOp,
+ "expected rank increasing expansion");
+
+ rewriter.modifyOpInPlace(insertSliceOp, [&]() {
+ insertSliceOp.getSourceMutable().assign(expandShapeOp.getSrc());
+ });
+ return success();
+ }
+};
} // namespace
void mlir::tensor::populateReassociativeReshapeFoldingPatterns(
RewritePatternSet &patterns) {
patterns.add<FoldExpandOfRankReducingExtract,
FoldInsertOfRankReducingInsert<tensor::InsertSliceOp>,
- FoldInsertOfRankReducingInsert<tensor::ParallelInsertSliceOp>>(
+ FoldInsertOfRankReducingInsert<tensor::ParallelInsertSliceOp>,
+ FoldPaddingExpandIntoInsert<tensor::InsertSliceOp>,
+ FoldPaddingExpandIntoInsert<tensor::ParallelInsertSliceOp>>(
patterns.getContext());
}
diff --git a/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp b/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp
index 6af229c..fe1a67d 100644
--- a/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp
+++ b/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp
@@ -178,18 +178,24 @@ ConstantIntRanges mlir::intrange::truncRange(const ConstantIntRanges &range,
//===----------------------------------------------------------------------===//
ConstantIntRanges
-mlir::intrange::inferAdd(ArrayRef<ConstantIntRanges> argRanges) {
+mlir::intrange::inferAdd(ArrayRef<ConstantIntRanges> argRanges,
+ OverflowFlags ovfFlags) {
const ConstantIntRanges &lhs = argRanges[0], &rhs = argRanges[1];
- ConstArithFn uadd = [](const APInt &a,
- const APInt &b) -> std::optional<APInt> {
+
+ std::function uadd = [=](const APInt &a,
+ const APInt &b) -> std::optional<APInt> {
bool overflowed = false;
- APInt result = a.uadd_ov(b, overflowed);
+ APInt result = any(ovfFlags & OverflowFlags::Nuw)
+ ? a.uadd_sat(b)
+ : a.uadd_ov(b, overflowed);
return overflowed ? std::optional<APInt>() : result;
};
- ConstArithFn sadd = [](const APInt &a,
- const APInt &b) -> std::optional<APInt> {
+ std::function sadd = [=](const APInt &a,
+ const APInt &b) -> std::optional<APInt> {
bool overflowed = false;
- APInt result = a.sadd_ov(b, overflowed);
+ APInt result = any(ovfFlags & OverflowFlags::Nsw)
+ ? a.sadd_sat(b)
+ : a.sadd_ov(b, overflowed);
return overflowed ? std::optional<APInt>() : result;
};
@@ -205,19 +211,24 @@ mlir::intrange::inferAdd(ArrayRef<ConstantIntRanges> argRanges) {
//===----------------------------------------------------------------------===//
ConstantIntRanges
-mlir::intrange::inferSub(ArrayRef<ConstantIntRanges> argRanges) {
+mlir::intrange::inferSub(ArrayRef<ConstantIntRanges> argRanges,
+ OverflowFlags ovfFlags) {
const ConstantIntRanges &lhs = argRanges[0], &rhs = argRanges[1];
- ConstArithFn usub = [](const APInt &a,
- const APInt &b) -> std::optional<APInt> {
+ std::function usub = [=](const APInt &a,
+ const APInt &b) -> std::optional<APInt> {
bool overflowed = false;
- APInt result = a.usub_ov(b, overflowed);
+ APInt result = any(ovfFlags & OverflowFlags::Nuw)
+ ? a.usub_sat(b)
+ : a.usub_ov(b, overflowed);
return overflowed ? std::optional<APInt>() : result;
};
- ConstArithFn ssub = [](const APInt &a,
- const APInt &b) -> std::optional<APInt> {
+ std::function ssub = [=](const APInt &a,
+ const APInt &b) -> std::optional<APInt> {
bool overflowed = false;
- APInt result = a.ssub_ov(b, overflowed);
+ APInt result = any(ovfFlags & OverflowFlags::Nsw)
+ ? a.ssub_sat(b)
+ : a.ssub_ov(b, overflowed);
return overflowed ? std::optional<APInt>() : result;
};
ConstantIntRanges urange = computeBoundsBy(
@@ -232,19 +243,24 @@ mlir::intrange::inferSub(ArrayRef<ConstantIntRanges> argRanges) {
//===----------------------------------------------------------------------===//
ConstantIntRanges
-mlir::intrange::inferMul(ArrayRef<ConstantIntRanges> argRanges) {
+mlir::intrange::inferMul(ArrayRef<ConstantIntRanges> argRanges,
+ OverflowFlags ovfFlags) {
const ConstantIntRanges &lhs = argRanges[0], &rhs = argRanges[1];
- ConstArithFn umul = [](const APInt &a,
- const APInt &b) -> std::optional<APInt> {
+ std::function umul = [=](const APInt &a,
+ const APInt &b) -> std::optional<APInt> {
bool overflowed = false;
- APInt result = a.umul_ov(b, overflowed);
+ APInt result = any(ovfFlags & OverflowFlags::Nuw)
+ ? a.umul_sat(b)
+ : a.umul_ov(b, overflowed);
return overflowed ? std::optional<APInt>() : result;
};
- ConstArithFn smul = [](const APInt &a,
- const APInt &b) -> std::optional<APInt> {
+ std::function smul = [=](const APInt &a,
+ const APInt &b) -> std::optional<APInt> {
bool overflowed = false;
- APInt result = a.smul_ov(b, overflowed);
+ APInt result = any(ovfFlags & OverflowFlags::Nsw)
+ ? a.smul_sat(b)
+ : a.smul_ov(b, overflowed);
return overflowed ? std::optional<APInt>() : result;
};
@@ -542,32 +558,35 @@ mlir::intrange::inferXor(ArrayRef<ConstantIntRanges> argRanges) {
//===----------------------------------------------------------------------===//
ConstantIntRanges
-mlir::intrange::inferShl(ArrayRef<ConstantIntRanges> argRanges) {
+mlir::intrange::inferShl(ArrayRef<ConstantIntRanges> argRanges,
+ OverflowFlags ovfFlags) {
const ConstantIntRanges &lhs = argRanges[0], &rhs = argRanges[1];
- const APInt &lhsSMin = lhs.smin(), &lhsSMax = lhs.smax(),
- &lhsUMax = lhs.umax(), &rhsUMin = rhs.umin(),
- &rhsUMax = rhs.umax();
+ const APInt &rhsUMin = rhs.umin(), &rhsUMax = rhs.umax();
- ConstArithFn shl = [](const APInt &l,
- const APInt &r) -> std::optional<APInt> {
- return r.uge(r.getBitWidth()) ? std::optional<APInt>() : l.shl(r);
+ // The signed/unsigned overflow behavior of shl by `rhs` matches a mul with
+ // 2^rhs.
+ std::function ushl = [=](const APInt &l,
+ const APInt &r) -> std::optional<APInt> {
+ bool overflowed = false;
+ APInt result = any(ovfFlags & OverflowFlags::Nuw)
+ ? l.ushl_sat(r)
+ : l.ushl_ov(r, overflowed);
+ return overflowed ? std::optional<APInt>() : result;
+ };
+ std::function sshl = [=](const APInt &l,
+ const APInt &r) -> std::optional<APInt> {
+ bool overflowed = false;
+ APInt result = any(ovfFlags & OverflowFlags::Nsw)
+ ? l.sshl_sat(r)
+ : l.sshl_ov(r, overflowed);
+ return overflowed ? std::optional<APInt>() : result;
};
-
- // The minMax inference does not work when there is danger of overflow. In the
- // signed case, this leads to the obvious problem that the sign bit might
- // change. In the unsigned case, it also leads to problems because the largest
- // LHS shifted by the largest RHS does not necessarily result in the largest
- // result anymore.
- assert(rhsUMax.isNonNegative() && "Unexpected negative shift count");
- if (rhsUMax.uge(lhsSMin.getNumSignBits()) ||
- rhsUMax.uge(lhsSMax.getNumSignBits()))
- return ConstantIntRanges::maxRange(lhsUMax.getBitWidth());
ConstantIntRanges urange =
- minMaxBy(shl, {lhs.umin(), lhsUMax}, {rhsUMin, rhsUMax},
+ minMaxBy(ushl, {lhs.umin(), lhs.umax()}, {rhsUMin, rhsUMax},
/*isSigned=*/false);
ConstantIntRanges srange =
- minMaxBy(shl, {lhsSMin, lhsSMax}, {rhsUMin, rhsUMax},
+ minMaxBy(sshl, {lhs.smin(), lhs.smax()}, {rhsUMin, rhsUMax},
/*isSigned=*/true);
return urange.intersection(srange);
}
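To make the new overflow-flag parameter concrete, a small, hypothetical check of inferAdd is sketched below. ConstantIntRanges::fromSigned, the OverflowFlags::None spelling, its placement in the mlir::intrange namespace, and the stream operator for ConstantIntRanges are assumed from surrounding MLIR headers that this hunk does not show.

// Sketch: with nsw, a signed add that would overflow saturates, so the
// inferred range stays tight instead of widening to the full i8 range.
#include "mlir/Interfaces/Utils/InferIntRangeCommon.h"
#include "llvm/Support/raw_ostream.h"

void demoInferAddNsw() {
  using namespace mlir;
  auto i8 = [](int64_t v) { return llvm::APInt(8, v, /*isSigned=*/true); };
  ConstantIntRanges lhs = ConstantIntRanges::fromSigned(i8(100), i8(120));
  ConstantIntRanges rhs = ConstantIntRanges::fromSigned(i8(10), i8(20));
  // No flags: 120 + 20 overflows i8, so the signed bound falls back to wide.
  ConstantIntRanges plain =
      intrange::inferAdd({lhs, rhs}, intrange::OverflowFlags::None);
  // nsw: sadd_sat clamps at 127, giving roughly [110, 127] signed.
  ConstantIntRanges nsw =
      intrange::inferAdd({lhs, rhs}, intrange::OverflowFlags::Nsw);
  llvm::outs() << plain << "\n" << nsw << "\n";
}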
diff --git a/mlir/lib/Pass/IRPrinting.cpp b/mlir/lib/Pass/IRPrinting.cpp
index 72b94ee..a12bdd9 100644
--- a/mlir/lib/Pass/IRPrinting.cpp
+++ b/mlir/lib/Pass/IRPrinting.cpp
@@ -9,8 +9,12 @@
#include "PassDetail.h"
#include "mlir/IR/SymbolTable.h"
#include "mlir/Pass/PassManager.h"
-#include "llvm/Support/Format.h"
+#include "mlir/Support/FileUtilities.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/FileSystem.h"
#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/ToolOutputFile.h"
using namespace mlir;
using namespace mlir::detail;
@@ -200,6 +204,149 @@ struct BasicIRPrinterConfig : public PassManager::IRPrinterConfig {
};
} // namespace
+/// Return pairs of (sanitized op name, symbol name) for `op` and all parent
+/// operations. Op names are sanitized by replacing periods with underscores.
+/// The pairs are returned in order of outer-most to inner-most (ancestors of
+/// `op` first, `op` last). This information is used to construct the directory
+/// tree for the `FileTreeIRPrinterConfig` below.
+/// The counter for `op` will be incremented by this call.
+static std::pair<SmallVector<std::pair<std::string, StringRef>>, std::string>
+getOpAndSymbolNames(Operation *op, StringRef passName,
+ llvm::DenseMap<Operation *, unsigned> &counters) {
+ SmallVector<std::pair<std::string, StringRef>> pathElements;
+ SmallVector<unsigned> countPrefix;
+
+ if (!counters.contains(op))
+ counters[op] = -1;
+
+ Operation *iter = op;
+ ++counters[op];
+ while (iter) {
+ countPrefix.push_back(counters[iter]);
+ StringAttr symbolName =
+ iter->getAttrOfType<StringAttr>(SymbolTable::getSymbolAttrName());
+ std::string opName =
+ llvm::join(llvm::split(iter->getName().getStringRef().str(), '.'), "_");
+ pathElements.emplace_back(opName, symbolName ? symbolName.strref()
+ : "no-symbol-name");
+ iter = iter->getParentOp();
+ }
+ // Return in the order of top level (module) down to `op`.
+ std::reverse(countPrefix.begin(), countPrefix.end());
+ std::reverse(pathElements.begin(), pathElements.end());
+
+ std::string passFileName = llvm::formatv(
+ "{0:$[_]}_{1}.mlir",
+ llvm::make_range(countPrefix.begin(), countPrefix.end()), passName);
+
+ return {pathElements, passFileName};
+}
+
+static LogicalResult createDirectoryOrPrintErr(llvm::StringRef dirPath) {
+ if (std::error_code ec =
+ llvm::sys::fs::create_directory(dirPath, /*IgnoreExisting=*/true)) {
+ llvm::errs() << "Error while creating directory " << dirPath << ": "
+ << ec.message() << "\n";
+ return failure();
+ }
+ return success();
+}
+
+/// Creates directories (if required) and opens an output file for the
+/// FileTreeIRPrinterConfig.
+static std::unique_ptr<llvm::ToolOutputFile>
+createTreePrinterOutputPath(Operation *op, llvm::StringRef passArgument,
+ llvm::StringRef rootDir,
+ llvm::DenseMap<Operation *, unsigned> &counters) {
+ // Create the path. We will create a tree rooted at the given 'rootDir'
+ // directory. The root directory will contain folders with the names of
+ // modules. Sub-directories within those folders mirror the nesting
+ // structure of the pass manager, using symbol names for directory names.
+ auto [opAndSymbolNames, fileName] =
+ getOpAndSymbolNames(op, passArgument, counters);
+
+ // Create all the directories, starting at the root. Abort early if we fail to
+ // create any directory.
+ llvm::SmallString<128> path(rootDir);
+ if (failed(createDirectoryOrPrintErr(path)))
+ return nullptr;
+
+ for (auto [opName, symbolName] : opAndSymbolNames) {
+ llvm::sys::path::append(path, opName + "_" + symbolName);
+ if (failed(createDirectoryOrPrintErr(path)))
+ return nullptr;
+ }
+
+ // Open output file.
+ llvm::sys::path::append(path, fileName);
+ std::string error;
+ std::unique_ptr<llvm::ToolOutputFile> file = openOutputFile(path, &error);
+ if (!file) {
+ llvm::errs() << "Error opening output file " << path << ": " << error
+ << "\n";
+ return nullptr;
+ }
+ return file;
+}
+
+namespace {
+/// A configuration that prints the IR before/after each pass to a set of files
+/// in the specified directory. The files are organized into subdirectories that
+/// mirror the nesting structure of the IR.
+struct FileTreeIRPrinterConfig : public PassManager::IRPrinterConfig {
+ FileTreeIRPrinterConfig(
+ std::function<bool(Pass *, Operation *)> shouldPrintBeforePass,
+ std::function<bool(Pass *, Operation *)> shouldPrintAfterPass,
+ bool printModuleScope, bool printAfterOnlyOnChange,
+ bool printAfterOnlyOnFailure, OpPrintingFlags opPrintingFlags,
+ llvm::StringRef treeDir)
+ : IRPrinterConfig(printModuleScope, printAfterOnlyOnChange,
+ printAfterOnlyOnFailure, opPrintingFlags),
+ shouldPrintBeforePass(std::move(shouldPrintBeforePass)),
+ shouldPrintAfterPass(std::move(shouldPrintAfterPass)),
+ treeDir(treeDir) {
+ assert((this->shouldPrintBeforePass || this->shouldPrintAfterPass) &&
+ "expected at least one valid filter function");
+ }
+
+ void printBeforeIfEnabled(Pass *pass, Operation *operation,
+ PrintCallbackFn printCallback) final {
+ if (!shouldPrintBeforePass || !shouldPrintBeforePass(pass, operation))
+ return;
+ std::unique_ptr<llvm::ToolOutputFile> file = createTreePrinterOutputPath(
+ operation, pass->getArgument(), treeDir, counters);
+ if (!file)
+ return;
+ printCallback(file->os());
+ file->keep();
+ }
+
+ void printAfterIfEnabled(Pass *pass, Operation *operation,
+ PrintCallbackFn printCallback) final {
+ if (!shouldPrintAfterPass || !shouldPrintAfterPass(pass, operation))
+ return;
+ std::unique_ptr<llvm::ToolOutputFile> file = createTreePrinterOutputPath(
+ operation, pass->getArgument(), treeDir, counters);
+ if (!file)
+ return;
+ printCallback(file->os());
+ file->keep();
+ }
+
+ /// Filter functions for before and after pass execution.
+ std::function<bool(Pass *, Operation *)> shouldPrintBeforePass;
+ std::function<bool(Pass *, Operation *)> shouldPrintAfterPass;
+
+ /// Directory that should be used as the root of the file tree.
+ std::string treeDir;
+
+ /// Counters used for labeling the prefix. Every op which could be targeted by
+ /// a pass gets its own counter.
+ llvm::DenseMap<Operation *, unsigned> counters;
+};
+
+} // namespace
+
/// Add an instrumentation to print the IR before and after pass execution,
/// using the provided configuration.
void PassManager::enableIRPrinting(std::unique_ptr<IRPrinterConfig> config) {
@@ -223,3 +370,16 @@ void PassManager::enableIRPrinting(
printModuleScope, printAfterOnlyOnChange, printAfterOnlyOnFailure,
opPrintingFlags, out));
}
+
+/// Add an instrumentation to print the IR before and after pass execution.
+void PassManager::enableIRPrintingToFileTree(
+ std::function<bool(Pass *, Operation *)> shouldPrintBeforePass,
+ std::function<bool(Pass *, Operation *)> shouldPrintAfterPass,
+ bool printModuleScope, bool printAfterOnlyOnChange,
+ bool printAfterOnlyOnFailure, StringRef printTreeDir,
+ OpPrintingFlags opPrintingFlags) {
+ enableIRPrinting(std::make_unique<FileTreeIRPrinterConfig>(
+ std::move(shouldPrintBeforePass), std::move(shouldPrintAfterPass),
+ printModuleScope, printAfterOnlyOnChange, printAfterOnlyOnFailure,
+ opPrintingFlags, printTreeDir));
+}
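For illustration (not part of this patch): enabling the new file-tree printer from C++ looks roughly as follows; the directory path and the always-true predicates are placeholders.

// Minimal sketch: dump IR before/after every pass into a directory tree
// rooted at /tmp/ir-dumps (path is a placeholder).
#include "mlir/Pass/PassManager.h"

void enableTreeDumps(mlir::PassManager &pm) {
  auto always = [](mlir::Pass *, mlir::Operation *) { return true; };
  pm.enableIRPrintingToFileTree(
      /*shouldPrintBeforePass=*/always,
      /*shouldPrintAfterPass=*/always,
      /*printModuleScope=*/false,
      /*printAfterOnlyOnChange=*/false,
      /*printAfterOnlyOnFailure=*/false,
      /*printTreeDir=*/"/tmp/ir-dumps",
      /*opPrintingFlags=*/mlir::OpPrintingFlags());
}

The command-line equivalent added in the next hunk is -mlir-print-ir-tree-dir=<dir>; following getOpAndSymbolNames above, dumps land in per-op directories such as builtin_module_<symbol>/func_func_<symbol>/ with file names like 0_0_<pass-argument>.mlir.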
diff --git a/mlir/lib/Pass/PassManagerOptions.cpp b/mlir/lib/Pass/PassManagerOptions.cpp
index ffc53b7..706a21a 100644
--- a/mlir/lib/Pass/PassManagerOptions.cpp
+++ b/mlir/lib/Pass/PassManagerOptions.cpp
@@ -58,6 +58,10 @@ struct PassManagerOptions {
llvm::cl::desc("When printing IR for print-ir-[before|after]{-all} "
"always print the top-level operation"),
llvm::cl::init(false)};
+ llvm::cl::opt<std::string> printTreeDir{
+ "mlir-print-ir-tree-dir",
+      llvm::cl::desc("When printing the IR before/after a pass, print it to "
+                     "a file tree rooted at this directory")};
/// Add an IR printing instrumentation if enabled by any 'print-ir' flags.
void addPrinterInstrumentation(PassManager &pm);
@@ -120,6 +124,13 @@ void PassManagerOptions::addPrinterInstrumentation(PassManager &pm) {
return;
// Otherwise, add the IR printing instrumentation.
+ if (!printTreeDir.empty()) {
+ pm.enableIRPrintingToFileTree(shouldPrintBeforePass, shouldPrintAfterPass,
+ printModuleScope, printAfterChange,
+ printAfterFailure, printTreeDir);
+ return;
+ }
+
pm.enableIRPrinting(shouldPrintBeforePass, shouldPrintAfterPass,
printModuleScope, printAfterChange, printAfterFailure,
llvm::errs());
diff --git a/mlir/lib/Target/LLVM/CMakeLists.txt b/mlir/lib/Target/LLVM/CMakeLists.txt
index e0657c89..5a3fa16 100644
--- a/mlir/lib/Target/LLVM/CMakeLists.txt
+++ b/mlir/lib/Target/LLVM/CMakeLists.txt
@@ -47,7 +47,7 @@ add_mlir_dialect_library(MLIRNVVMTarget
MLIRNVVMToLLVMIRTranslation
)
-if(MLIR_ENABLE_CUDA_CONVERSIONS)
+if ("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD)
# Find the CUDA toolkit.
find_package(CUDAToolkit)
diff --git a/mlir/lib/Target/LLVM/NVVM/Target.cpp b/mlir/lib/Target/LLVM/NVVM/Target.cpp
index e438ce8..e75547f 100644
--- a/mlir/lib/Target/LLVM/NVVM/Target.cpp
+++ b/mlir/lib/Target/LLVM/NVVM/Target.cpp
@@ -13,7 +13,6 @@
#include "mlir/Target/LLVM/NVVM/Target.h"
-#include "mlir/Config/mlir-config.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/Target/LLVM/NVVM/Utils.h"
@@ -158,40 +157,43 @@ SerializeGPUModuleBase::loadBitcodeFiles(llvm::Module &module) {
return std::move(bcFiles);
}
-#if MLIR_ENABLE_CUDA_CONVERSIONS
+#if LLVM_HAS_NVPTX_TARGET
namespace {
class NVPTXSerializer : public SerializeGPUModuleBase {
public:
NVPTXSerializer(Operation &module, NVVMTargetAttr target,
const gpu::TargetOptions &targetOptions);
+ /// Returns the GPU module op being serialized.
gpu::GPUModuleOp getOperation();
- // Compile PTX to cubin using `ptxas`.
+ /// Compiles PTX to cubin using `ptxas`.
std::optional<SmallVector<char, 0>>
compileToBinary(const std::string &ptxCode);
- // Compile PTX to cubin using the `nvptxcompiler` library.
+ /// Compiles PTX to cubin using the `nvptxcompiler` library.
std::optional<SmallVector<char, 0>>
compileToBinaryNVPTX(const std::string &ptxCode);
+ /// Serializes the LLVM module to an object format, depending on the
+ /// compilation target selected in target options.
std::optional<SmallVector<char, 0>>
moduleToObject(llvm::Module &llvmModule) override;
private:
using TmpFile = std::pair<llvm::SmallString<128>, llvm::FileRemover>;
- // Create a temp file.
+ /// Creates a temp file.
std::optional<TmpFile> createTemp(StringRef name, StringRef suffix);
- // Find the `tool` path, where `tool` is the name of the binary to search,
- // i.e. `ptxas` or `fatbinary`. The search order is:
- // 1. The toolkit path in `targetOptions`.
- // 2. In the system PATH.
- // 3. The path from `getCUDAToolkitPath()`.
+ /// Finds the `tool` path, where `tool` is the name of the binary to search,
+ /// i.e. `ptxas` or `fatbinary`. The search order is:
+ /// 1. The toolkit path in `targetOptions`.
+ /// 2. In the system PATH.
+ /// 3. The path from `getCUDAToolkitPath()`.
std::optional<std::string> findTool(StringRef tool);
- // Target options.
+ /// Target options.
gpu::TargetOptions targetOptions;
};
} // namespace
@@ -515,7 +517,7 @@ NVPTXSerializer::compileToBinaryNVPTX(const std::string &ptxCode) {
std::optional<SmallVector<char, 0>>
NVPTXSerializer::moduleToObject(llvm::Module &llvmModule) {
- // Return LLVM IR if the compilation target is offload.
+ // Return LLVM IR if the compilation target is `offload`.
#define DEBUG_TYPE "serialize-to-llvm"
LLVM_DEBUG({
llvm::dbgs() << "LLVM IR for module: " << getOperation().getNameAttr()
@@ -549,7 +551,7 @@ NVPTXSerializer::moduleToObject(llvm::Module &llvmModule) {
});
#undef DEBUG_TYPE
- // Return PTX if the compilation target is assembly.
+ // Return PTX if the compilation target is `assembly`.
if (targetOptions.getCompilationTarget() ==
gpu::CompilationTarget::Assembly) {
// Make sure to include the null terminator.
@@ -564,7 +566,7 @@ NVPTXSerializer::moduleToObject(llvm::Module &llvmModule) {
return compileToBinary(*serializedISA);
#endif // MLIR_ENABLE_NVPTXCOMPILER
}
-#endif // MLIR_ENABLE_CUDA_CONVERSIONS
+#endif // LLVM_HAS_NVPTX_TARGET
std::optional<SmallVector<char, 0>>
NVVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module,
@@ -576,7 +578,7 @@ NVVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module,
module->emitError("Module must be a GPU module.");
return std::nullopt;
}
-#if MLIR_ENABLE_CUDA_CONVERSIONS
+#if LLVM_HAS_NVPTX_TARGET
NVPTXSerializer serializer(*module, cast<NVVMTargetAttr>(attribute), options);
serializer.init();
return serializer.run();
@@ -584,7 +586,7 @@ NVVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module,
module->emitError(
"The `NVPTX` target was not built. Please enable it when building LLVM.");
return std::nullopt;
-#endif // MLIR_ENABLE_CUDA_CONVERSIONS
+#endif // LLVM_HAS_NVPTX_TARGET
}
Attribute
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenACC/OpenACCToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenACC/OpenACCToLLVMIRTranslation.cpp
index eeda245..d9cf85e 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenACC/OpenACCToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenACC/OpenACCToLLVMIRTranslation.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "mlir/Target/LLVMIR/Dialect/OpenACC/OpenACCToLLVMIRTranslation.h"
+#include "mlir/Analysis/TopologicalSortUtils.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/OpenACC/OpenACC.h"
#include "mlir/IR/BuiltinOps.h"
@@ -19,7 +20,6 @@
#include "mlir/Support/LLVM.h"
#include "mlir/Target/LLVMIR/Dialect/OpenMPCommon.h"
#include "mlir/Target/LLVMIR/ModuleTranslation.h"
-#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 34b6903..6ec4c12 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -11,6 +11,7 @@
//
//===----------------------------------------------------------------------===//
#include "mlir/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.h"
+#include "mlir/Analysis/TopologicalSortUtils.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
#include "mlir/Dialect/OpenMP/OpenMPInterfaces.h"
@@ -333,54 +334,6 @@ convertOmpCritical(Operation &opInst, llvm::IRBuilderBase &builder,
return success();
}
-/// Returns a reduction declaration that corresponds to the given reduction
-/// operation in the given container. Currently only supports reductions inside
-/// WsloopOp and ParallelOp but can be easily extended as long as the given
-/// construct implements getNumReductionVars.
-template <typename T>
-static std::optional<omp::DeclareReductionOp>
-findReductionDeclInContainer(T container, omp::ReductionOp reduction) {
- for (unsigned i = 0, e = container.getNumReductionVars(); i < e; ++i) {
- if (container.getReductionVars()[i] != reduction.getAccumulator())
- continue;
-
- SymbolRefAttr reductionSymbol =
- cast<SymbolRefAttr>((*container.getReductions())[i]);
- auto declareOp =
- SymbolTable::lookupNearestSymbolFrom<omp::DeclareReductionOp>(
- container, reductionSymbol);
- return declareOp;
- }
- return std::nullopt;
-}
-
-/// Searches for a reduction in a provided region and the regions
-/// it is nested in
-static omp::DeclareReductionOp findReductionDecl(Operation &containerOp,
- omp::ReductionOp reduction) {
- std::optional<omp::DeclareReductionOp> declareOp = std::nullopt;
- Operation *container = &containerOp;
-
- while (!declareOp.has_value() && container) {
- // Check if current container is supported for reductions searches
- if (auto par = dyn_cast<omp::ParallelOp>(*container)) {
- declareOp = findReductionDeclInContainer(par, reduction);
- } else if (auto loop = dyn_cast<omp::WsloopOp>(*container)) {
- declareOp = findReductionDeclInContainer(loop, reduction);
- } else {
- break;
- }
-
- // See if we can search parent for reductions as well
- container = containerOp.getParentOp();
- }
-
- assert(declareOp.has_value() &&
- "reduction operation must be associated with a declaration");
-
- return *declareOp;
-}
-
/// Populates `reductions` with reduction declarations used in the given loop.
template <typename T>
static void
@@ -1785,62 +1738,6 @@ convertOmpAtomicCapture(omp::AtomicCaptureOp atomicCaptureOp,
return updateGenStatus;
}
-/// Converts an OpenMP reduction operation using OpenMPIRBuilder. Expects the
-/// mapping between reduction variables and their private equivalents to have
-/// been stored on the ModuleTranslation stack. Currently only supports
-/// reduction within WsloopOp and ParallelOp, but can be easily extended.
-static LogicalResult
-convertOmpReductionOp(omp::ReductionOp reductionOp,
- llvm::IRBuilderBase &builder,
- LLVM::ModuleTranslation &moduleTranslation) {
- // Find the declaration that corresponds to the reduction op.
- omp::DeclareReductionOp declaration;
- Operation *reductionParent = reductionOp->getParentOp();
- if (dyn_cast<omp::ParallelOp>(reductionParent) ||
- dyn_cast<omp::WsloopOp>(reductionParent)) {
- declaration = findReductionDecl(*reductionParent, reductionOp);
- } else {
- llvm_unreachable("Unhandled reduction container");
- }
- assert(declaration && "could not find reduction declaration");
-
- // Retrieve the mapping between reduction variables and their private
- // equivalents.
- const DenseMap<Value, llvm::Value *> *reductionVariableMap = nullptr;
- moduleTranslation.stackWalk<OpenMPVarMappingStackFrame>(
- [&](const OpenMPVarMappingStackFrame &frame) {
- if (frame.mapping.contains(reductionOp.getAccumulator())) {
- reductionVariableMap = &frame.mapping;
- return WalkResult::interrupt();
- }
- return WalkResult::advance();
- });
- assert(reductionVariableMap && "couldn't find private reduction variables");
- // Translate the reduction operation by emitting the body of the corresponding
- // reduction declaration.
- Region &reductionRegion = declaration.getReductionRegion();
- llvm::Value *privateReductionVar =
- reductionVariableMap->lookup(reductionOp.getAccumulator());
- llvm::Value *reductionVal = builder.CreateLoad(
- moduleTranslation.convertType(reductionOp.getOperand().getType()),
- privateReductionVar);
-
- moduleTranslation.mapValue(reductionRegion.front().getArgument(0),
- reductionVal);
- moduleTranslation.mapValue(
- reductionRegion.front().getArgument(1),
- moduleTranslation.lookupValue(reductionOp.getOperand()));
-
- SmallVector<llvm::Value *> phis;
- if (failed(inlineConvertOmpRegions(reductionRegion, "omp.reduction.body",
- builder, moduleTranslation, &phis)))
- return failure();
- assert(phis.size() == 1 && "expected one value to be yielded from "
- "the reduction body declaration region");
- builder.CreateStore(phis[0], privateReductionVar);
- return success();
-}
-
/// Converts an OpenMP Threadprivate operation into LLVM IR using
/// OpenMPIRBuilder.
static LogicalResult
@@ -3349,9 +3246,6 @@ convertHostOrTargetOperation(Operation *op, llvm::IRBuilderBase &builder,
.Case([&](omp::ParallelOp op) {
return convertOmpParallel(op, builder, moduleTranslation);
})
- .Case([&](omp::ReductionOp reductionOp) {
- return convertOmpReductionOp(reductionOp, builder, moduleTranslation);
- })
.Case([&](omp::MasterOp) {
return convertOmpMaster(*op, builder, moduleTranslation);
})
diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
index cf3257c..1ec0736 100644
--- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
@@ -16,6 +16,7 @@
#include "AttrKindDetail.h"
#include "DebugTranslation.h"
#include "LoopAnnotationTranslation.h"
+#include "mlir/Analysis/TopologicalSortUtils.h"
#include "mlir/Dialect/DLTI/DLTI.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMInterfaces.h"
@@ -33,7 +34,6 @@
#include "mlir/Support/LogicalResult.h"
#include "mlir/Target/LLVMIR/LLVMTranslationInterface.h"
#include "mlir/Target/LLVMIR/TypeToLLVM.h"
-#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
diff --git a/mlir/lib/Transforms/Mem2Reg.cpp b/mlir/lib/Transforms/Mem2Reg.cpp
index e2e240a..a452cc3 100644
--- a/mlir/lib/Transforms/Mem2Reg.cpp
+++ b/mlir/lib/Transforms/Mem2Reg.cpp
@@ -9,6 +9,7 @@
#include "mlir/Transforms/Mem2Reg.h"
#include "mlir/Analysis/DataLayoutAnalysis.h"
#include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/Analysis/TopologicalSortUtils.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Dominance.h"
#include "mlir/IR/PatternMatch.h"
@@ -16,7 +17,6 @@
#include "mlir/Interfaces/ControlFlowInterfaces.h"
#include "mlir/Interfaces/MemorySlotInterfaces.h"
#include "mlir/Transforms/Passes.h"
-#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/GenericIteratedDominanceFrontier.h"
diff --git a/mlir/lib/Transforms/SROA.cpp b/mlir/lib/Transforms/SROA.cpp
index 67cbade..39f7256 100644
--- a/mlir/lib/Transforms/SROA.cpp
+++ b/mlir/lib/Transforms/SROA.cpp
@@ -9,6 +9,7 @@
#include "mlir/Transforms/SROA.h"
#include "mlir/Analysis/DataLayoutAnalysis.h"
#include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/Analysis/TopologicalSortUtils.h"
#include "mlir/Interfaces/MemorySlotInterfaces.h"
#include "mlir/Transforms/Passes.h"
diff --git a/mlir/lib/Transforms/TopologicalSort.cpp b/mlir/lib/Transforms/TopologicalSort.cpp
index 1219968..528f6ef 100644
--- a/mlir/lib/Transforms/TopologicalSort.cpp
+++ b/mlir/lib/Transforms/TopologicalSort.cpp
@@ -8,8 +8,8 @@
#include "mlir/Transforms/Passes.h"
+#include "mlir/Analysis/TopologicalSortUtils.h"
#include "mlir/IR/RegionKindInterface.h"
-#include "mlir/Transforms/TopologicalSortUtils.h"
namespace mlir {
#define GEN_PASS_DEF_TOPOLOGICALSORT
diff --git a/mlir/lib/Transforms/Utils/CMakeLists.txt b/mlir/lib/Transforms/Utils/CMakeLists.txt
index d6aac0e..b5788c6 100644
--- a/mlir/lib/Transforms/Utils/CMakeLists.txt
+++ b/mlir/lib/Transforms/Utils/CMakeLists.txt
@@ -10,7 +10,6 @@ add_mlir_library(MLIRTransformUtils
LoopInvariantCodeMotionUtils.cpp
OneToNTypeConversion.cpp
RegionUtils.cpp
- TopologicalSortUtils.cpp
ADDITIONAL_HEADER_DIRS
${MLIR_MAIN_INCLUDE_DIR}/mlir/Transforms
diff --git a/mlir/lib/Transforms/Utils/RegionUtils.cpp b/mlir/lib/Transforms/Utils/RegionUtils.cpp
index 192f59b..b5e641d 100644
--- a/mlir/lib/Transforms/Utils/RegionUtils.cpp
+++ b/mlir/lib/Transforms/Utils/RegionUtils.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "mlir/Transforms/RegionUtils.h"
+#include "mlir/Analysis/TopologicalSortUtils.h"
#include "mlir/IR/Block.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/Operation.h"
@@ -15,11 +16,9 @@
#include "mlir/IR/Value.h"
#include "mlir/Interfaces/ControlFlowInterfaces.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
-#include "mlir/Transforms/TopologicalSortUtils.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/SmallSet.h"
#include <deque>
@@ -836,19 +835,3 @@ LogicalResult mlir::simplifyRegions(RewriterBase &rewriter,
return success(eliminatedBlocks || eliminatedOpsOrArgs ||
mergedIdenticalBlocks);
}
-
-SetVector<Block *> mlir::getBlocksSortedByDominance(Region &region) {
- // For each block that has not been visited yet (i.e. that has no
- // predecessors), add it to the list as well as its successors.
- SetVector<Block *> blocks;
- for (Block &b : region) {
- if (blocks.count(&b) == 0) {
- llvm::ReversePostOrderTraversal<Block *> traversal(&b);
- blocks.insert(traversal.begin(), traversal.end());
- }
- }
- assert(blocks.size() == region.getBlocks().size() &&
- "some blocks are not sorted");
-
- return blocks;
-}
diff --git a/mlir/lib/Transforms/ViewOpGraph.cpp b/mlir/lib/Transforms/ViewOpGraph.cpp
index c2eb2b8..b3c0a06 100644
--- a/mlir/lib/Transforms/ViewOpGraph.cpp
+++ b/mlir/lib/Transforms/ViewOpGraph.cpp
@@ -8,12 +8,12 @@
#include "mlir/Transforms/ViewOpGraph.h"
+#include "mlir/Analysis/TopologicalSortUtils.h"
#include "mlir/IR/Block.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Operation.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Support/IndentedOstream.h"
-#include "mlir/Transforms/TopologicalSortUtils.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/GraphWriter.h"
#include <map>
diff --git a/mlir/python/mlir/dialects/linalg/__init__.py b/mlir/python/mlir/dialects/linalg/__init__.py
index 6e4cb1b..8fb1227 100644
--- a/mlir/python/mlir/dialects/linalg/__init__.py
+++ b/mlir/python/mlir/dialects/linalg/__init__.py
@@ -55,7 +55,6 @@ from .._linalg_enum_gen import *
# TODO: guard against surprises and fail to create Runtime Custom Ops with
# the same name as existing Core Named Ops.
from .opdsl.ops.core_named_ops import *
-from .opdsl.lang.emitter import isa
from ...ir import *
from .._ods_common import get_op_result_or_value as _get_op_result_or_value
@@ -71,7 +70,7 @@ def transpose(
if len(outs) > 1:
raise ValueError(f"{outs=} must have length 1.")
init = _get_op_result_or_value(outs[0])
- result_types = [init.type] if isa(RankedTensorType, init.type) else []
+ result_types = [init.type] if isinstance(init.type, RankedTensorType) else []
op = TransposeOp(
result=result_types,
@@ -93,7 +92,7 @@ def broadcast(
if len(outs) > 1:
raise ValueError(f"{outs=} must have length 1.")
init = _get_op_result_or_value(outs[0])
- result_types = [init.type] if isa(RankedTensorType, init.type) else []
+ result_types = [init.type] if isinstance(init.type, RankedTensorType) else []
op = BroadcastOp(
result=result_types,
diff --git a/mlir/python/mlir/dialects/linalg/opdsl/lang/emitter.py b/mlir/python/mlir/dialects/linalg/opdsl/lang/emitter.py
index 845b533..254458a 100644
--- a/mlir/python/mlir/dialects/linalg/opdsl/lang/emitter.py
+++ b/mlir/python/mlir/dialects/linalg/opdsl/lang/emitter.py
@@ -31,14 +31,6 @@ __all__ = [
ValueList = Union[Sequence[Value], OpResultList]
-def isa(cls: Type, ty: Type):
- try:
- cls(ty)
- return True
- except ValueError:
- return False
-
-
def prepare_common_structured_op(
op_config: LinalgStructuredOpConfig,
*ins: Value,
@@ -127,7 +119,7 @@ def prepare_common_structured_op(
op_config, in_arg_defs, ins, out_arg_defs, outs
)
- result_types = [t for t in out_types if isa(RankedTensorType, t)]
+ result_types = [t for t in out_types if isinstance(t, RankedTensorType)]
# Initialize the type dictionary with the predefined types.
type_mapping = dict() # type: Dict[str, Type]
diff --git a/mlir/test/Analysis/DataFlow/test-next-access.mlir b/mlir/test/Analysis/DataFlow/test-next-access.mlir
index 8825c69..700a23a 100644
--- a/mlir/test/Analysis/DataFlow/test-next-access.mlir
+++ b/mlir/test/Analysis/DataFlow/test-next-access.mlir
@@ -63,7 +63,7 @@ func.func @branch(%arg0: memref<f32>, %arg1: f32, %arg2: i1) -> f32 {
return %phi : f32
}
-// CHECK-LABEL @dead_branch
+// CHECK-LABEL: @dead_branch
func.func @dead_branch(%arg0: memref<f32>, %arg1: f32) -> f32 {
// CHECK: name = "store"
// CHECK-SAME: next_access = ["unknown", ["load 2"]]
@@ -191,7 +191,7 @@ func.func @loop_cf(%arg0: memref<?xf32>, %arg1: f32, %arg2: index, %arg3: index,
return %8 : f32
}
-// CHECK-LABEL @conditional_cf
+// CHECK-LABEL: @conditional_cf
func.func @conditional_cf(%arg0: i1, %arg1: memref<f32>) {
// CHECK: name = "pre"
// CHECK-SAME: next_access = {{\[}}["then", "post"]]
diff --git a/mlir/test/Analysis/test-liveness.mlir b/mlir/test/Analysis/test-liveness.mlir
index 8ae3d09a..61a1e5f 100644
--- a/mlir/test/Analysis/test-liveness.mlir
+++ b/mlir/test/Analysis/test-liveness.mlir
@@ -493,3 +493,27 @@ func.func @nested_region3(
}
return %1 : i32
}
+
+// -----
+
+// CHECK-LABEL: Testing : nested_region4
+
+func.func @nested_region4(%arg0: index, %arg1: index, %arg2: index) {
+ // CHECK: Block: 0
+ // CHECK-NEXT: LiveIn:{{ *$}}
+ // CHECK-NEXT: LiveOut:{{ *$}}
+
+ // CHECK: {{^// +}}[[VAL3:[a-z0-9_]+]]{{ *:}}
+ // CHECK: {{^// +}}[[VAL4:[a-z0-9_]+]]{{ *:}}
+ %c0_i32 = arith.constant 0 : i32
+ %c1_i32 = arith.constant 1 : i32
+
+ %0 = scf.for %arg3 = %arg0 to %arg1 step %arg2 iter_args(%arg4 = %c0_i32) -> (i32) {
+ // CHECK: Block: 1
+ // CHECK-NEXT: LiveIn: [[VAL4]]{{ *$}}
+ // CHECK-NEXT: LiveOut:{{ *$}}
+ %1 = arith.addi %arg4, %c1_i32 : i32
+ scf.yield %1 : i32
+ }
+ return
+}
diff --git a/mlir/test/Analysis/test-topoligical-sort.mlir b/mlir/test/Analysis/test-topoligical-sort.mlir
index 8608586..150aff8 100644
--- a/mlir/test/Analysis/test-topoligical-sort.mlir
+++ b/mlir/test/Analysis/test-topoligical-sort.mlir
@@ -1,21 +1,38 @@
-// RUN: mlir-opt %s -pass-pipeline="builtin.module(func.func(test-print-topological-sort))" 2>&1 | FileCheck %s
+// RUN: mlir-opt %s -pass-pipeline="builtin.module(func.func(test-print-topological-sort))" --split-input-file | FileCheck %s
-// CHECK-LABEL: Testing : region
-// CHECK: arith.addi {{.*}} : index
-// CHECK-NEXT: scf.for
-// CHECK: } {__test_sort_original_idx__ = 2 : i64}
-// CHECK-NEXT: arith.addi {{.*}} : i32
-// CHECK-NEXT: arith.subi {{.*}} : i32
-func.func @region(
- %arg0 : index, %arg1 : index, %arg2 : index, %arg3 : index,
- %arg4 : i32, %arg5 : i32, %arg6 : i32,
- %buffer : memref<i32>) {
- %0 = arith.addi %arg4, %arg5 {__test_sort_original_idx__ = 0} : i32
- %idx = arith.addi %arg0, %arg1 {__test_sort_original_idx__ = 3} : index
- scf.for %arg7 = %idx to %arg2 step %arg3 {
- %2 = arith.addi %0, %arg5 : i32
- %3 = arith.subi %2, %arg6 {__test_sort_original_idx__ = 1} : i32
- memref.store %3, %buffer[] : memref<i32>
- } {__test_sort_original_idx__ = 2}
+// CHECK-LABEL: single_element
+func.func @single_element() {
+ // CHECK: test_sort_index = 0
+ return {test_to_sort}
+}
+
+// -----
+
+// CHECK-LABEL: @simple_region
+func.func @simple_region(%cond: i1) {
+ // CHECK: test_sort_index = 0
+ %0 = arith.constant {test_to_sort} 42 : i32
+ scf.if %cond {
+ %1 = arith.addi %0, %0 : i32
+ // CHECK: test_sort_index = 2
+ %2 = arith.subi %0, %1 {test_to_sort} : i32
+ // CHECK: test_sort_index = 1
+ } {test_to_sort}
+ return
+}
+
+// -----
+
+// CHECK-LABEL: @multi_region
+func.func @multi_region(%cond: i1) {
+ scf.if %cond {
+ // CHECK: test_sort_index = 0
+ %0 = arith.constant {test_to_sort} 42 : i32
+ }
+
+ scf.if %cond {
+ // CHECK: test_sort_index = 1
+ %0 = arith.constant {test_to_sort} 24 : i32
+ }
return
}
diff --git a/mlir/test/Transforms/test-toposort.mlir b/mlir/test/Analysis/test-toposort.mlir
index c47b885..c47b885 100644
--- a/mlir/test/Transforms/test-toposort.mlir
+++ b/mlir/test/Analysis/test-toposort.mlir
diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt
index 8806a1d..be0b26e 100644
--- a/mlir/test/CMakeLists.txt
+++ b/mlir/test/CMakeLists.txt
@@ -67,8 +67,8 @@ endif()
llvm_canonicalize_cmake_booleans(
LLVM_BUILD_EXAMPLES
+ LLVM_HAS_NVPTX_TARGET
MLIR_ENABLE_BINDINGS_PYTHON
- MLIR_ENABLE_CUDA_CONVERSIONS
MLIR_ENABLE_CUDA_RUNNER
MLIR_ENABLE_ROCM_CONVERSIONS
MLIR_ENABLE_ROCM_RUNNER
diff --git a/mlir/test/Conversion/ArithToEmitC/arith-to-emitc-unsupported.mlir b/mlir/test/Conversion/ArithToEmitC/arith-to-emitc-unsupported.mlir
index 66dfa8f..97e4593 100644
--- a/mlir/test/Conversion/ArithToEmitC/arith-to-emitc-unsupported.mlir
+++ b/mlir/test/Conversion/ArithToEmitC/arith-to-emitc-unsupported.mlir
@@ -63,3 +63,10 @@ func.func @arith_cast_fptoui_i1(%arg0: f32) -> i1 {
return %t: i1
}
+// -----
+
+func.func @arith_extsi_i1_to_i32(%arg0: i1) {
+ // expected-error @+1 {{failed to legalize operation 'arith.extsi'}}
+ %idx = arith.extsi %arg0 : i1 to i32
+ return
+}
diff --git a/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir b/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir
index 79fecd6..b453b69 100644
--- a/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir
+++ b/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir
@@ -177,3 +177,66 @@ func.func @arith_int_to_float_cast_ops(%arg0: i8, %arg1: i64) {
return
}
+
+// -----
+
+func.func @arith_trunci(%arg0: i32) -> i8 {
+ // CHECK-LABEL: arith_trunci
+ // CHECK-SAME: (%[[Arg0:[^ ]*]]: i32)
+ // CHECK: %[[CastUI:.*]] = emitc.cast %[[Arg0]] : i32 to ui32
+ // CHECK: %[[Trunc:.*]] = emitc.cast %[[CastUI]] : ui32 to ui8
+ // CHECK: emitc.cast %[[Trunc]] : ui8 to i8
+ %truncd = arith.trunci %arg0 : i32 to i8
+
+ return %truncd : i8
+}
+
+// -----
+
+func.func @arith_trunci_to_i1(%arg0: i32) -> i1 {
+ // CHECK-LABEL: arith_trunci_to_i1
+ // CHECK-SAME: (%[[Arg0:[^ ]*]]: i32)
+ // CHECK: %[[Const:.*]] = "emitc.constant"
+ // CHECK-SAME: value = 1
+ // CHECK: %[[And:.*]] = emitc.bitwise_and %[[Arg0]], %[[Const]] : (i32, i32) -> i32
+ // CHECK: emitc.cast %[[And]] : i32 to i1
+ %truncd = arith.trunci %arg0 : i32 to i1
+
+ return %truncd : i1
+}
+
+// -----
+
+func.func @arith_extsi(%arg0: i32) {
+ // CHECK-LABEL: arith_extsi
+ // CHECK-SAME: ([[Arg0:[^ ]*]]: i32)
+ // CHECK: emitc.cast [[Arg0]] : i32 to i64
+ %extd = arith.extsi %arg0 : i32 to i64
+
+ return
+}
+
+// -----
+
+func.func @arith_extui(%arg0: i32) {
+ // CHECK-LABEL: arith_extui
+ // CHECK-SAME: (%[[Arg0:[^ ]*]]: i32)
+ // CHECK: %[[Conv0:.*]] = emitc.cast %[[Arg0]] : i32 to ui32
+ // CHECK: %[[Conv1:.*]] = emitc.cast %[[Conv0]] : ui32 to ui64
+ // CHECK: emitc.cast %[[Conv1]] : ui64 to i64
+ %extd = arith.extui %arg0 : i32 to i64
+
+ return
+}
+
+// -----
+
+func.func @arith_extui_i1_to_i32(%arg0: i1) {
+ // CHECK-LABEL: arith_extui_i1_to_i32
+ // CHECK-SAME: (%[[Arg0:[^ ]*]]: i1)
+ // CHECK: %[[Conv0:.*]] = emitc.cast %[[Arg0]] : i1 to ui1
+ // CHECK: %[[Conv1:.*]] = emitc.cast %[[Conv0]] : ui1 to ui32
+ // CHECK: emitc.cast %[[Conv1]] : ui32 to i32
+ %idx = arith.extui %arg0 : i1 to i32
+ return
+}
diff --git a/mlir/test/Conversion/BufferizationToMemRef/bufferization-to-memref.mlir b/mlir/test/Conversion/BufferizationToMemRef/bufferization-to-memref.mlir
index 1eb387ce..f58a2af 100644
--- a/mlir/test/Conversion/BufferizationToMemRef/bufferization-to-memref.mlir
+++ b/mlir/test/Conversion/BufferizationToMemRef/bufferization-to-memref.mlir
@@ -79,7 +79,7 @@ func.func @conversion_dealloc_simple(%arg0: memref<2xf32>, %arg1: i1) {
return
}
-// CHECk: scf.if [[ARG1]] {
-// CHECk-NEXT: memref.dealloc [[ARG0]] : memref<2xf32>
-// CHECk-NEXT: }
-// CHECk-NEXT: return
+// CHECK: scf.if [[ARG1]] {
+// CHECK-NEXT: memref.dealloc [[ARG0]] : memref<2xf32>
+// CHECK-NEXT: }
+// CHECK-NEXT: return
diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
index dbf8ead..1b046d3 100644
--- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
@@ -778,11 +778,11 @@ func.func @create_tensor_map(%devicePtr2d : memref<64x128xf32>, %devicePtr1d : m
%crd0 = arith.constant 64 : index
%crd1 = arith.constant 128 : index
%devicePtr2d_unranked = memref.cast %devicePtr2d : memref<64x128xf32> to memref<*xf32>
- // CHECK : llvm.call @mgpuTensorMapEncodeTiledMemref
+ // CHECK: llvm.call @mgpuTensorMapEncodeTiledMemref
%tensorMap2d = nvgpu.tma.create.descriptor %devicePtr2d_unranked box[%crd0, %crd1] : memref<*xf32> -> !tensorMap2d
%devicePtr1d_unranked = memref.cast %devicePtr1d : memref<128xf32> to memref<*xf32>
- // CHECK : llvm.call @mgpuTensorMapEncodeTiledMemref
+ // CHECK: llvm.call @mgpuTensorMapEncodeTiledMemref
%tensorMap1d = nvgpu.tma.create.descriptor %devicePtr1d_unranked box[%crd1] : memref<*xf32> -> !tensorMap1d
func.return
}
diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
index 1d56ca9..21947c2 100644
--- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
+++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
@@ -17,7 +17,7 @@ llvm.func @init_mbarrier(%barrier_gen : !llvm.ptr, %barrier : !llvm.ptr<3>, %cou
llvm.func @init_mbarrier_arrive_expect_tx(%barrier : !llvm.ptr<3>, %txcount : i32, %pred : i1) {
//CHECK: llvm.inline_asm has_side_effects asm_dialect = att "mbarrier.arrive.expect_tx.shared.b64 _, [$0], $1;", "r,r"
nvvm.mbarrier.arrive.expect_tx.shared %barrier, %txcount : !llvm.ptr<3>, i32
- //CHECK : llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.arrive.expect_tx.shared.b64 _, [$0], $1;", "r,r,b "
+ //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.arrive.expect_tx.shared.b64 _, [$0], $1;", "r,r,b"
nvvm.mbarrier.arrive.expect_tx.shared %barrier, %txcount, predicate = %pred : !llvm.ptr<3>, i32, i1
llvm.return
}
@@ -129,7 +129,7 @@ func.func @tma_load_5d_all(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %bar
func.func @tma_load_1d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %p : i1) {
// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2} ], [$3];", "r,l,r,r"
nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0] : !llvm.ptr<3>, !llvm.ptr
- // CHECK : llvm.inline_asm has_side_effects asm_dialect = att "@$4 cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2} ], [$3];", "l,r,r,r,b"
+ // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$4 cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2} ], [$3];", "r,l,r,r,b"
nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0] predicate=%p : !llvm.ptr<3>, !llvm.ptr
return
}
@@ -138,7 +138,7 @@ func.func @tma_load_1d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier
func.func @tma_load_2d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %p : i1) {
// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3} ], [$4];", "r,l,r,r,r"
nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1] : !llvm.ptr<3>, !llvm.ptr
- // CHECK : llvm.inline_asm has_side_effects asm_dialect = att "@$5 cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3} ], [$4];", "l,r,r,r,r,b"
+ // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$5 cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3} ], [$4];", "r,l,r,r,r,b"
nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1] predicate=%p : !llvm.ptr<3>, !llvm.ptr
return
}
@@ -147,7 +147,7 @@ func.func @tma_load_2d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier
func.func @tma_load_3d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %p : i1) {
// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4} ], [$5];", "r,l,r,r,r,r"
nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2] : !llvm.ptr<3>, !llvm.ptr
- // CHECK : llvm.inline_asm has_side_effects asm_dialect = att "@$6 cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4}], [$5];", "l,r,r,r,r,r,b"
+ // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$6 cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4} ], [$5];", "r,l,r,r,r,r,b"
nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2] predicate=%p : !llvm.ptr<3>, !llvm.ptr
return
}
@@ -156,7 +156,7 @@ func.func @tma_load_3d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier
func.func @tma_load_4d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %p : i1) {
// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4,$5} ], [$6];", "r,l,r,r,r,r,r"
nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2,%crd3] : !llvm.ptr<3>, !llvm.ptr
- // CHECK : llvm.inline_asm has_side_effects asm_dialect = att "@$7 cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4,$5}], [$6];", "l,r,r,r,r,r,r,b"
+ // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$7 cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4,$5} ], [$6];", "r,l,r,r,r,r,r,b"
nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2,%crd3] predicate=%p : !llvm.ptr<3>, !llvm.ptr
return
}
@@ -165,7 +165,7 @@ func.func @tma_load_4d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier
func.func @tma_load_5d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32, %p : i1) {
// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4,$5,$6} ], [$7];", "r,l,r,r,r,r,r,r"
nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2,%crd3,%crd4] : !llvm.ptr<3>, !llvm.ptr
- // CHECK : llvm.inline_asm has_side_effects asm_dialect = att "@$8 cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4,$5,$6}], [$7];", "l,r,r,r,r,r,r,r,b"
+ // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$8 cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4,$5,$6} ], [$7];", "r,l,r,r,r,r,r,r,b"
nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2,%crd3,%crd4] predicate=%p : !llvm.ptr<3>, !llvm.ptr
return
}
diff --git a/mlir/test/Conversion/PDLToPDLInterp/pdl-to-pdl-interp-matcher.mlir b/mlir/test/Conversion/PDLToPDLInterp/pdl-to-pdl-interp-matcher.mlir
index 92afb76..ed6407a 100644
--- a/mlir/test/Conversion/PDLToPDLInterp/pdl-to-pdl-interp-matcher.mlir
+++ b/mlir/test/Conversion/PDLToPDLInterp/pdl-to-pdl-interp-matcher.mlir
@@ -588,7 +588,7 @@ module @variadic_results_all {
// CHECK-DAG: %[[OPS:.*]] = pdl_interp.get_users of %[[VAL0]] : !pdl.value
// CHECK-DAG: pdl_interp.foreach %[[OP:.*]] : !pdl.operation in %[[OPS]]
// CHECK-DAG: %[[OPERANDS:.*]] = pdl_interp.get_operands of %[[OP]]
- // CHECK-DAG pdl_interp.are_equal %[[VALS]], %[[OPERANDS]] -> ^{{.*}}, ^[[CONTINUE:.*]]
+ // CHECK-DAG: pdl_interp.are_equal %[[OPERANDS]], %[[VALS]] : !pdl.range<value> -> ^{{.*}}, ^[[CONTINUE:.*]]
// CHECK-DAG: pdl_interp.is_not_null %[[OP]]
// CHECK-DAG: pdl_interp.check_result_count of %[[OP]] is 0
pdl.pattern @variadic_results_all : benefit(1) {
@@ -701,7 +701,7 @@ module @common_connector {
// CHECK-DAG: pdl_interp.are_equal %[[ROOTA_OP]], %[[VAL0]] : !pdl.value
// CHECK-DAG: %[[ROOTB_OP:.*]] = pdl_interp.get_operand 0 of %[[ROOTB]]
// CHECK-DAG: pdl_interp.are_equal %[[ROOTB_OP]], %[[VAL0]] : !pdl.value
- // CHECK-DAG } -> ^[[CONTA:.*]]
+ // CHECK-DAG: } -> ^[[CONTA:.*]]
pdl.pattern @common_connector : benefit(1) {
%type = type
%op = operation -> (%type, %type : !pdl.type, !pdl.type)
@@ -742,7 +742,7 @@ module @common_connector_range {
// CHECK-DAG: pdl_interp.are_equal %[[ROOTA_OPS]], %[[VALS0]] : !pdl.range<value>
// CHECK-DAG: %[[ROOTB_OPS:.*]] = pdl_interp.get_operands of %[[ROOTB]]
// CHECK-DAG: pdl_interp.are_equal %[[ROOTB_OPS]], %[[VALS0]] : !pdl.range<value>
- // CHECK-DAG } -> ^[[CONTA:.*]]
+ // CHECK-DAG: } -> ^[[CONTA:.*]]
pdl.pattern @common_connector_range : benefit(1) {
%types = types
%op = operation -> (%types, %types : !pdl.range<type>, !pdl.range<type>)
diff --git a/mlir/test/Conversion/SPIRVToLLVM/spirv-storage-class-mapping.mlir b/mlir/test/Conversion/SPIRVToLLVM/spirv-storage-class-mapping.mlir
index b9c56a3..980406d 100644
--- a/mlir/test/Conversion/SPIRVToLLVM/spirv-storage-class-mapping.mlir
+++ b/mlir/test/Conversion/SPIRVToLLVM/spirv-storage-class-mapping.mlir
@@ -91,5 +91,5 @@ spirv.func @pointerCodeSectionINTEL(!spirv.ptr<i1, CodeSectionINTEL>) "None"
spirv.func @pointerDeviceOnlyINTEL(!spirv.ptr<i1, DeviceOnlyINTEL>) "None"
// CHECK-OPENCL: llvm.func @pointerHostOnlyINTEL(!llvm.ptr<6>)
-// CHECK-UNKOWN: llvm.func @pointerHostOnlyINTEL(!llvm.ptr)
+// CHECK-UNKNOWN: llvm.func @pointerHostOnlyINTEL(!llvm.ptr)
spirv.func @pointerHostOnlyINTEL(!spirv.ptr<i1, HostOnlyINTEL>) "None"
diff --git a/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir b/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir
index cddc4ee..a754208 100644
--- a/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir
+++ b/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir
@@ -483,6 +483,17 @@ func.func @shuffle(%v0 : vector<1xi32>, %v1: vector<1xi32>) -> vector<2xi32> {
// -----
+// CHECK-LABEL: func @interleave
+// CHECK-SAME: (%[[ARG0:.+]]: vector<2xf32>, %[[ARG1:.+]]: vector<2xf32>)
+// CHECK: %[[SHUFFLE:.*]] = spirv.VectorShuffle [0 : i32, 2 : i32, 1 : i32, 3 : i32] %[[ARG0]], %[[ARG1]] : vector<2xf32>, vector<2xf32> -> vector<4xf32>
+// CHECK: return %[[SHUFFLE]]
+func.func @interleave(%a: vector<2xf32>, %b: vector<2xf32>) -> vector<4xf32> {
+ %0 = vector.interleave %a, %b : vector<2xf32>
+ return %0 : vector<4xf32>
+}
+
+// -----
+
// CHECK-LABEL: func @reduction_add
// CHECK-SAME: (%[[V:.+]]: vector<4xi32>)
// CHECK: %[[S0:.+]] = spirv.CompositeExtract %[[V]][0 : i32] : vector<4xi32>
diff --git a/mlir/test/Dialect/Affine/slicing-utils.mlir b/mlir/test/Dialect/Affine/slicing-utils.mlir
index 7437997..0848a92 100644
--- a/mlir/test/Dialect/Affine/slicing-utils.mlir
+++ b/mlir/test/Dialect/Affine/slicing-utils.mlir
@@ -28,15 +28,15 @@ func.func @slicing_test() {
// BWD: matched: %[[v1:.*]] {{.*}} backward static slice:
//
// FWDBWD: matched: %[[v1:.*]] {{.*}} static slice:
- // FWDBWD-DAG: %[[v4:.*]] = "slicing-test-op"() : () -> i4
- // FWDBWD-DAG: %[[v3:.*]] = "slicing-test-op"() : () -> i3
- // FWDBWD-NEXT: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
- // FWDBWD-DAG: %[[v2:.*]] = "slicing-test-op"() : () -> i2
- // FWDBWD-DAG: %[[v1:.*]] = "slicing-test-op"() : () -> i1
- // FWDBWD-NEXT: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
- // FWDBWD-DAG: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
- // FWDBWD-DAG: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
- // FWDBWD-NEXT: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
+ // FWDBWD: %[[v1:.*]] = "slicing-test-op"() : () -> i1
+ // FWDBWD: %[[v2:.*]] = "slicing-test-op"() : () -> i2
+ // FWDBWD: %[[v3:.*]] = "slicing-test-op"() : () -> i3
+ // FWDBWD: %[[v4:.*]] = "slicing-test-op"() : () -> i4
+ // FWDBWD: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
+ // FWDBWD: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
+ // FWDBWD: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
+ // FWDBWD: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
+ // FWDBWD: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
%1 = "slicing-test-op" () : () -> i1
@@ -49,15 +49,15 @@ func.func @slicing_test() {
// BWD: matched: %[[v2:.*]] {{.*}} backward static slice:
//
// FWDBWD-NEXT: matched: %[[v2:.*]] {{.*}} static slice:
- // FWDBWD-DAG: %[[v4:.*]] = "slicing-test-op"() : () -> i4
- // FWDBWD-DAG: %[[v3:.*]] = "slicing-test-op"() : () -> i3
- // FWDBWD-NEXT: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
- // FWDBWD-DAG: %[[v2:.*]] = "slicing-test-op"() : () -> i2
- // FWDBWD-DAG: %[[v1:.*]] = "slicing-test-op"() : () -> i1
- // FWDBWD-NEXT: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
- // FWDBWD-DAG: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
- // FWDBWD-DAG: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
- // FWDBWD-NEXT: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
+ // FWDBWD: %[[v1:.*]] = "slicing-test-op"() : () -> i1
+ // FWDBWD: %[[v2:.*]] = "slicing-test-op"() : () -> i2
+ // FWDBWD: %[[v3:.*]] = "slicing-test-op"() : () -> i3
+ // FWDBWD: %[[v4:.*]] = "slicing-test-op"() : () -> i4
+ // FWDBWD: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
+ // FWDBWD: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
+ // FWDBWD: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
+ // FWDBWD: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
+ // FWDBWD: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
%2 = "slicing-test-op" () : () -> i2
@@ -69,15 +69,15 @@ func.func @slicing_test() {
// BWD: matched: %[[v3:.*]] {{.*}} backward static slice:
//
// FWDBWD-NEXT: matched: %[[v3:.*]] {{.*}} static slice:
- // FWDBWD-DAG: %[[v2:.*]] = "slicing-test-op"() : () -> i2
- // FWDBWD-DAG: %[[v1:.*]] = "slicing-test-op"() : () -> i1
- // FWDBWD-NEXT: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
- // FWDBWD-NEXT: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
- // FWDBWD-DAG: %[[v4:.*]] = "slicing-test-op"() : () -> i4
- // FWDBWD-DAG: %[[v3:.*]] = "slicing-test-op"() : () -> i3
- // FWDBWD-NEXT: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
- // FWDBWD-NEXT: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
- // FWDBWD-NEXT: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
+ // FWDBWD: %[[v1:.*]] = "slicing-test-op"() : () -> i1
+ // FWDBWD: %[[v2:.*]] = "slicing-test-op"() : () -> i2
+ // FWDBWD: %[[v3:.*]] = "slicing-test-op"() : () -> i3
+ // FWDBWD: %[[v4:.*]] = "slicing-test-op"() : () -> i4
+ // FWDBWD: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
+ // FWDBWD: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
+ // FWDBWD: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
+ // FWDBWD: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
+ // FWDBWD: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
%3 = "slicing-test-op" () : () -> i3
@@ -89,15 +89,15 @@ func.func @slicing_test() {
// BWD: matched: %[[v4:.*]] {{.*}} backward static slice:
//
// FWDBWD-NEXT: matched: %[[v4:.*]] {{.*}} static slice:
- // FWDBWD-DAG: %[[v2:.*]] = "slicing-test-op"() : () -> i2
- // FWDBWD-DAG: %[[v1:.*]] = "slicing-test-op"() : () -> i1
- // FWDBWD-NEXT: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
- // FWDBWD-NEXT: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
- // FWDBWD-DAG: %[[v4:.*]] = "slicing-test-op"() : () -> i4
- // FWDBWD-DAG: %[[v3:.*]] = "slicing-test-op"() : () -> i3
- // FWDBWD-NEXT: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
- // FWDBWD-NEXT: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
- // FWDBWD-NEXT: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
+ // FWDBWD: %[[v1:.*]] = "slicing-test-op"() : () -> i1
+ // FWDBWD: %[[v2:.*]] = "slicing-test-op"() : () -> i2
+ // FWDBWD: %[[v3:.*]] = "slicing-test-op"() : () -> i3
+ // FWDBWD: %[[v4:.*]] = "slicing-test-op"() : () -> i4
+ // FWDBWD: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
+ // FWDBWD: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
+ // FWDBWD: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
+ // FWDBWD: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
+ // FWDBWD: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
%4 = "slicing-test-op" () : () -> i4
@@ -111,15 +111,15 @@ func.func @slicing_test() {
// BWD-DAG: %[[v2:.*]] = "slicing-test-op"() : () -> i2
//
// FWDBWD-NEXT: matched: %[[v5:.*]] {{.*}} static slice:
- // FWDBWD-DAG: %[[v4:.*]] = "slicing-test-op"() : () -> i4
- // FWDBWD-DAG: %[[v3:.*]] = "slicing-test-op"() : () -> i3
- // FWDBWD-NEXT: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
- // FWDBWD-DAG: %[[v2:.*]] = "slicing-test-op"() : () -> i2
- // FWDBWD-DAG: %[[v1:.*]] = "slicing-test-op"() : () -> i1
- // FWDBWD-NEXT: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
- // FWDBWD-DAG: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
- // FWDBWD-DAG: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
- // FWDBWD-NEXT: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
+ // FWDBWD: %[[v1:.*]] = "slicing-test-op"() : () -> i1
+ // FWDBWD: %[[v2:.*]] = "slicing-test-op"() : () -> i2
+ // FWDBWD: %[[v3:.*]] = "slicing-test-op"() : () -> i3
+ // FWDBWD: %[[v4:.*]] = "slicing-test-op"() : () -> i4
+ // FWDBWD: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
+ // FWDBWD: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
+ // FWDBWD: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
+ // FWDBWD: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
+ // FWDBWD: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
%5 = "slicing-test-op" (%1, %2) : (i1, i2) -> i5
@@ -132,15 +132,15 @@ func.func @slicing_test() {
// BWD-DAG: %[[v4:.*]] = "slicing-test-op"() : () -> i4
//
// FWDBWD-NEXT: matched: %[[v6:.*]] {{.*}} static slice:
- // FWDBWD-DAG: %[[v2:.*]] = "slicing-test-op"() : () -> i2
- // FWDBWD-DAG: %[[v1:.*]] = "slicing-test-op"() : () -> i1
- // FWDBWD-NEXT: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
- // FWDBWD-NEXT: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
- // FWDBWD-DAG: %[[v4:.*]] = "slicing-test-op"() : () -> i4
- // FWDBWD-DAG: %[[v3:.*]] = "slicing-test-op"() : () -> i3
- // FWDBWD-NEXT: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
- // FWDBWD-NEXT: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
- // FWDBWD-NEXT: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
+ // FWDBWD: %[[v1:.*]] = "slicing-test-op"() : () -> i1
+ // FWDBWD: %[[v2:.*]] = "slicing-test-op"() : () -> i2
+ // FWDBWD: %[[v3:.*]] = "slicing-test-op"() : () -> i3
+ // FWDBWD: %[[v4:.*]] = "slicing-test-op"() : () -> i4
+ // FWDBWD: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
+ // FWDBWD: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
+ // FWDBWD: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
+ // FWDBWD: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
+ // FWDBWD: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
%6 = "slicing-test-op" (%3, %4) : (i3, i4) -> i6
@@ -153,15 +153,15 @@ func.func @slicing_test() {
// BWD-NEXT: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
//
// FWDBWD-NEXT: matched: %[[v7:.*]] {{.*}} static slice:
- // FWDBWD-DAG: %[[v4:.*]] = "slicing-test-op"() : () -> i4
- // FWDBWD-DAG: %[[v3:.*]] = "slicing-test-op"() : () -> i3
- // FWDBWD-NEXT: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
- // FWDBWD-DAG: %[[v2:.*]] = "slicing-test-op"() : () -> i2
- // FWDBWD-DAG: %[[v1:.*]] = "slicing-test-op"() : () -> i1
+ // FWDBWD: %[[v1:.*]] = "slicing-test-op"() : () -> i1
+ // FWDBWD: %[[v2:.*]] = "slicing-test-op"() : () -> i2
+ // FWDBWD: %[[v3:.*]] = "slicing-test-op"() : () -> i3
+ // FWDBWD: %[[v4:.*]] = "slicing-test-op"() : () -> i4
// FWDBWD: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
- // FWDBWD-DAG: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
- // FWDBWD-DAG: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
- // FWDBWD-NEXT: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
+ // FWDBWD: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
+ // FWDBWD: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
+ // FWDBWD: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
+ // FWDBWD: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
%7 = "slicing-test-op" (%1, %5) : (i1, i5) -> i7
@@ -177,15 +177,15 @@ func.func @slicing_test() {
// BWD-NEXT: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
//
// FWDBWD-NEXT: matched: %[[v8:.*]] {{.*}} static slice:
- // FWDBWD-DAG: %[[v4:.*]] = "slicing-test-op"() : () -> i4
- // FWDBWD-DAG: %[[v3:.*]] = "slicing-test-op"() : () -> i3
- // FWDBWD-NEXT: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
- // FWDBWD-DAG: %[[v2:.*]] = "slicing-test-op"() : () -> i2
- // FWDBWD-DAG: %[[v1:.*]] = "slicing-test-op"() : () -> i1
- // FWDBWD-NEXT: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
- // FWDBWD-DAG: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
- // FWDBWD-DAG: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
- // FWDBWD-NEXT: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
+ // FWDBWD: %[[v1:.*]] = "slicing-test-op"() : () -> i1
+ // FWDBWD: %[[v2:.*]] = "slicing-test-op"() : () -> i2
+ // FWDBWD: %[[v3:.*]] = "slicing-test-op"() : () -> i3
+ // FWDBWD: %[[v4:.*]] = "slicing-test-op"() : () -> i4
+ // FWDBWD: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
+ // FWDBWD: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
+ // FWDBWD: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
+ // FWDBWD: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
+ // FWDBWD: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
%8 = "slicing-test-op" (%5, %6) : (i5, i6) -> i8
@@ -202,15 +202,15 @@ func.func @slicing_test() {
// BWD-NEXT: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
//
// FWDBWD-NEXT: matched: %[[v9:.*]] {{.*}} static slice:
- // FWDBWD-DAG: %[[v4:.*]] = "slicing-test-op"() : () -> i4
- // FWDBWD-DAG: %[[v3:.*]] = "slicing-test-op"() : () -> i3
- // FWDBWD-NEXT: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
- // FWDBWD-DAG: %[[v2:.*]] = "slicing-test-op"() : () -> i2
- // FWDBWD-DAG: %[[v1:.*]] = "slicing-test-op"() : () -> i1
- // FWDBWD-NEXT: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
- // FWDBWD-DAG: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
- // FWDBWD-DAG: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
- // FWDBWD-NEXT: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
+ // FWDBWD: %[[v1:.*]] = "slicing-test-op"() : () -> i1
+ // FWDBWD: %[[v2:.*]] = "slicing-test-op"() : () -> i2
+ // FWDBWD: %[[v3:.*]] = "slicing-test-op"() : () -> i3
+ // FWDBWD: %[[v4:.*]] = "slicing-test-op"() : () -> i4
+ // FWDBWD: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
+ // FWDBWD: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
+ // FWDBWD: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
+ // FWDBWD: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
+ // FWDBWD: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
%9 = "slicing-test-op" (%7, %8) : (i7, i8) -> i9
diff --git a/mlir/test/Dialect/Arith/canonicalize.mlir b/mlir/test/Dialect/Arith/canonicalize.mlir
index e4f95bb..1a387c2 100644
--- a/mlir/test/Dialect/Arith/canonicalize.mlir
+++ b/mlir/test/Dialect/Arith/canonicalize.mlir
@@ -2950,6 +2950,14 @@ func.func @unsignedExtendConstantResource() -> tensor<i16> {
return %ext : tensor<i16>
}
+// Just checks that this doesn't crash.
+// CHECK-LABEL: @signedExtendSplatAsDynamicShape
+func.func @signedExtendSplatAsDynamicShape() -> tensor<?xi64> {
+ %splat = arith.constant dense<5> : tensor<2xi16>
+ %extsplat = arith.extsi %splat : tensor<2xi16> to tensor<?xi64>
+ return %extsplat : tensor<?xi64>
+}
+
// CHECK-LABEL: @extsi_i0
// CHECK: %[[ZERO:.*]] = arith.constant 0 : i16
// CHECK: return %[[ZERO]] : i16
diff --git a/mlir/test/Dialect/Arith/int-range-interface.mlir b/mlir/test/Dialect/Arith/int-range-interface.mlir
index 16524b3..5b53819 100644
--- a/mlir/test/Dialect/Arith/int-range-interface.mlir
+++ b/mlir/test/Dialect/Arith/int-range-interface.mlir
@@ -758,7 +758,7 @@ func.func private @callee(%arg0: memref<?xindex, 4>) {
}
// CHECK-LABEL: func @test_i8_bounds
-// CHECK: test.reflect_bounds {smax = 127 : i8, smin = -128 : i8, umax = -1 : i8, umin = 0 : i8}
+// CHECK: test.reflect_bounds {smax = 127 : si8, smin = -128 : si8, umax = 255 : ui8, umin = 0 : ui8}
func.func @test_i8_bounds() -> i8 {
%cst1 = arith.constant 1 : i8
%0 = test.with_bounds { umin = 0 : i8, umax = 255 : i8, smin = -128 : i8, smax = 127 : i8 } : i8
@@ -766,3 +766,136 @@ func.func @test_i8_bounds() -> i8 {
%2 = test.reflect_bounds %1 : i8
return %2: i8
}
+
+// CHECK-LABEL: func @test_add_1
+// CHECK: test.reflect_bounds {smax = 127 : si8, smin = -128 : si8, umax = 255 : ui8, umin = 0 : ui8}
+func.func @test_add_1() -> i8 {
+ %cst1 = arith.constant 1 : i8
+ %0 = test.with_bounds { umin = 0 : i8, umax = 255 : i8, smin = -128 : i8, smax = 127 : i8 } : i8
+ %1 = arith.addi %0, %cst1 : i8
+ %2 = test.reflect_bounds %1 : i8
+ return %2: i8
+}
+
+// Tests below check inference with overflow flags.
+
+// CHECK-LABEL: func @test_add_i8_wrap1
+// CHECK: test.reflect_bounds {smax = 127 : si8, smin = -128 : si8, umax = 128 : ui8, umin = 1 : ui8}
+func.func @test_add_i8_wrap1() -> i8 {
+ %cst1 = arith.constant 1 : i8
+ %0 = test.with_bounds { umin = 0 : i8, umax = 127 : i8, smin = 0 : i8, smax = 127 : i8 } : i8
+ // smax overflow
+ %1 = arith.addi %0, %cst1 : i8
+ %2 = test.reflect_bounds %1 : i8
+ return %2: i8
+}
+
+// CHECK-LABEL: func @test_add_i8_wrap2
+// CHECK: test.reflect_bounds {smax = 127 : si8, smin = -128 : si8, umax = 128 : ui8, umin = 1 : ui8}
+func.func @test_add_i8_wrap2() -> i8 {
+ %cst1 = arith.constant 1 : i8
+ %0 = test.with_bounds { umin = 0 : i8, umax = 127 : i8, smin = 0 : i8, smax = 127 : i8 } : i8
+ // smax overflow
+ %1 = arith.addi %0, %cst1 overflow<nuw> : i8
+ %2 = test.reflect_bounds %1 : i8
+ return %2: i8
+}
+
+// CHECK-LABEL: func @test_add_i8_nowrap
+// CHECK: test.reflect_bounds {smax = 127 : si8, smin = 1 : si8, umax = 127 : ui8, umin = 1 : ui8}
+func.func @test_add_i8_nowrap() -> i8 {
+ %cst1 = arith.constant 1 : i8
+ %0 = test.with_bounds { umin = 0 : i8, umax = 127 : i8, smin = 0 : i8, smax = 127 : i8 } : i8
+ // nsw flag stops smax from overflowing
+ %1 = arith.addi %0, %cst1 overflow<nsw> : i8
+ %2 = test.reflect_bounds %1 : i8
+ return %2: i8
+}
+
+// CHECK-LABEL: func @test_sub_i8_wrap1
+// CHECK: test.reflect_bounds {smax = 5 : si8, smin = -10 : si8, umax = 255 : ui8, umin = 0 : ui8} %1 : i8
+func.func @test_sub_i8_wrap1() -> i8 {
+ %cst10 = arith.constant 10 : i8
+ %0 = test.with_bounds { umin = 0 : i8, umax = 15 : i8, smin = 0 : i8, smax = 15 : i8 } : i8
+ // umin underflows
+ %1 = arith.subi %0, %cst10 : i8
+ %2 = test.reflect_bounds %1 : i8
+ return %2: i8
+}
+
+// CHECK-LABEL: func @test_sub_i8_wrap2
+// CHECK: test.reflect_bounds {smax = 5 : si8, smin = -10 : si8, umax = 255 : ui8, umin = 0 : ui8} %1 : i8
+func.func @test_sub_i8_wrap2() -> i8 {
+ %cst10 = arith.constant 10 : i8
+ %0 = test.with_bounds { umin = 0 : i8, umax = 15 : i8, smin = 0 : i8, smax = 15 : i8 } : i8
+ // umin underflows
+ %1 = arith.subi %0, %cst10 overflow<nsw> : i8
+ %2 = test.reflect_bounds %1 : i8
+ return %2: i8
+}
+
+// CHECK-LABEL: func @test_sub_i8_nowrap
+// CHECK: test.reflect_bounds {smax = 5 : si8, smin = 0 : si8, umax = 5 : ui8, umin = 0 : ui8}
+func.func @test_sub_i8_nowrap() -> i8 {
+ %cst10 = arith.constant 10 : i8
+ %0 = test.with_bounds { umin = 0 : i8, umax = 15 : i8, smin = 0 : i8, smax = 15 : i8 } : i8
+ // nuw flag stops umin from underflowing
+ %1 = arith.subi %0, %cst10 overflow<nuw> : i8
+ %2 = test.reflect_bounds %1 : i8
+ return %2: i8
+}
+
+// CHECK-LABEL: func @test_mul_i8_wrap
+// CHECK: test.reflect_bounds {smax = 127 : si8, smin = -128 : si8, umax = 200 : ui8, umin = 100 : ui8}
+func.func @test_mul_i8_wrap() -> i8 {
+ %cst10 = arith.constant 10 : i8
+ %0 = test.with_bounds { umin = 10 : i8, umax = 20 : i8, smin = 10 : i8, smax = 20 : i8 } : i8
+ // smax overflows
+ %1 = arith.muli %0, %cst10 : i8
+ %2 = test.reflect_bounds %1 : i8
+ return %2: i8
+}
+
+// CHECK-LABEL: func @test_mul_i8_nowrap
+// CHECK: test.reflect_bounds {smax = 127 : si8, smin = 100 : si8, umax = 127 : ui8, umin = 100 : ui8}
+func.func @test_mul_i8_nowrap() -> i8 {
+ %cst10 = arith.constant 10 : i8
+ %0 = test.with_bounds { umin = 10 : i8, umax = 20 : i8, smin = 10 : i8, smax = 20 : i8 } : i8
+ // nsw stops overflow
+ %1 = arith.muli %0, %cst10 overflow<nsw> : i8
+ %2 = test.reflect_bounds %1 : i8
+ return %2: i8
+}
+
+// CHECK-LABEL: func @test_shl_i8_wrap1
+// CHECK: test.reflect_bounds {smax = 127 : si8, smin = -128 : si8, umax = 160 : ui8, umin = 80 : ui8}
+func.func @test_shl_i8_wrap1() -> i8 {
+ %cst3 = arith.constant 3 : i8
+ %0 = test.with_bounds { umin = 10 : i8, umax = 20 : i8, smin = 10 : i8, smax = 20 : i8 } : i8
+ // smax overflows
+ %1 = arith.shli %0, %cst3 : i8
+ %2 = test.reflect_bounds %1 : i8
+ return %2: i8
+}
+
+// CHECK-LABEL: func @test_shl_i8_wrap2
+// CHECK: test.reflect_bounds {smax = 127 : si8, smin = -128 : si8, umax = 160 : ui8, umin = 80 : ui8}
+func.func @test_shl_i8_wrap2() -> i8 {
+ %cst3 = arith.constant 3 : i8
+ %0 = test.with_bounds { umin = 10 : i8, umax = 20 : i8, smin = 10 : i8, smax = 20 : i8 } : i8
+ // smax overflows
+ %1 = arith.shli %0, %cst3 overflow<nuw> : i8
+ %2 = test.reflect_bounds %1 : i8
+ return %2: i8
+}
+
+// CHECK-LABEL: func @test_shl_i8_nowrap
+// CHECK: test.reflect_bounds {smax = 127 : si8, smin = 80 : si8, umax = 127 : ui8, umin = 80 : ui8}
+func.func @test_shl_i8_nowrap() -> i8 {
+ %cst3 = arith.constant 3 : i8
+  %0 = test.with_bounds { umin = 10 : i8, umax = 20 : i8, smin = 10 : i8, smax = 20 : i8 } : i8
+ // nsw stops smax overflow
+ %1 = arith.shli %0, %cst3 overflow<nsw> : i8
+ %2 = test.reflect_bounds %1 : i8
+ return %2: i8
+}
diff --git a/mlir/test/Dialect/Arith/int-range-opts.mlir b/mlir/test/Dialect/Arith/int-range-opts.mlir
index 6179003..dd62a48 100644
--- a/mlir/test/Dialect/Arith/int-range-opts.mlir
+++ b/mlir/test/Dialect/Arith/int-range-opts.mlir
@@ -75,7 +75,7 @@ func.func @test() -> i1 {
// -----
// CHECK-LABEL: func @test
-// CHECK: test.reflect_bounds {smax = 24 : i8, smin = 0 : i8, umax = 24 : i8, umin = 0 : i8}
+// CHECK: test.reflect_bounds {smax = 24 : si8, smin = 0 : si8, umax = 24 : ui8, umin = 0 : ui8}
func.func @test() -> i8 {
%cst1 = arith.constant 1 : i8
%i8val = test.with_bounds { umin = 0 : i8, umax = 12 : i8, smin = 0 : i8, smax = 12 : i8 } : i8
@@ -87,7 +87,7 @@ func.func @test() -> i8 {
// -----
// CHECK-LABEL: func @test
-// CHECK: test.reflect_bounds {smax = 127 : i8, smin = -128 : i8, umax = -1 : i8, umin = 0 : i8}
+// CHECK: test.reflect_bounds {smax = 127 : si8, smin = -128 : si8, umax = 254 : ui8, umin = 0 : ui8}
func.func @test() -> i8 {
%cst1 = arith.constant 1 : i8
%i8val = test.with_bounds { umin = 0 : i8, umax = 127 : i8, smin = 0 : i8, smax = 127 : i8 } : i8
diff --git a/mlir/test/Dialect/Arith/unsigned-when-equivalent.mlir b/mlir/test/Dialect/Arith/unsigned-when-equivalent.mlir
index ce77d3d..49bd74c 100644
--- a/mlir/test/Dialect/Arith/unsigned-when-equivalent.mlir
+++ b/mlir/test/Dialect/Arith/unsigned-when-equivalent.mlir
@@ -1,6 +1,6 @@
// RUN: mlir-opt -arith-unsigned-when-equivalent %s | FileCheck %s
-// CHECK-LABEL func @not_with_maybe_overflow
+// CHECK-LABEL: func @not_with_maybe_overflow
// CHECK: arith.divsi
// CHECK: arith.ceildivsi
// CHECK: arith.floordivsi
@@ -32,7 +32,7 @@ func.func @not_with_maybe_overflow(%arg0 : i32) {
func.return
}
-// CHECK-LABEL func @yes_with_no_overflow
+// CHECK-LABEL: func @yes_with_no_overflow
// CHECK: arith.divui
// CHECK: arith.ceildivui
// CHECK: arith.divui
diff --git a/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir b/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir
index 88fc8a8..fe4c005 100644
--- a/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir
+++ b/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir
@@ -366,15 +366,15 @@ func.func @avoidable_spill(%a: vector<[4]xf32>, %b: vector<[4]xf32>, %c: vector<
// CHECK-LIVE-RANGE-LABEL: @cond_branch_with_backedge
// CHECK-LIVE-RANGE: ^bb1:
-// CHECK-LIVE-RANGE--NEXT: ||| | arith.cmpi
-// CHECK-LIVE-RANGE--NEXT: EEE E cf.cond_br
+// CHECK-LIVE-RANGE-NEXT: ||| | arith.cmpi
+// CHECK-LIVE-RANGE-NEXT: EEE E cf.cond_br
//
-// CHECK-LIVE-RANGE--NEXT: ^[[BB3_COPIES:[[:alnum:]]+]]:
-// CHECK-LIVE-RANGE--NEXT: ||| ES arm_sme.copy_tile
-// CHECK-LIVE-RANGE--NEXT: E|| |S arm_sme.copy_tile
-// CHECK-LIVE-RANGE--NEXT: E| ||S arm_sme.copy_tile
-// CHECK-LIVE-RANGE--NEXT: E |||S arm_sme.copy_tile
-// CHECK-LIVE-RANGE--NEXT: EEEE cf.br
+// CHECK-LIVE-RANGE-NEXT: ^[[BB3_COPIES:[[:alnum:]]+]]:
+// CHECK-LIVE-RANGE-NEXT: ||| ES arm_sme.copy_tile
+// CHECK-LIVE-RANGE-NEXT: E|| |S arm_sme.copy_tile
+// CHECK-LIVE-RANGE-NEXT: E| ||S arm_sme.copy_tile
+// CHECK-LIVE-RANGE-NEXT: E |||S arm_sme.copy_tile
+// CHECK-LIVE-RANGE-NEXT: EEEE cf.br
//
// It is important to note that the first three live ranges in ^bb1 do not end
// at the `cf.cond_br` they are live-out via the backedge bb1 -> bb2 -> bb1.
@@ -389,15 +389,15 @@ func.func @avoidable_spill(%a: vector<[4]xf32>, %b: vector<[4]xf32>, %c: vector<
//
// CHECK-LIVE-RANGE: ========== Coalesced Live Ranges:
// CHECK-LIVE-RANGE: ^bb1:
-// CHECK-LIVE-RANGE--NEXT: |||| arith.cmpi
-// CHECK-LIVE-RANGE--NEXT: EEEE cf.cond_br
+// CHECK-LIVE-RANGE-NEXT: |||| arith.cmpi
+// CHECK-LIVE-RANGE-NEXT: EEEE cf.cond_br
//
-// CHECK-LIVE-RANGE--NEXT: ^[[BB3_COPIES]]:
-// CHECK-LIVE-RANGE--NEXT: |||| arm_sme.copy_tile
-// CHECK-LIVE-RANGE--NEXT: |||| arm_sme.copy_tile
-// CHECK-LIVE-RANGE--NEXT: |||| arm_sme.copy_tile
-// CHECK-LIVE-RANGE--NEXT: |||| arm_sme.copy_tile
-// CHECK-LIVE-RANGE--NEXT: EEEE cf.br
+// CHECK-LIVE-RANGE-NEXT: ^[[BB3_COPIES]]:
+// CHECK-LIVE-RANGE-NEXT: |||| arm_sme.copy_tile
+// CHECK-LIVE-RANGE-NEXT: |||| arm_sme.copy_tile
+// CHECK-LIVE-RANGE-NEXT: |||| arm_sme.copy_tile
+// CHECK-LIVE-RANGE-NEXT: |||| arm_sme.copy_tile
+// CHECK-LIVE-RANGE-NEXT: EEEE cf.br
// CHECK-LABEL: @cond_branch_with_backedge
// CHECK-NOT: tile_id = 16
diff --git a/mlir/test/Dialect/Bufferization/Transforms/lower-deallocations-func.mlir b/mlir/test/Dialect/Bufferization/Transforms/lower-deallocations-func.mlir
index 03cf10a..3de3a6a6 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/lower-deallocations-func.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/lower-deallocations-func.mlir
@@ -9,10 +9,10 @@ func.func @conversion_dealloc_simple(%arg0: memref<2xf32>, %arg1: i1) {
return
}
-// CHECk: scf.if [[ARG1]] {
-// CHECk-NEXT: memref.dealloc [[ARG0]] : memref<2xf32>
-// CHECk-NEXT: }
-// CHECk-NEXT: return
+// CHECK: scf.if [[ARG1]] {
+// CHECK-NEXT: memref.dealloc [[ARG0]] : memref<2xf32>
+// CHECK-NEXT: }
+// CHECK-NEXT: return
// -----
diff --git a/mlir/test/Dialect/Bufferization/Transforms/lower-deallocations.mlir b/mlir/test/Dialect/Bufferization/Transforms/lower-deallocations.mlir
index 2c69fca..5fedd45 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/lower-deallocations.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/lower-deallocations.mlir
@@ -29,10 +29,10 @@ func.func @conversion_dealloc_simple(%arg0: memref<2xf32>, %arg1: i1) {
return
}
-// CHECk: scf.if [[ARG1]] {
-// CHECk-NEXT: memref.dealloc [[ARG0]] : memref<2xf32>
-// CHECk-NEXT: }
-// CHECk-NEXT: return
+// CHECK: scf.if [[ARG1]] {
+// CHECK-NEXT: memref.dealloc [[ARG0]] : memref<2xf32>
+// CHECK-NEXT: }
+// CHECK-NEXT: return
// -----
diff --git a/mlir/test/Dialect/GPU/barrier-elimination.mlir b/mlir/test/Dialect/GPU/barrier-elimination.mlir
index 844dc7d..1f5b849 100644
--- a/mlir/test/Dialect/GPU/barrier-elimination.mlir
+++ b/mlir/test/Dialect/GPU/barrier-elimination.mlir
@@ -61,7 +61,7 @@ func.func @write_in_a_loop(%arg0: memref<?xf32>, %arg1: f32) attributes {__paral
return
}
-// CHECK-LABEL @read_read_write_loop
+// CHECK-LABEL: @read_read_write_loop
func.func @read_read_write_loop(%arg0: memref<?xf32>, %arg1: f32) attributes {__parallel_region_boundary_for_test} {
%c0 = arith.constant 0 : index
%c42 = arith.constant 42 : index
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index 511b018..ba7897f 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -227,7 +227,7 @@ module attributes {gpu.container_module} {
gpu.return
}
- // CHECK-LABEL gpu.func @printf_test
+ // CHECK-LABEL: gpu.func @printf_test
// CHECK: (%[[ARG0:.*]]: i32)
// CHECK: gpu.printf "Value: %d" %[[ARG0]] : i32
gpu.func @printf_test(%arg0 : i32) {
diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir
index 5e4724c..47ebe32 100644
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -123,7 +123,7 @@ llvm.func @launch_from_llvm_func() {
llvm.return
}
-// CHECK-DL-LABLE: gpu.module @launch_from_llvm_func_kernel attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+// CHECK-DL-LABEL: gpu.module @launch_from_llvm_func_kernel attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
// -----
diff --git a/mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir b/mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir
index 07e7197..732f40c 100644
--- a/mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir
+++ b/mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir
@@ -27,4 +27,4 @@ func.func @test_math(%arg0 : f32) {
gpu.terminator
}
return
-} \ No newline at end of file
+}
diff --git a/mlir/test/Dialect/LLVMIR/nvvm.mlir b/mlir/test/Dialect/LLVMIR/nvvm.mlir
index de2904d..a7bdceb 100644
--- a/mlir/test/Dialect/LLVMIR/nvvm.mlir
+++ b/mlir/test/Dialect/LLVMIR/nvvm.mlir
@@ -464,24 +464,24 @@ llvm.func private @mbarrier_test_wait_shared(%barrier: !llvm.ptr<3>, %token : i6
llvm.return
}
-// CHECK-LABEL : @wgmma_fence_aligned
+// CHECK-LABEL: @wgmma_fence_aligned
func.func @wgmma_fence_aligned() {
- // CHECK : nvvm.wgmma.fence.aligned
+ // CHECK: nvvm.wgmma.fence.aligned
nvvm.wgmma.fence.aligned
return
}
-// CHECK-LABEL : @wgmma_commit_group_sync_aligned
+// CHECK-LABEL: @wgmma_commit_group_sync_aligned
func.func @wgmma_commit_group_sync_aligned() {
- // CHECK : nvvm.wgmma.commit.group.sync.aligned
+ // CHECK: nvvm.wgmma.commit.group.sync.aligned
nvvm.wgmma.commit.group.sync.aligned
return
}
-// CHECK-LABEL : @wgmma_commit_group_sync_aligned
+// CHECK-LABEL: @wgmma_wait_group_sync_aligned
func.func @wgmma_wait_group_sync_aligned() {
- // CHECK : nvvm.wgmma.wait.group.sync.aligned
+ // CHECK: nvvm.wgmma.wait.group.sync.aligned
nvvm.wgmma.wait.group.sync.aligned 0
return
}
@@ -495,7 +495,7 @@ gpu.module @module_1 [#nvvm.target<chip = "sm_90", features = "+ptx70", link = [
gpu.module @module_2 [#nvvm.target<chip = "sm_90">, #nvvm.target<chip = "sm_80">, #nvvm.target<chip = "sm_70">] {
}
-// CHECK-LABEL : nvvm.grid_constant
+// CHECK-LABEL: nvvm.grid_constant
llvm.func @kernel_func(%arg0: !llvm.ptr {llvm.byval = i32, nvvm.grid_constant}) attributes {nvvm.kernel} {
llvm.return
}
diff --git a/mlir/test/Dialect/LLVMIR/type-consistency.mlir b/mlir/test/Dialect/LLVMIR/type-consistency.mlir
deleted file mode 100644
index c9c1355..0000000
--- a/mlir/test/Dialect/LLVMIR/type-consistency.mlir
+++ /dev/null
@@ -1,533 +0,0 @@
-// RUN: mlir-opt %s --pass-pipeline="builtin.module(llvm.func(llvm-type-consistency))" --split-input-file | FileCheck %s
-
-// CHECK-LABEL: llvm.func @same_address
-llvm.func @same_address(%arg: i32) {
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i32, i32, i32)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, i32, i32)> : (i32) -> !llvm.ptr
- // CHECK: = llvm.getelementptr %[[ALLOCA]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, i32, i32)>
- %7 = llvm.getelementptr %1[8] : (!llvm.ptr) -> !llvm.ptr, i8
- llvm.store %arg, %7 : i32, !llvm.ptr
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @same_address_keep_inbounds
-llvm.func @same_address_keep_inbounds(%arg: i32) {
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i32, i32, i32)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, i32, i32)> : (i32) -> !llvm.ptr
- // CHECK: = llvm.getelementptr inbounds %[[ALLOCA]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, i32, i32)>
- %7 = llvm.getelementptr inbounds %1[8] : (!llvm.ptr) -> !llvm.ptr, i8
- llvm.store %arg, %7 : i32, !llvm.ptr
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @index_in_final_padding
-llvm.func @index_in_final_padding(%arg: i32) {
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i32, i8)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, i8)> : (i32) -> !llvm.ptr
- // CHECK: = llvm.getelementptr %[[ALLOCA]][7] : (!llvm.ptr) -> !llvm.ptr, i8
- %7 = llvm.getelementptr %1[7] : (!llvm.ptr) -> !llvm.ptr, i8
- llvm.store %arg, %7 : i32, !llvm.ptr
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @index_out_of_bounds
-llvm.func @index_out_of_bounds(%arg: i32) {
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i32, i32)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, i32)> : (i32) -> !llvm.ptr
- // CHECK: = llvm.getelementptr %[[ALLOCA]][9] : (!llvm.ptr) -> !llvm.ptr, i8
- %7 = llvm.getelementptr %1[9] : (!llvm.ptr) -> !llvm.ptr, i8
- llvm.store %arg, %7 : i32, !llvm.ptr
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @index_in_padding
-llvm.func @index_in_padding(%arg: i16) {
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i16, i32)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i16, i32)> : (i32) -> !llvm.ptr
- // CHECK: = llvm.getelementptr %[[ALLOCA]][2] : (!llvm.ptr) -> !llvm.ptr, i8
- %7 = llvm.getelementptr %1[2] : (!llvm.ptr) -> !llvm.ptr, i8
- llvm.store %arg, %7 : i16, !llvm.ptr
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @index_not_in_padding_because_packed
-llvm.func @index_not_in_padding_because_packed(%arg: i16) {
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", packed (i16, i32)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", packed (i16, i32)> : (i32) -> !llvm.ptr
- // CHECK: = llvm.getelementptr %[[ALLOCA]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", packed (i16, i32)>
- %7 = llvm.getelementptr %1[2] : (!llvm.ptr) -> !llvm.ptr, i8
- llvm.store %arg, %7 : i16, !llvm.ptr
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @no_crash_on_negative_gep_index
-llvm.func @no_crash_on_negative_gep_index() {
- %0 = llvm.mlir.constant(1.000000e+00 : f16) : f16
- %1 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i32, i32, i32)>
- %2 = llvm.alloca %1 x !llvm.struct<"foo", (i32, i32, i32)> : (i32) -> !llvm.ptr
- // CHECK: llvm.getelementptr %[[ALLOCA]][-1] : (!llvm.ptr) -> !llvm.ptr, f32
- %3 = llvm.getelementptr %2[-1] : (!llvm.ptr) -> !llvm.ptr, f32
- llvm.store %0, %3 : f16, !llvm.ptr
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @coalesced_store_ints
-// CHECK-SAME: %[[ARG:.*]]: i64
-llvm.func @coalesced_store_ints(%arg: i64) {
- // CHECK-DAG: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
- // CHECK-DAG: %[[CST32:.*]] = llvm.mlir.constant(32 : i64) : i64
-
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i32, i32)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, i32)> : (i32) -> !llvm.ptr
-
- // CHECK: %[[SHR:.*]] = llvm.lshr %[[ARG]], %[[CST0]]
- // CHECK: %[[TRUNC:.*]] = llvm.trunc %[[SHR]] : i64 to i32
- // CHECK: llvm.store %[[TRUNC]], %[[ALLOCA]]
- // CHECK: %[[SHR:.*]] = llvm.lshr %[[ARG]], %[[CST32]] : i64
- // CHECK: %[[TRUNC:.*]] = llvm.trunc %[[SHR]] : i64 to i32
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, i32)>
- // CHECK: llvm.store %[[TRUNC]], %[[GEP]]
- llvm.store %arg, %1 : i64, !llvm.ptr
- // CHECK-NOT: llvm.store %[[ARG]], %[[ALLOCA]]
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @coalesced_store_ints_offset
-// CHECK-SAME: %[[ARG:.*]]: i64
-llvm.func @coalesced_store_ints_offset(%arg: i64) {
- // CHECK-DAG: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
- // CHECK-DAG: %[[CST32:.*]] = llvm.mlir.constant(32 : i64) : i64
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i64, i32, i32)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i64, i32, i32)> : (i32) -> !llvm.ptr
- %3 = llvm.getelementptr %1[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i64, i32, i32)>
-
- // CHECK: %[[SHR:.*]] = llvm.lshr %[[ARG]], %[[CST0]]
- // CHECK: %[[TRUNC:.*]] = llvm.trunc %[[SHR]] : i64 to i32
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i64, i32, i32)>
- // CHECK: llvm.store %[[TRUNC]], %[[GEP]]
- // CHECK: %[[SHR:.*]] = llvm.lshr %[[ARG]], %[[CST32]] : i64
- // CHECK: %[[TRUNC:.*]] = llvm.trunc %[[SHR]] : i64 to i32
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i64, i32, i32)>
- // CHECK: llvm.store %[[TRUNC]], %[[GEP]]
- llvm.store %arg, %3 : i64, !llvm.ptr
- // CHECK-NOT: llvm.store %[[ARG]], %[[ALLOCA]]
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @coalesced_store_floats
-// CHECK-SAME: %[[ARG:.*]]: i64
-llvm.func @coalesced_store_floats(%arg: i64) {
- // CHECK-DAG: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
- // CHECK-DAG: %[[CST32:.*]] = llvm.mlir.constant(32 : i64) : i64
- %0 = llvm.mlir.constant(1 : i32) : i32
-
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (f32, f32)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (f32, f32)> : (i32) -> !llvm.ptr
-
- // CHECK: %[[SHR:.*]] = llvm.lshr %[[ARG]], %[[CST0]]
- // CHECK: %[[TRUNC:.*]] = llvm.trunc %[[SHR]] : i64 to i32
- // CHECK: llvm.store %[[TRUNC]], %[[ALLOCA]]
- // CHECK: %[[SHR:.*]] = llvm.lshr %[[ARG]], %[[CST32]] : i64
- // CHECK: %[[TRUNC:.*]] = llvm.trunc %[[SHR]] : i64 to i32
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (f32, f32)>
- // CHECK: llvm.store %[[TRUNC]], %[[GEP]]
- llvm.store %arg, %1 : i64, !llvm.ptr
- // CHECK-NOT: llvm.store %[[ARG]], %[[ALLOCA]]
- llvm.return
-}
-
-// -----
-
-// Padding test purposefully not modified.
-
-// CHECK-LABEL: llvm.func @coalesced_store_padding_inbetween
-// CHECK-SAME: %[[ARG:.*]]: i64
-llvm.func @coalesced_store_padding_inbetween(%arg: i64) {
- %0 = llvm.mlir.constant(1 : i32) : i32
-
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i16, i32)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i16, i32)> : (i32) -> !llvm.ptr
- // CHECK: llvm.store %[[ARG]], %[[ALLOCA]]
- llvm.store %arg, %1 : i64, !llvm.ptr
- llvm.return
-}
-
-// -----
-
-// Padding test purposefully not modified.
-
-// CHECK-LABEL: llvm.func @coalesced_store_padding_end
-// CHECK-SAME: %[[ARG:.*]]: i64
-llvm.func @coalesced_store_padding_end(%arg: i64) {
- %0 = llvm.mlir.constant(1 : i32) : i32
-
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i32, i16)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, i16)> : (i32) -> !llvm.ptr
- // CHECK: llvm.store %[[ARG]], %[[ALLOCA]]
- llvm.store %arg, %1 : i64, !llvm.ptr
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @coalesced_store_past_end
-// CHECK-SAME: %[[ARG:.*]]: i64
-llvm.func @coalesced_store_past_end(%arg: i64) {
- %0 = llvm.mlir.constant(1 : i32) : i32
-
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i32)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32)> : (i32) -> !llvm.ptr
- // CHECK: llvm.store %[[ARG]], %[[ALLOCA]]
- llvm.store %arg, %1 : i64, !llvm.ptr
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @coalesced_store_packed_struct
-// CHECK-SAME: %[[ARG:.*]]: i64
-llvm.func @coalesced_store_packed_struct(%arg: i64) {
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK-DAG: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
- // CHECK-DAG: %[[CST16:.*]] = llvm.mlir.constant(16 : i64) : i64
- // CHECK-DAG: %[[CST48:.*]] = llvm.mlir.constant(48 : i64) : i64
-
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", packed (i16, i32, i16)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", packed (i16, i32, i16)> : (i32) -> !llvm.ptr
- // CHECK: %[[SHR:.*]] = llvm.lshr %[[ARG]], %[[CST0]]
- // CHECK: %[[TRUNC:.*]] = llvm.trunc %[[SHR]] : i64 to i16
- // CHECK: llvm.store %[[TRUNC]], %[[ALLOCA]]
- // CHECK: %[[SHR:.*]] = llvm.lshr %[[ARG]], %[[CST16]]
- // CHECK: %[[TRUNC:.*]] = llvm.trunc %[[SHR]] : i64 to i32
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", packed (i16, i32, i16)>
- // CHECK: llvm.store %[[TRUNC]], %[[GEP]]
- // CHECK: %[[SHR:.*]] = llvm.lshr %[[ARG]], %[[CST48]]
- // CHECK: %[[TRUNC:.*]] = llvm.trunc %[[SHR]] : i64 to i16
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", packed (i16, i32, i16)>
- // CHECK: llvm.store %[[TRUNC]], %[[GEP]]
- llvm.store %arg, %1 : i64, !llvm.ptr
- // CHECK-NOT: llvm.store %[[ARG]], %[[ALLOCA]]
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @vector_write_split
-// CHECK-SAME: %[[ARG:.*]]: vector<4xi32>
-llvm.func @vector_write_split(%arg: vector<4xi32>) {
- // CHECK-DAG: %[[CST0:.*]] = llvm.mlir.constant(0 : i32) : i32
- // CHECK-DAG: %[[CST1:.*]] = llvm.mlir.constant(1 : i32) : i32
- // CHECK-DAG: %[[CST2:.*]] = llvm.mlir.constant(2 : i32) : i32
- // CHECK-DAG: %[[CST3:.*]] = llvm.mlir.constant(3 : i32) : i32
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i32, i32, i32, i32)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, i32, i32, i32)> : (i32) -> !llvm.ptr
-
- // CHECK: %[[EXTRACT:.*]] = llvm.extractelement %[[ARG]][%[[CST0]] : i32] : vector<4xi32>
- // CHECK: llvm.store %[[EXTRACT]], %[[ALLOCA]] : i32, !llvm.ptr
-
- // CHECK: %[[EXTRACT:.*]] = llvm.extractelement %[[ARG]][%[[CST1]] : i32] : vector<4xi32>
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, i32, i32, i32)>
- // CHECK: llvm.store %[[EXTRACT]], %[[GEP]] : i32, !llvm.ptr
-
- // CHECK: %[[EXTRACT:.*]] = llvm.extractelement %[[ARG]][%[[CST2]] : i32] : vector<4xi32>
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, i32, i32, i32)>
- // CHECK: llvm.store %[[EXTRACT]], %[[GEP]] : i32, !llvm.ptr
-
- // CHECK: %[[EXTRACT:.*]] = llvm.extractelement %[[ARG]][%[[CST3]] : i32] : vector<4xi32>
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, i32, i32, i32)>
- // CHECK: llvm.store %[[EXTRACT]], %[[GEP]] : i32, !llvm.ptr
-
- llvm.store %arg, %1 : vector<4xi32>, !llvm.ptr
- // CHECK-NOT: llvm.store %[[ARG]], %[[ALLOCA]]
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @vector_write_split_offset
-// CHECK-SAME: %[[ARG:.*]]: vector<4xi32>
-llvm.func @vector_write_split_offset(%arg: vector<4xi32>) {
- // CHECK-DAG: %[[CST0:.*]] = llvm.mlir.constant(0 : i32) : i32
- // CHECK-DAG: %[[CST1:.*]] = llvm.mlir.constant(1 : i32) : i32
- // CHECK-DAG: %[[CST2:.*]] = llvm.mlir.constant(2 : i32) : i32
- // CHECK-DAG: %[[CST3:.*]] = llvm.mlir.constant(3 : i32) : i32
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i64, i32, i32, i32, i32)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i64, i32, i32, i32, i32)> : (i32) -> !llvm.ptr
- %2 = llvm.getelementptr %1[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i64, i32, i32, i32, i32)>
-
- // CHECK: %[[EXTRACT:.*]] = llvm.extractelement %[[ARG]][%[[CST0]] : i32] : vector<4xi32>
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i64, i32, i32, i32, i32)>
- // CHECK: llvm.store %[[EXTRACT]], %[[GEP]] : i32, !llvm.ptr
-
- // CHECK: %[[EXTRACT:.*]] = llvm.extractelement %[[ARG]][%[[CST1]] : i32] : vector<4xi32>
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i64, i32, i32, i32, i32)>
- // CHECK: llvm.store %[[EXTRACT]], %[[GEP]] : i32, !llvm.ptr
-
- // CHECK: %[[EXTRACT:.*]] = llvm.extractelement %[[ARG]][%[[CST2]] : i32] : vector<4xi32>
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i64, i32, i32, i32, i32)>
- // CHECK: llvm.store %[[EXTRACT]], %[[GEP]] : i32, !llvm.ptr
-
- // CHECK: %[[EXTRACT:.*]] = llvm.extractelement %[[ARG]][%[[CST3]] : i32] : vector<4xi32>
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 4] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i64, i32, i32, i32, i32)>
- // CHECK: llvm.store %[[EXTRACT]], %[[GEP]] : i32, !llvm.ptr
-
- llvm.store %arg, %2 : vector<4xi32>, !llvm.ptr
- // CHECK-NOT: llvm.store %[[ARG]], %[[ALLOCA]]
- llvm.return
-}
-
-// -----
-
-// Small test that a split vector store will be further optimized (to than e.g.
-// split integer loads to structs as shown here)
-
-// CHECK-LABEL: llvm.func @vector_write_split_struct
-// CHECK-SAME: %[[ARG:.*]]: vector<2xi64>
-llvm.func @vector_write_split_struct(%arg: vector<2xi64>) {
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i32, i32, i32, i32)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, i32, i32, i32)> : (i32) -> !llvm.ptr
-
- // CHECK-COUNT-4: llvm.store %{{.*}}, %{{.*}} : i32, !llvm.ptr
-
- llvm.store %arg, %1 : vector<2xi64>, !llvm.ptr
- // CHECK-NOT: llvm.store %[[ARG]], %[[ALLOCA]]
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @gep_split
-// CHECK-SAME: %[[ARG:.*]]: i64
-llvm.func @gep_split(%arg: i64) {
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.array<2 x struct<"foo", (i64)>>
- %1 = llvm.alloca %0 x !llvm.array<2 x struct<"foo", (i64)>> : (i32) -> !llvm.ptr
- %3 = llvm.getelementptr %1[0, 1, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<2 x struct<"foo", (i64)>>
- // CHECK: %[[TOP_GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<2 x struct<"foo", (i64)>>
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[TOP_GEP]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i64)>
- // CHECK: llvm.store %[[ARG]], %[[GEP]]
- llvm.store %arg, %3 : i64, !llvm.ptr
- // CHECK-NOT: llvm.store %[[ARG]], %[[ALLOCA]]
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @coalesced_store_ints_subaggregate
-// CHECK-SAME: %[[ARG:.*]]: i64
-llvm.func @coalesced_store_ints_subaggregate(%arg: i64) {
- // CHECK-DAG: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
- // CHECK-DAG: %[[CST32:.*]] = llvm.mlir.constant(32 : i64) : i64
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i64, struct<(i32, i32)>)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i64, struct<(i32, i32)>)> : (i32) -> !llvm.ptr
- %3 = llvm.getelementptr %1[0, 1, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i64, struct<(i32, i32)>)>
-
- // CHECK: %[[TOP_GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i64, struct<(i32, i32)>)>
- // CHECK: %[[SHR:.*]] = llvm.lshr %[[ARG]], %[[CST0]]
- // CHECK: %[[TRUNC:.*]] = llvm.trunc %[[SHR]] : i64 to i32
- // CHECK: llvm.store %[[TRUNC]], %[[TOP_GEP]]
- // CHECK: %[[SHR:.*]] = llvm.lshr %[[ARG]], %[[CST32]] : i64
- // CHECK: %[[TRUNC:.*]] = llvm.trunc %[[SHR]] : i64 to i32
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[TOP_GEP]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i32, i32)>
- // CHECK: llvm.store %[[TRUNC]], %[[GEP]]
- llvm.store %arg, %3 : i64, !llvm.ptr
- // CHECK-NOT: llvm.store %[[ARG]], %[[ALLOCA]]
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @gep_result_ptr_type_dynamic
-// CHECK-SAME: %[[ARG:.*]]: i64
-llvm.func @gep_result_ptr_type_dynamic(%arg: i64) {
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.array<2 x struct<"foo", (i64)>>
- %1 = llvm.alloca %0 x !llvm.array<2 x struct<"foo", (i64)>> : (i32) -> !llvm.ptr
- %3 = llvm.getelementptr %1[0, %arg, 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.array<2 x struct<"foo", (i64)>>
- // CHECK: %[[TOP_GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, %[[ARG]]] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.array<2 x struct<"foo", (i64)>>
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[TOP_GEP]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i64)>
- // CHECK: llvm.store %[[ARG]], %[[GEP]]
- llvm.store %arg, %3 : i64, !llvm.ptr
- // CHECK-NOT: llvm.store %[[ARG]], %[[ALLOCA]]
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @overlapping_int_aggregate_store
-// CHECK-SAME: %[[ARG:.*]]: i64
-llvm.func @overlapping_int_aggregate_store(%arg: i64) {
- // CHECK-DAG: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
- // CHECK-DAG: %[[CST16:.*]] = llvm.mlir.constant(16 : i64) : i64
-
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i16, struct<(i16, i16, i16)>)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i16, struct<(i16, i16, i16)>)> : (i32) -> !llvm.ptr
-
- // CHECK: %[[SHR:.*]] = llvm.lshr %[[ARG]], %[[CST0]]
- // CHECK: %[[TRUNC:.*]] = llvm.trunc %[[SHR]] : i64 to i16
- // CHECK: llvm.store %[[TRUNC]], %[[ALLOCA]]
-
- // CHECK: %[[SHR:.*]] = llvm.lshr %[[ARG]], %[[CST16]] : i64
- // CHECK: [[TRUNC:.*]] = llvm.trunc %[[SHR]] : i64 to i48
- // CHECK: %[[TOP_GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i16, struct<(i16, i16, i16)>)>
-
- // Normal integer splitting of [[TRUNC]] follows:
-
- // CHECK: llvm.store %{{.*}}, %[[TOP_GEP]]
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[TOP_GEP]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i16, i16, i16)>
- // CHECK: llvm.store %{{.*}}, %[[GEP]]
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[TOP_GEP]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i16, i16, i16)>
- // CHECK: llvm.store %{{.*}}, %[[GEP]]
-
- llvm.store %arg, %1 : i64, !llvm.ptr
- // CHECK-NOT: llvm.store %[[ARG]], %[[ALLOCA]]
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @overlapping_vector_aggregate_store
-// CHECK-SAME: %[[ARG:.*]]: vector<4xi16>
-llvm.func @overlapping_vector_aggregate_store(%arg: vector<4 x i16>) {
- // CHECK-DAG: %[[CST0:.*]] = llvm.mlir.constant(0 : i32) : i32
- // CHECK-DAG: %[[CST1:.*]] = llvm.mlir.constant(1 : i32) : i32
- // CHECK-DAG: %[[CST2:.*]] = llvm.mlir.constant(2 : i32) : i32
- // CHECK-DAG: %[[CST3:.*]] = llvm.mlir.constant(3 : i32) : i32
-
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i16, struct<(i16, i16, i16)>)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i16, struct<(i16, i16, i16)>)> : (i32) -> !llvm.ptr
-
- // CHECK: %[[EXTRACT:.*]] = llvm.extractelement %[[ARG]][%[[CST0]] : i32]
- // CHECK: llvm.store %[[EXTRACT]], %[[ALLOCA]]
-
- // CHECK: %[[EXTRACT:.*]] = llvm.extractelement %[[ARG]][%[[CST1]] : i32]
- // CHECK: %[[GEP0:.*]] = llvm.getelementptr %[[ALLOCA]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i16, struct<(i16, i16, i16)>)>
- // CHECK: llvm.store %[[EXTRACT]], %[[GEP0]]
-
- // CHECK: %[[EXTRACT:.*]] = llvm.extractelement %[[ARG]][%[[CST2]] : i32]
- // CHECK: %[[GEP0:.*]] = llvm.getelementptr %[[ALLOCA]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i16, struct<(i16, i16, i16)>)>
- // CHECK: %[[GEP1:.*]] = llvm.getelementptr %[[GEP0]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i16, i16, i16)>
- // CHECK: llvm.store %[[EXTRACT]], %[[GEP1]]
-
- // CHECK: %[[EXTRACT:.*]] = llvm.extractelement %[[ARG]][%[[CST3]] : i32]
- // CHECK: %[[GEP0:.*]] = llvm.getelementptr %[[ALLOCA]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i16, struct<(i16, i16, i16)>)>
- // CHECK: %[[GEP1:.*]] = llvm.getelementptr %[[GEP0]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i16, i16, i16)>
- // CHECK: llvm.store %[[EXTRACT]], %[[GEP1]]
-
- llvm.store %arg, %1 : vector<4 x i16>, !llvm.ptr
- // CHECK-NOT: llvm.store %[[ARG]], %[[ALLOCA]]
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @partially_overlapping_aggregate_store
-// CHECK-SAME: %[[ARG:.*]]: i64
-llvm.func @partially_overlapping_aggregate_store(%arg: i64) {
- // CHECK-DAG: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
- // CHECK-DAG: %[[CST16:.*]] = llvm.mlir.constant(16 : i64) : i64
-
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i16, struct<(i16, i16, i16, i16)>)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i16, struct<(i16, i16, i16, i16)>)> : (i32) -> !llvm.ptr
-
- // CHECK: %[[SHR:.*]] = llvm.lshr %[[ARG]], %[[CST0]]
- // CHECK: %[[TRUNC:.*]] = llvm.trunc %[[SHR]] : i64 to i16
- // CHECK: llvm.store %[[TRUNC]], %[[ALLOCA]]
-
- // CHECK: %[[SHR:.*]] = llvm.lshr %[[ARG]], %[[CST16]] : i64
- // CHECK: [[TRUNC:.*]] = llvm.trunc %[[SHR]] : i64 to i48
- // CHECK: %[[TOP_GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i16, struct<(i16, i16, i16, i16)>)>
-
- // Normal integer splitting of [[TRUNC]] follows:
-
- // CHECK: llvm.store %{{.*}}, %[[TOP_GEP]]
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[TOP_GEP]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i16, i16, i16, i16)>
- // CHECK: llvm.store %{{.*}}, %[[GEP]]
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[TOP_GEP]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i16, i16, i16, i16)>
- // CHECK: llvm.store %{{.*}}, %[[GEP]]
-
- // It is important that there are no more stores at this point.
- // Specifically, a store into the fourth field of %[[TOP_GEP]] would
- // incorrectly change the semantics of the code.
- // CHECK-NOT: llvm.store %{{.*}}, %{{.*}}
-
- llvm.store %arg, %1 : i64, !llvm.ptr
-
- llvm.return
-}
-
-// -----
-
-// Here a split is undesirable since the store only partially covers one of the fields.
-
-// CHECK-LABEL: llvm.func @undesirable_overlapping_aggregate_store
-// CHECK-SAME: %[[ARG:.*]]: i64
-llvm.func @undesirable_overlapping_aggregate_store(%arg: i64) {
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i32, i32, struct<(i64, i16, i16, i16)>)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, i32, struct<(i64, i16, i16, i16)>)> : (i32) -> !llvm.ptr
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, i32, struct<(i64, i16, i16, i16)>)>
- %2 = llvm.getelementptr %1[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, i32, struct<(i64, i16, i16, i16)>)>
- // CHECK: llvm.store %[[ARG]], %[[GEP]]
- llvm.store %arg, %2 : i64, !llvm.ptr
-
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @coalesced_store_ints_array
-// CHECK-SAME: %[[ARG:.*]]: i64
-llvm.func @coalesced_store_ints_array(%arg: i64) {
- // CHECK-DAG: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
- // CHECK-DAG: %[[CST32:.*]] = llvm.mlir.constant(32 : i64) : i64
-
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.array<2 x i32>
- %1 = llvm.alloca %0 x !llvm.array<2 x i32> : (i32) -> !llvm.ptr
-
- // CHECK: %[[SHR:.*]] = llvm.lshr %[[ARG]], %[[CST0]]
- // CHECK: %[[TRUNC:.*]] = llvm.trunc %[[SHR]] : i64 to i32
- // CHECK: llvm.store %[[TRUNC]], %[[ALLOCA]]
- // CHECK: %[[SHR:.*]] = llvm.lshr %[[ARG]], %[[CST32]] : i64
- // CHECK: %[[TRUNC:.*]] = llvm.trunc %[[SHR]] : i64 to i32
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<2 x i32>
- // CHECK: llvm.store %[[TRUNC]], %[[GEP]]
- llvm.store %arg, %1 : i64, !llvm.ptr
- // CHECK-NOT: llvm.store %[[ARG]], %[[ALLOCA]]
- llvm.return
-}
diff --git a/mlir/test/Dialect/Linalg/block-pack-matmul.mlir b/mlir/test/Dialect/Linalg/block-pack-matmul.mlir
index cc9af91..8a82608 100644
--- a/mlir/test/Dialect/Linalg/block-pack-matmul.mlir
+++ b/mlir/test/Dialect/Linalg/block-pack-matmul.mlir
@@ -476,3 +476,32 @@ func.func @block_generic_matmul_transpose_b(
// CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16]
// CHECK-SAME: into %[[C]] : tensor<2x4x32x16xf32> -> tensor<64x64xf32>
// CHECK: return %[[RES_UNPACKED]] : tensor<64x64xf32>
+
+// -----
+
+#map = affine_map<(d0, d1) -> (d0, d1)>
+
+func.func @non_contraction_generic(
+ %A: tensor<64x128xf32>) -> tensor<64x128xf32> {
+ %c0 = arith.constant 0.000000e+00 : f32
+ %0 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel", "parallel"]}
+ outs(%A : tensor<64x128xf32>) {
+ ^bb0(%out: f32):
+ %1 = arith.maximumf %out, %c0 : f32
+ linalg.yield %1 : f32
+ } -> tensor<64x128xf32>
+ return %0 : tensor<64x128xf32>
+}
+
+// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1) -> (d0, d1)>
+
+// CHECK-LABEL: func @non_contraction_generic(
+// CHECK-SAME: %[[A:[0-9a-z]+]]: tensor<64x128xf32>
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0.000000e+00 : f32
+// CHECK-NOT: tensor.pack
+// CHECK: %[[GENERIC:.+]] = linalg.generic
+// CHECK-SAME: indexing_maps = [#[[$MAP]]]
+// CHECK-SAME: iterator_types = ["parallel", "parallel"]
+// CHECK-SAME: outs(%[[A]] : tensor<64x128xf32>)
+// CHECK-NOT: tensor.unpack
+// CHECK: return %[[GENERIC]] : tensor<64x128xf32>
diff --git a/mlir/test/Dialect/Linalg/data-layout-propagation.mlir b/mlir/test/Dialect/Linalg/data-layout-propagation.mlir
index bee0850..9140904 100644
--- a/mlir/test/Dialect/Linalg/data-layout-propagation.mlir
+++ b/mlir/test/Dialect/Linalg/data-layout-propagation.mlir
@@ -795,7 +795,7 @@ func.func @reduction_pack_transpose_inner_dims(%arg0: tensor<128x256x32xi32>,
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]
// CHECK: %[[ARG1_EMPTY:.+]] = tensor.empty() : tensor<4x16x16x32xi32>
// CHECK: %[[PACK_ARG1:.+]] = tensor.pack %[[ARG1]]
-// CHECK-SME: inner_dims_pos = [1, 0] inner_tiles = [16, 32]
+// CHECK-SAME: inner_dims_pos = [1, 0] inner_tiles = [16, 32]
// CHECK-SAME: into %[[ARG1_EMPTY]]
// CHECK: %[[ARG0_EMPTY:.+]] = tensor.empty() : tensor<4x16x32x16x32xi32>
// CHECK: %[[PACK_ARG0:.+]] = tensor.pack %[[ARG0]]
diff --git a/mlir/test/Dialect/Linalg/mesh-sharding-propagation.mlir b/mlir/test/Dialect/Linalg/mesh-sharding-propagation.mlir
new file mode 100644
index 0000000..59fd548
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/mesh-sharding-propagation.mlir
@@ -0,0 +1,34 @@
+// RUN: mlir-opt \
+// RUN: --verify-each \
+// RUN: --pass-pipeline="builtin.module(func.func(sharding-propagation))" \
+// RUN: %s | FileCheck %s
+
+mesh.mesh @mesh_2(shape = 2)
+
+// CHECK-LABEL: func @matmul_shard_parallel_axis
+func.func @matmul_shard_parallel_axis(
+ // CHECK-SAME: %[[IN1:[A-Za-z0-9_]+]]: tensor<2x3xf32>,
+ %arg0 : tensor<2x3xf32>,
+ // CHECK-SAME: %[[IN2:[A-Za-z0-9_]+]]: tensor<3x2xf32>,
+ %arg1 : tensor<3x2xf32>,
+ // CHECK-SAME: %[[DPS_OUT:[A-Za-z0-9_]+]]: tensor<2x2xf32>
+ %out_dps: tensor<2x2xf32>
+) -> tensor<2x2xf32> {
+ // CHECK: %[[IN1_ANNOTATED_0:.*]] = mesh.shard %[[IN1]] to <@mesh_2, {{\[}}[0]]> : tensor<2x3xf32>
+ // CHECK: %[[IN1_ANNOTATED_1:.*]] = mesh.shard %[[IN1_ANNOTATED_0]] to <@mesh_2, {{\[}}[0]]> annotate_for_users : tensor<2x3xf32>
+ // CHECK: %[[IN2_ANNOTATED:.*]] = mesh.shard %[[IN2]] to <@mesh_2, []> annotate_for_users : tensor<3x2xf32>
+ // CHECK: %[[DPS_OUT_ANNOTATED:.*]] = mesh.shard %[[DPS_OUT]] to <@mesh_2, {{\[}}[0]]> annotate_for_users : tensor<2x2xf32>
+ %arg0_sharded = mesh.shard %arg0 to <@mesh_2, [[0]]> : tensor<2x3xf32>
+
+ // CHECK: %[[RES:.*]] = linalg.matmul ins(%[[IN1_ANNOTATED_1]], %[[IN2_ANNOTATED]] : tensor<2x3xf32>, tensor<3x2xf32>)
+ // CHECK-SAME: outs(%[[DPS_OUT_ANNOTATED]] : tensor<2x2xf32>) -> tensor<2x2xf32>
+ %res = linalg.matmul ins(%arg0_sharded, %arg1 : tensor<2x3xf32>, tensor<3x2xf32>)
+ outs(%out_dps : tensor<2x2xf32>) -> tensor<2x2xf32>
+
+ // CHECK: %[[RES_ANNOTATED_0:.*]] = mesh.shard %[[RES]] to <@mesh_2, {{\[}}[0]]> : tensor<2x2xf32>
+ // CHECK: %[[RES_ANNOTATED_1:.*]] = mesh.shard %[[RES_ANNOTATED_0]] to <@mesh_2, {{\[}}[]]> annotate_for_users : tensor<2x2xf32>
+ %res_sharded = mesh.shard %res to <@mesh_2, [[]]> annotate_for_users : tensor<2x2xf32>
+
+ // CHECK: return %[[RES_ANNOTATED_1]] : tensor<2x2xf32>
+ return %res_sharded : tensor<2x2xf32>
+}
diff --git a/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir b/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir
index 0e15127..f3cf7c4 100644
--- a/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir
+++ b/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir
@@ -80,13 +80,14 @@ module attributes {transform.with_named_sequence} {
// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0)[s0] -> (-d0 + s0, 5)>
// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0, d1) -> (d0, d1)>
-// CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0, d1) -> (d1)>
+// CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0, d1) -> (d1, d0)>
+// CHECK-DAG: #[[MAP3:.*]] = affine_map<(d0, d1) -> (d1)>
// CHECK: func @reduction_tile_transpose
// CHECK: tensor.empty(%{{.*}}) : tensor<5x?xf32>
// CHECK: linalg.fill {{.*}} : tensor<5x?xf32>) -> tensor<5x?xf32>
// CHECK: scf.for
// CHECK: %[[EXT:.*]] = tensor.extract_slice %[[ARG3:.*]][0, 0] [%[[D0:.*]], %[[D1:.*]]] [1, 1] : tensor<5x?xf32> to tensor<?x?xf32>
-// CHECK: %[[R:.*]] = linalg.generic {indexing_maps = [#[[MAP1]], #[[MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%[[L:.*]] : tensor<?x?xf32>) outs(%[[EXT]] : tensor<?x?xf32>)
+// CHECK: %[[R:.*]] = linalg.generic {indexing_maps = [#[[MAP1]], #[[MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[L:.*]] : tensor<?x?xf32>) outs(%[[EXT]] : tensor<?x?xf32>)
// CHECK: %[[INS:.*]] = tensor.insert_slice %[[R]] into %[[ARG3]][0, 0] [%[[D0]], %[[D1]]] [1, 1] : tensor<?x?xf32> into tensor<5x?xf32>
// CHECK: scf.yield {{.*}} : tensor<5x?xf32>
// CHECK: }
@@ -403,3 +404,48 @@ module {
// CHECK: scf.yield %[[L1]] : tensor<4096x2x64xf32>
// CHECK: %[[OUT2:.*]] = linalg.generic {indexing_maps = [{{.*}}, {{.*}}], iterator_types = ["parallel", "reduction", "reduction"]} ins(%{{.*}} : tensor<4096x2x64xf32>) outs(%{{.*}} : tensor<4096xf32>)
// CHECK: return %[[OUT2]] : tensor<4096xf32>
+
+// -----
+
+func.func @reduction_tile_multiple_results(%arg0: tensor<?x?xf32>, %out: tensor<?xf32>, %out2: tensor<?xf32>) -> (tensor<?xf32>, tensor<?xf32>) {
+ %red:2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+ affine_map<(d0, d1) -> (d0)>,
+ affine_map<(d0, d1) -> (d0)>],
+ iterator_types = ["parallel", "reduction"]}
+ ins(%arg0 : tensor<?x?xf32>)
+ outs(%out, %out2 : tensor<?xf32>, tensor<?xf32>) {
+ ^bb0(%arg7: f32, %arg9: f32, %arg9_1: f32):
+ %1 = arith.mulf %arg7, %arg7 : f32
+ %2 = arith.addf %1, %arg9 : f32
+ %3 = arith.maximumf %1, %arg9_1 : f32
+ linalg.yield %2, %3 : f32, f32
+ } -> (tensor<?xf32>, tensor<?xf32>)
+ return %red#0, %red#1 : tensor<?xf32>, tensor<?xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %1, %12, %2, %3, %loop = transform.structured.tile_reduction_using_for %0
+ by tile_sizes = [0, 5] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+ transform.yield
+ }
+}
+
+// CHECK: func @reduction_tile_multiple_results
+// CHECK-DAG: %[[SUM_ID:.+]] = arith.constant 0.000000e+00 : f32
+// CHECK-DAG: %[[MAX_ID:.+]] = arith.constant 0xFF800000 : f32
+// CHECK-DAG: %[[SUM_INIT:.+]] = linalg.fill ins(%[[SUM_ID]] : f32) outs(%{{.*}} : tensor<?x5xf32>) -> tensor<?x5xf32>
+// CHECK-DAG: %[[MAX_INIT:.+]] = linalg.fill ins(%[[MAX_ID]] : f32) outs(%{{.*}} : tensor<?x5xf32>) -> tensor<?x5xf32>
+// CHECK: %[[OUT:.+]]:2 = scf.for
+// CHECK-SAME: iter_args(%[[SUM:.+]] = %[[SUM_INIT]], %[[MAX:.+]] = %[[MAX_INIT]])
+// CHECK: %[[UPDATED:.*]]:2 = linalg.generic
+// CHECK: arith.mulf
+// CHECK: arith.addf
+// CHECK: arith.maximumf
+// CHECK: %[[INSERT1:.+]] = tensor.insert_slice %[[UPDATED]]#0 into %[[SUM]]
+// CHECK: %[[INSERT2:.+]] = tensor.insert_slice %[[UPDATED]]#1 into %[[MAX]]
+// CHECK: scf.yield %[[INSERT1]], %[[INSERT2]]
+// CHECK: linalg.generic
+// CHECK: arith.addf
+// CHECK: arith.maximumf
diff --git a/mlir/test/Dialect/Math/expand-math.mlir b/mlir/test/Dialect/Math/expand-math.mlir
index 016a7bb..c10a78c 100644
--- a/mlir/test/Dialect/Math/expand-math.mlir
+++ b/mlir/test/Dialect/Math/expand-math.mlir
@@ -221,7 +221,7 @@ func.func @roundf_func(%a: f32) -> f32 {
// CHECK-LABEL: func @powf_func
// CHECK-SAME: ([[ARG0:%.+]]: f64, [[ARG1:%.+]]: f64)
func.func @powf_func(%a: f64, %b: f64) ->f64 {
- // CHECK-DAG = [[CST0:%.+]] = arith.constant 0.000000e+00
+ // CHECK-DAG: [[CST0:%.+]] = arith.constant 0.000000e+00
// CHECK-DAG: [[TWO:%.+]] = arith.constant 2.000000e+00
// CHECK-DAG: [[NEGONE:%.+]] = arith.constant -1.000000e+00
// CHECK-DAG: [[SQR:%.+]] = arith.mulf [[ARG0]], [[ARG0]]
diff --git a/mlir/test/Dialect/Mesh/sharding-propagation.mlir b/mlir/test/Dialect/Mesh/sharding-propagation.mlir
index 270787a..11a8059 100644
--- a/mlir/test/Dialect/Mesh/sharding-propagation.mlir
+++ b/mlir/test/Dialect/Mesh/sharding-propagation.mlir
@@ -1,5 +1,6 @@
-// RUN: mlir-opt --pass-pipeline="builtin.module(func.func(sharding-propagation))" %s | FileCheck %s
+// RUN: mlir-opt --pass-pipeline="builtin.module(func.func(sharding-propagation,cse))" %s | FileCheck %s
+mesh.mesh @mesh_2(shape = 2)
mesh.mesh @mesh_1d(shape = ?)
mesh.mesh @mesh_2d(shape = 2x4)
mesh.mesh @mesh_3d(shape = ?x?x?)
@@ -73,12 +74,11 @@ func.func @arrow_structure(%arg0: tensor<8x16xf32>) -> (tensor<8x16xf32>, tensor
// CHECK-NEXT: %[[V5:.*]] = tosa.abs %[[V4]]
// CHECK-NEXT: %[[V6:.*]] = mesh.shard %[[V5]] to <@mesh_2d, {{\[\[}}0], [1]]> : tensor<8x16xf32>
%1 = tosa.abs %0 : (tensor<8x16xf32>) -> tensor<8x16xf32>
- // CHECK-NEXT: %[[V7:.*]] = mesh.shard %[[V3]] to <@mesh_2d, {{\[\[}}0], [1]]> annotate_for_users : tensor<8x16xf32>
- // CHECK-NEXT: %[[V8:.*]] = tosa.negate %[[V7]]
- // CHECK-NEXT: %[[V9:.*]] = mesh.shard %[[V8]] to <@mesh_2d, {{\[\[}}0], [1]]> : tensor<8x16xf32>
+ // CHECK-NEXT: %[[V7:.*]] = tosa.negate %[[V4]]
+ // CHECK-NEXT: %[[V8:.*]] = mesh.shard %[[V7]] to <@mesh_2d, {{\[\[}}0], [1]]> : tensor<8x16xf32>
%2 = tosa.negate %0 : (tensor<8x16xf32>) -> tensor<8x16xf32>
%3 = mesh.shard %2 to <@mesh_2d, [[0], [1]]> : tensor<8x16xf32>
- // CHECK-NEXT: return %[[V6]], %[[V9]]
+ // CHECK-NEXT: return %[[V6]], %[[V8]]
return %1, %3 : tensor<8x16xf32>, tensor<8x16xf32>
}
@@ -135,6 +135,34 @@ func.func @matmul_on_use_shard_m_and_duplicted_k(%arg0: tensor<2x16x8xf32>, %arg
return %2 : tensor<2x16x32xf32>
}
+// CHECK-LABEL: func.func @resolve_conflicting_annotations
+func.func @resolve_conflicting_annotations(
+ // CHECK-SAME: %[[IN1:.*]]: tensor<2x3xf32>,
+ %arg0: tensor<2x3xf32>,
+ // CHECK-SAME: %[[IN2:.*]]: tensor<3x2xf32>,
+ %arg1: tensor<3x2xf32>,
+ // CHECK-SAME: %[[OUT_DPS:.*]]: tensor<2x2xf32>
+ %out_dps: tensor<2x2xf32>
+// CHECK-SAME: ) -> tensor<2x2xf32> {
+) -> tensor<2x2xf32> {
+ // CHECK: %[[IN1_SHARDED1:.*]] = mesh.shard %[[IN1]] to <@mesh_2, {{\[\[}}0]]> : tensor<2x3xf32>
+ // CHECK: %[[IN1_SHARDED2:.*]] = mesh.shard %[[IN1_SHARDED1]] to <@mesh_2, {{\[}}]> annotate_for_users : tensor<2x3xf32>
+ // CHECK: %[[IN2_SHARDED:.*]] = mesh.shard %[[IN2]] to <@mesh_2, []> annotate_for_users : tensor<3x2xf32>
+ // CHECK: %[[OUT_DPS_SHARDED:.*]] = mesh.shard %[[OUT_DPS]] to <@mesh_2, {{\[}}]> annotate_for_users : tensor<2x2xf32>
+ %arg0_sharded = mesh.shard %arg0 to <@mesh_2, [[0]]> : tensor<2x3xf32>
+
+ // CHECK: %[[MATMUL:.*]] = linalg.matmul ins(%[[IN1_SHARDED2]], %[[IN2_SHARDED]] : tensor<2x3xf32>, tensor<3x2xf32>)
+ // CHECK-SAME: outs(%[[OUT_DPS_SHARDED]] : tensor<2x2xf32>) -> tensor<2x2xf32>
+ %res = linalg.matmul ins(%arg0_sharded, %arg1 : tensor<2x3xf32>, tensor<3x2xf32>)
+ outs(%out_dps : tensor<2x2xf32>) -> tensor<2x2xf32>
+
+ // CHECK: %[[MATMUL_SHARDED1:.*]] = mesh.shard %[[MATMUL]] to <@mesh_2, {{\[\[}}]]> : tensor<2x2xf32>
+ %res_sharded = mesh.shard %res to <@mesh_2, [[]]> : tensor<2x2xf32>
+
+ // CHECK: return %[[MATMUL_SHARDED1]] : tensor<2x2xf32>
+ return %res_sharded : tensor<2x2xf32>
+}
+
// https://arxiv.org/abs/2211.05102 Figure 2(a)
// CHECK-LABEL: func.func @mlp_1d_weight_stationary
// CHECK-SAME: %[[ARG0:.*]]: tensor<2x4x8xf32>, %[[ARG1:.*]]: tensor<2x8x32xf32>, %[[ARG2:.*]]: tensor<2x32x8xf32>
diff --git a/mlir/test/Dialect/Mesh/spmdization.mlir b/mlir/test/Dialect/Mesh/spmdization.mlir
index 2df247a..d7a1e2f 100644
--- a/mlir/test/Dialect/Mesh/spmdization.mlir
+++ b/mlir/test/Dialect/Mesh/spmdization.mlir
@@ -16,6 +16,21 @@ func.func @full_replication(
return %1 : tensor<2xi8>
}
+// CHECK-LABEL: func @sharding_triplet
+func.func @sharding_triplet(
+ // CHECK-SAME: %[[ARG:.*]]: tensor<1xf32>
+ %arg0: tensor<2xf32>
+// CHECK-SAME: ) -> tensor<2xf32> {
+) -> tensor<2xf32> {
+ // CHECK: %[[ALL_GATHER:.*]] = mesh.all_gather %[[ARG]] on @mesh_1d mesh_axes = [0] gather_axis = 0 : tensor<1xf32> -> tensor<2xf32>
+ %sharding_annotated = mesh.shard %arg0 to <@mesh_1d, [[0]]> : tensor<2xf32>
+ %sharding_annotated_0 = mesh.shard %sharding_annotated to <@mesh_1d, [[0]]> annotate_for_users : tensor<2xf32>
+ %sharding_annotated_1 = mesh.shard %sharding_annotated_0 to <@mesh_1d, [[]]> : tensor<2xf32>
+ // CHECK: return %[[ALL_GATHER]] : tensor<2xf32>
+ return %sharding_annotated_1 : tensor<2xf32>
+}
+
+
// CHECK-LABEL: func @move_split_axis
func.func @move_split_axis(
// CHECK-SAME: %[[ARG:.*]]: tensor<1x2xi8>
diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir
index db016fe..115d164 100644
--- a/mlir/test/Dialect/OpenMP/invalid.mlir
+++ b/mlir/test/Dialect/OpenMP/invalid.mlir
@@ -648,7 +648,6 @@ func.func @foo(%lb : index, %ub : index, %step : index) {
omp.wsloop reduction(@foo %0 -> %prv : !llvm.ptr) {
omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) {
%2 = arith.constant 2.0 : f32
- omp.reduction %2, %1 : f32, !llvm.ptr
omp.yield
}
omp.terminator
@@ -678,7 +677,6 @@ func.func @foo(%lb : index, %ub : index, %step : index) {
omp.wsloop reduction(@add_f32 %0 -> %prv : !llvm.ptr, @add_f32 %0 -> %prv1 : !llvm.ptr) {
omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) {
%2 = arith.constant 2.0 : f32
- omp.reduction %2, %0 : f32, !llvm.ptr
omp.yield
}
omp.terminator
@@ -713,7 +711,6 @@ func.func @foo(%lb : index, %ub : index, %step : index, %mem : memref<1xf32>) {
omp.wsloop reduction(@add_f32 %mem -> %prv : memref<1xf32>) {
omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) {
%2 = arith.constant 2.0 : f32
- omp.reduction %2, %mem : f32, memref<1xf32>
omp.yield
}
omp.terminator
diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir
index 0d5fd93..caf25a3 100644
--- a/mlir/test/Dialect/OpenMP/ops.mlir
+++ b/mlir/test/Dialect/OpenMP/ops.mlir
@@ -1003,8 +1003,6 @@ func.func @omp_teams(%lb : i32, %ub : i32, %if_cond : i1, %num_threads : i32,
// CHECK: omp.teams reduction(@add_f32 -> %{{.+}} : !llvm.ptr) {
omp.teams reduction(@add_f32 -> %0 : !llvm.ptr) {
%1 = arith.constant 2.0 : f32
- // CHECK: omp.reduction %{{.+}}, %{{.+}}
- omp.reduction %1, %0 : f32, !llvm.ptr
// CHECK: omp.terminator
omp.terminator
}
@@ -1028,15 +1026,11 @@ func.func @sections_reduction() {
// CHECK: omp.section
omp.section {
%1 = arith.constant 2.0 : f32
- // CHECK: omp.reduction %{{.+}}, %{{.+}}
- omp.reduction %1, %0 : f32, !llvm.ptr
omp.terminator
}
// CHECK: omp.section
omp.section {
%1 = arith.constant 3.0 : f32
- // CHECK: omp.reduction %{{.+}}, %{{.+}}
- omp.reduction %1, %0 : f32, !llvm.ptr
omp.terminator
}
omp.terminator
@@ -1130,14 +1124,10 @@ func.func @sections_reduction2() {
omp.sections reduction(@add2_f32 -> %0 : memref<1xf32>) {
omp.section {
%1 = arith.constant 2.0 : f32
- // CHECK: omp.reduction
- omp.reduction %1, %0 : f32, memref<1xf32>
omp.terminator
}
omp.section {
%1 = arith.constant 2.0 : f32
- // CHECK: omp.reduction
- omp.reduction %1, %0 : f32, memref<1xf32>
omp.terminator
}
omp.terminator
diff --git a/mlir/test/Dialect/Polynomial/canonicalization.mlir b/mlir/test/Dialect/Polynomial/canonicalization.mlir
index dbfbf2d..489d9ec 100644
--- a/mlir/test/Dialect/Polynomial/canonicalization.mlir
+++ b/mlir/test/Dialect/Polynomial/canonicalization.mlir
@@ -43,3 +43,60 @@ func.func @test_canonicalize_sub(%poly0 : !sub_ty, %poly1 : !sub_ty) -> !sub_ty
// CHECK: [[ADD:%.+]] = polynomial.add %[[p0]], %[[p1neg]]
return %0 : !sub_ty
}
+
+// CHECK-LABEL: test_canonicalize_fold_add_through_ntt
+// CHECK: polynomial.add
+// CHECK-NOT: polynomial.ntt
+// CHECK-NOT: polynomial.intt
+func.func @test_canonicalize_fold_add_through_ntt(
+ %poly0 : !ntt_poly_ty,
+ %poly1 : !ntt_poly_ty) -> !ntt_poly_ty {
+ %0 = polynomial.ntt %poly0 : !ntt_poly_ty -> !tensor_ty
+ %1 = polynomial.ntt %poly1 : !ntt_poly_ty -> !tensor_ty
+ %a_plus_b = arith.addi %0, %1 : !tensor_ty
+ %out = polynomial.intt %a_plus_b : !tensor_ty -> !ntt_poly_ty
+ return %out : !ntt_poly_ty
+}
+
+// CHECK-LABEL: test_canonicalize_fold_add_through_intt
+// CHECK: arith.addi
+// CHECK-NOT: polynomial.intt
+// CHECK-NOT: polynomial.iintt
+func.func @test_canonicalize_fold_add_through_intt(
+ %tensor0 : !tensor_ty,
+ %tensor1 : !tensor_ty) -> !tensor_ty {
+ %0 = polynomial.intt %tensor0 : !tensor_ty -> !ntt_poly_ty
+ %1 = polynomial.intt %tensor1 : !tensor_ty -> !ntt_poly_ty
+ %a_plus_b = polynomial.add %0, %1 : !ntt_poly_ty
+ %out = polynomial.ntt %a_plus_b : !ntt_poly_ty -> !tensor_ty
+ return %out : !tensor_ty
+}
+
+// CHECK-LABEL: test_canonicalize_fold_sub_through_ntt
+// CHECK: polynomial.mul_scalar
+// CHECK: polynomial.add
+// CHECK-NOT: polynomial.ntt
+// CHECK-NOT: polynomial.intt
+func.func @test_canonicalize_fold_sub_through_ntt(
+ %poly0 : !ntt_poly_ty,
+ %poly1 : !ntt_poly_ty) -> !ntt_poly_ty {
+ %0 = polynomial.ntt %poly0 : !ntt_poly_ty -> !tensor_ty
+ %1 = polynomial.ntt %poly1 : !ntt_poly_ty -> !tensor_ty
+ %a_plus_b = arith.subi %0, %1 : !tensor_ty
+ %out = polynomial.intt %a_plus_b : !tensor_ty -> !ntt_poly_ty
+ return %out : !ntt_poly_ty
+}
+
+// CHECK-LABEL: test_canonicalize_fold_sub_through_intt
+// CHECK: arith.subi
+// CHECK-NOT: polynomial.intt
+// CHECK-NOT: polynomial.iintt
+func.func @test_canonicalize_fold_sub_through_intt(
+ %tensor0 : !tensor_ty,
+ %tensor1 : !tensor_ty) -> !tensor_ty {
+ %0 = polynomial.intt %tensor0 : !tensor_ty -> !ntt_poly_ty
+ %1 = polynomial.intt %tensor1 : !tensor_ty -> !ntt_poly_ty
+ %a_plus_b = polynomial.sub %0, %1 : !ntt_poly_ty
+ %out = polynomial.ntt %a_plus_b : !ntt_poly_ty -> !tensor_ty
+ return %out : !tensor_ty
+}
diff --git a/mlir/test/Dialect/Polynomial/ops.mlir b/mlir/test/Dialect/Polynomial/ops.mlir
index ff70996..4716e37f 100644
--- a/mlir/test/Dialect/Polynomial/ops.mlir
+++ b/mlir/test/Dialect/Polynomial/ops.mlir
@@ -74,15 +74,19 @@ module {
func.func @test_monic_monomial_mul() {
%five = arith.constant 5 : index
- %0 = polynomial.constant {value=#one_plus_x_squared} : !polynomial.polynomial<ring=#ring1>
+ %0 = polynomial.constant int<1 + x**2> : !polynomial.polynomial<ring=#ring1>
%1 = polynomial.monic_monomial_mul %0, %five : (!polynomial.polynomial<ring=#ring1>, index) -> !polynomial.polynomial<ring=#ring1>
return
}
func.func @test_constant() {
- %0 = polynomial.constant {value=#one_plus_x_squared} : !polynomial.polynomial<ring=#ring1>
- %1 = polynomial.constant {value=#polynomial.int_polynomial<1 + x**2>} : !polynomial.polynomial<ring=#ring1>
- %2 = polynomial.constant {value=#polynomial.float_polynomial<1.5 + 0.5 x**2>} : !polynomial.polynomial<ring=#ring2>
+ %0 = polynomial.constant int<1 + x**2> : !polynomial.polynomial<ring=#ring1>
+ %1 = polynomial.constant int<1 + x**2> : !polynomial.polynomial<ring=#ring1>
+ %2 = polynomial.constant float<1.5 + 0.5 x**2> : !polynomial.polynomial<ring=#ring2>
+
+ // Test verbose fallbacks
+ %verb0 = polynomial.constant #polynomial.typed_int_polynomial<1 + x**2> : !polynomial.polynomial<ring=#ring1>
+ %verb2 = polynomial.constant #polynomial.typed_float_polynomial<1.5 + 0.5 x**2> : !polynomial.polynomial<ring=#ring2>
return
}
diff --git a/mlir/test/Dialect/SCF/transform-ops.mlir b/mlir/test/Dialect/SCF/transform-ops.mlir
index f4b0db7..a4daa86 100644
--- a/mlir/test/Dialect/SCF/transform-ops.mlir
+++ b/mlir/test/Dialect/SCF/transform-ops.mlir
@@ -6,11 +6,11 @@
// CHECK: scf.for
// CHECK: arith.addi
//
-// CHECK: func @foo[[SUFFIX:.+]](%{{.+}}, %{{.+}}, %{{.+}})
+// CHECK: func @foo[[$SUFFIX:.+]](%{{.+}}, %{{.+}}, %{{.+}})
// CHECK: scf.for
// CHECK: arith.addi
//
-// CHECK-LABEL @loop_outline_op
+// CHECK-LABEL: @loop_outline_op
func.func @loop_outline_op(%arg0: index, %arg1: index, %arg2: index) {
// CHECK: scf.for
// CHECK-NOT: scf.for
@@ -23,7 +23,7 @@ func.func @loop_outline_op(%arg0: index, %arg1: index, %arg2: index) {
}
// CHECK: scf.execute_region
// CHECK-NOT: scf.for
- // CHECK: func.call @foo[[SUFFIX]]
+ // CHECK: func.call @foo[[$SUFFIX]]
scf.for %j = %arg0 to %arg1 step %arg2 {
arith.addi %j, %j : index
}
diff --git a/mlir/test/Dialect/SPIRV/IR/logical-ops.mlir b/mlir/test/Dialect/SPIRV/IR/logical-ops.mlir
index 7dc0bd9..5c24f0e 100644
--- a/mlir/test/Dialect/SPIRV/IR/logical-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/logical-ops.mlir
@@ -180,7 +180,7 @@ func.func @logicalUnary(%arg0 : i32)
func.func @select_op_bool(%arg0: i1) -> () {
%0 = spirv.Constant true
%1 = spirv.Constant false
- // CHECK : spirv.Select {{%.*}}, {{%.*}}, {{%.*}} : i1, i1
+ // CHECK: spirv.Select {{%.*}}, {{%.*}}, {{%.*}} : i1, i1
%2 = spirv.Select %arg0, %0, %1 : i1, i1
return
}
@@ -188,7 +188,7 @@ func.func @select_op_bool(%arg0: i1) -> () {
func.func @select_op_int(%arg0: i1) -> () {
%0 = spirv.Constant 2 : i32
%1 = spirv.Constant 3 : i32
- // CHECK : spirv.Select {{%.*}}, {{%.*}}, {{%.*}} : i1, i32
+ // CHECK: spirv.Select {{%.*}}, {{%.*}}, {{%.*}} : i1, i32
%2 = spirv.Select %arg0, %0, %1 : i1, i32
return
}
@@ -196,7 +196,7 @@ func.func @select_op_int(%arg0: i1) -> () {
func.func @select_op_float(%arg0: i1) -> () {
%0 = spirv.Constant 2.0 : f32
%1 = spirv.Constant 3.0 : f32
- // CHECK : spirv.Select {{%.*}}, {{%.*}}, {{%.*}} : i1, f32
+ // CHECK: spirv.Select {{%.*}}, {{%.*}}, {{%.*}} : i1, f32
%2 = spirv.Select %arg0, %0, %1 : i1, f32
return
}
@@ -204,7 +204,7 @@ func.func @select_op_float(%arg0: i1) -> () {
func.func @select_op_ptr(%arg0: i1) -> () {
%0 = spirv.Variable : !spirv.ptr<f32, Function>
%1 = spirv.Variable : !spirv.ptr<f32, Function>
- // CHECK : spirv.Select {{%.*}}, {{%.*}}, {{%.*}} : i1, !spirv.ptr<f32, Function>
+ // CHECK: spirv.Select {{%.*}}, {{%.*}}, {{%.*}} : i1, !spirv.ptr<f32, Function>
%2 = spirv.Select %arg0, %0, %1 : i1, !spirv.ptr<f32, Function>
return
}
@@ -212,7 +212,7 @@ func.func @select_op_ptr(%arg0: i1) -> () {
func.func @select_op_vec(%arg0: i1) -> () {
%0 = spirv.Constant dense<[2.0, 3.0, 4.0]> : vector<3xf32>
%1 = spirv.Constant dense<[5.0, 6.0, 7.0]> : vector<3xf32>
- // CHECK : spirv.Select {{%.*}}, {{%.*}}, {{%.*}} : i1, vector<3xf32>
+ // CHECK: spirv.Select {{%.*}}, {{%.*}}, {{%.*}} : i1, vector<3xf32>
%2 = spirv.Select %arg0, %0, %1 : i1, vector<3xf32>
return
}
@@ -220,7 +220,7 @@ func.func @select_op_vec(%arg0: i1) -> () {
func.func @select_op_vec_condn_vec(%arg0: vector<3xi1>) -> () {
%0 = spirv.Constant dense<[2.0, 3.0, 4.0]> : vector<3xf32>
%1 = spirv.Constant dense<[5.0, 6.0, 7.0]> : vector<3xf32>
- // CHECK : spirv.Select {{%.*}}, {{%.*}}, {{%.*}} : vector<3xi1>, vector<3xf32>
+ // CHECK: spirv.Select {{%.*}}, {{%.*}}, {{%.*}} : vector<3xi1>, vector<3xf32>
%2 = spirv.Select %arg0, %0, %1 : vector<3xi1>, vector<3xf32>
return
}
diff --git a/mlir/test/Dialect/SPIRV/IR/structure-ops.mlir b/mlir/test/Dialect/SPIRV/IR/structure-ops.mlir
index db0f52d..1eed589 100644
--- a/mlir/test/Dialect/SPIRV/IR/structure-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/structure-ops.mlir
@@ -330,7 +330,7 @@ spirv.module Logical GLSL450 {
// TODO: Fix test case after initialization with normal constant is addressed
// spirv.module Logical GLSL450 {
// %0 = spirv.Constant 4.0 : f32
-// // CHECK1: spirv.Variable init(%0) : !spirv.ptr<f32, Private>
+// COM: CHECK: spirv.Variable init(%0) : !spirv.ptr<f32, Private>
// spirv.GlobalVariable @var1 init(%0) : !spirv.ptr<f32, Private>
// }
@@ -372,7 +372,7 @@ spirv.module Logical GLSL450 {
// TODO: Fix test case after initialization with constant is addressed
// spirv.module Logical GLSL450 {
// %0 = spirv.Constant 4.0 : f32
-// // CHECK1: spirv.GlobalVariable @var1 initializer(%0) {binding = 5 : i32} : !spirv.ptr<f32, Private>
+// COM: CHECK: spirv.GlobalVariable @var1 initializer(%0) {binding = 5 : i32} : !spirv.ptr<f32, Private>
// spirv.GlobalVariable @var1 initializer(%0) {binding = 5 : i32} : !spirv.ptr<f32, Private>
// }
diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir
index 914e5e8..f7fbd38 100644
--- a/mlir/test/Dialect/Tensor/canonicalize.mlir
+++ b/mlir/test/Dialect/Tensor/canonicalize.mlir
@@ -2523,4 +2523,3 @@ func.func @dim_out_of_bounds() -> vector<7xi32> {
%16 = affine.vector_load %alloc_21[%c1, %c1, %dim] : memref<?x26x2xi32>, vector<7xi32>
return %16 : vector<7xi32>
}
-
diff --git a/mlir/test/Dialect/Tensor/fold-empty-op.mlir b/mlir/test/Dialect/Tensor/fold-empty-op.mlir
index e200a4f..e94f6ec 100644
--- a/mlir/test/Dialect/Tensor/fold-empty-op.mlir
+++ b/mlir/test/Dialect/Tensor/fold-empty-op.mlir
@@ -64,6 +64,79 @@ func.func @rank_reducing_empty_tensor_extract(%sz : index, %idx : index) -> tens
return %r: tensor<2xf32>
}
+func.func @pack_empty(%arg0: tensor<8x8x32x32xf32>) -> tensor<8x8x32x32xf32> {
+ %empty_unpacked = tensor.empty() : tensor<256x256xf32>
+ %packed = tensor.pack %empty_unpacked
+ inner_dims_pos = [0, 1] inner_tiles = [32, 32]
+ into %arg0 : tensor<256x256xf32> -> tensor<8x8x32x32xf32>
+ return %packed : tensor<8x8x32x32xf32>
+}
+
+// CHECK-LABEL: func.func @pack_empty(
+// CHECK-SAME: %[[T:.+]]: tensor<8x8x32x32xf32>
+// CHECK-NOT: tensor.pack
+// CHECK: return %[[T]] : tensor<8x8x32x32xf32>
+
+func.func @pack_empty_dynamic(%arg0: tensor<?x?x?x?xf32>, %dim0: index, %dim1: index) -> tensor<?x?x?x?xf32> {
+ %empty_unpacked = tensor.empty(%dim0, %dim1) : tensor<?x?xf32>
+ %packed = tensor.pack %empty_unpacked
+ inner_dims_pos = [0, 1] inner_tiles = [32, 32]
+ into %arg0 : tensor<?x?xf32> -> tensor<?x?x?x?xf32>
+ return %packed : tensor<?x?x?x?xf32>
+}
+
+// CHECK-LABEL: func.func @pack_empty_dynamic(
+// CHECK-SAME: %[[T:.+]]: tensor<?x?x?x?xf32>,
+// CHECK-SAME: %[[DIM0:[a-zA-Z0-9_]+]]: index,
+// CHECK-SAME: %[[DIM1:[a-zA-Z0-9_]+]]: index
+// CHECK-NOT: tensor.pack
+// CHECK: return %[[T]] : tensor<?x?x?x?xf32>
+
+func.func @unpack_empty(%arg0: tensor<256x256xf32>) -> tensor<256x256xf32> {
+ %empty_packed = tensor.empty() : tensor<8x8x32x32xf32>
+ %unpacked = tensor.unpack %empty_packed
+ inner_dims_pos = [0, 1] inner_tiles = [32, 32]
+ into %arg0 : tensor<8x8x32x32xf32> -> tensor<256x256xf32>
+ return %unpacked : tensor<256x256xf32>
+}
+
+// CHECK-LABEL: func.func @unpack_empty(
+// CHECK-SAME: %[[T:.+]]: tensor<256x256xf32>
+// CHECK-NOT: tensor.unpack
+// CHECK: return %[[T]] : tensor<256x256xf32>
+
+func.func @unpack_empty_dynamic(%arg0: tensor<?x?xf32>, %dim0: index, %dim1: index, %dim2: index, %dim3: index) -> tensor<?x?xf32> {
+ %empty_packed = tensor.empty(%dim0, %dim1, %dim2, %dim3) : tensor<?x?x?x?xf32>
+ %unpacked = tensor.unpack %empty_packed
+ inner_dims_pos = [0, 1] inner_tiles = [32, 32]
+ into %arg0 : tensor<?x?x?x?xf32> -> tensor<?x?xf32>
+ return %unpacked : tensor<?x?xf32>
+}
+
+// CHECK-LABEL: func.func @unpack_empty_dynamic(
+// CHECK-SAME: %[[T:.+]]: tensor<?x?xf32>,
+// CHECK-SAME: %[[DIM0:[a-zA-Z0-9_]+]]: index,
+// CHECK-SAME: %[[DIM1:[a-zA-Z0-9_]+]]: index,
+// CHECK-SAME: %[[DIM2:[a-zA-Z0-9_]+]]: index,
+// CHECK-SAME: %[[DIM3:[a-zA-Z0-9_]+]]: index
+// CHECK-NOT: tensor.unpack
+// CHECK: return %[[T]] : tensor<?x?xf32>
+
+func.func @pack_padded_empty(%arg0: tensor<8x8x32x32xf32>) -> tensor<8x8x32x32xf32> {
+ %pad = arith.constant 1.0 : f32
+ %empty_unpacked = tensor.empty() : tensor<256x256xf32>
+ %packed = tensor.pack %empty_unpacked
+ padding_value(%pad : f32)
+ inner_dims_pos = [0, 1] inner_tiles = [32, 32]
+ into %arg0 : tensor<256x256xf32> -> tensor<8x8x32x32xf32>
+ return %packed : tensor<8x8x32x32xf32>
+}
+
+// CHECK-LABEL: func.func @pack_padded_empty(
+// CHECK-SAME: %[[T:.+]]: tensor<8x8x32x32xf32>
+// CHECK: %[[PACK:.+]] = tensor.pack
+// CHECK: return %[[PACK]] : tensor<8x8x32x32xf32>
+
// -----
module attributes {transform.with_named_sequence} {
diff --git a/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir b/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir
index 9f486f9..9a3143f5 100644
--- a/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir
+++ b/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir
@@ -544,7 +544,7 @@ func.func @linalg_transpose_tensor_unpack_fold(%arg0: tensor<1x1x4x16xi32>) -> t
// CHECK-SAME: outer_dims_perm = [1, 0]
// CHECK-SAME: inner_dims_pos = [1, 0]
// CHECK-SAME: inner_tiles = [4, 16]
-// CHEKC-SAME: into %[[OUT]] : tensor<1x1x4x16xi32> -> tensor<16x4xi32>
+// CHECK-SAME: into %[[OUT]] : tensor<1x1x4x16xi32> -> tensor<16x4xi32>
// CHECK: return %[[UNPACK]] : tensor<16x4xi32>
// CHECK: }
diff --git a/mlir/test/Dialect/Tensor/fold-reassociative-reshapes.mlir b/mlir/test/Dialect/Tensor/fold-reassociative-reshapes.mlir
index d3ac6ce..644d9a9 100644
--- a/mlir/test/Dialect/Tensor/fold-reassociative-reshapes.mlir
+++ b/mlir/test/Dialect/Tensor/fold-reassociative-reshapes.mlir
@@ -54,3 +54,105 @@ func.func @rank_reducing_parallel_insert_of_collapse_shape(
}
return %1 : tensor<?x?x?x?xf32>
}
+
+// -----
+
+// CHECK-LABEL: func @insert_of_padding_expand_shape(
+// CHECK-SAME: %[[t:.*]]: tensor<?x?xf32>
+// CHECK-SAME: %[[d:.*]]: tensor<?x?x?x?xf32>
+// CHECK-SAME: %[[x:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[y:[a-zA-Z0-9_]+]]: index
+// CHECK: %[[insert:.*]] = tensor.insert_slice %[[t]] into %[[d]][%[[x]], %[[y]], 0, 0] [1, %{{.*}}, 1, %{{.*}}] [1, 1, 1, 1] : tensor<?x?xf32> into tensor<?x?x?x?xf32>
+// CHECK: return %[[insert]]
+func.func @insert_of_padding_expand_shape(
+ %t: tensor<?x?xf32>, %d: tensor<?x?x?x?xf32>, %x: index, %y: index)
+ -> tensor<?x?x?x?xf32> {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %sz0 = tensor.dim %t, %c0 : tensor<?x?xf32>
+ %sz1 = tensor.dim %t, %c1 : tensor<?x?xf32>
+ %0 = tensor.expand_shape %t [[0, 1], [2, 3]] output_shape [1, %sz0, 1, %sz1]
+ : tensor<?x?xf32> into tensor<1x?x1x?xf32>
+ %1 = tensor.insert_slice %0 into %d[%x, %y, 0, 0][1, %sz0, 1, %sz1][1, 1, 1, 1]
+ : tensor<1x?x1x?xf32> into tensor<?x?x?x?xf32>
+ return %1 : tensor<?x?x?x?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @insert_of_non_padding_expand_shape(
+// CHECK-SAME: %[[t:.*]]: tensor<?x?xf32>
+// CHECK-SAME: %[[d:.*]]: tensor<?x?x?x?xf32>
+// CHECK-SAME: %[[x:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[y:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[sz:[a-zA-Z0-9_]+]]: index
+// CHECK: %[[expand:.*]] = tensor.expand_shape %[[t]] {{\[}}[0, 1], [2]] output_shape [%[[sz]], %{{.*}}, %{{.*}}] : tensor<?x?xf32> into tensor<?x?x?xf32>
+// CHECK: %[[insert:.*]] = tensor.insert_slice %[[expand]] into %[[d]][%[[x]], %[[y]], 0, 0] [%[[sz]], 1, %{{.*}}, %{{.*}}] [1, 1, 1, 1] : tensor<?x?x?xf32> into tensor<?x?x?x?xf32>
+// CHECK: return %[[insert]]
+func.func @insert_of_non_padding_expand_shape(
+ %t: tensor<?x?xf32>, %d: tensor<?x?x?x?xf32>, %x: index, %y: index, %sz: index)
+ -> tensor<?x?x?x?xf32> {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %sz0 = tensor.dim %t, %c0 : tensor<?x?xf32>
+ %sz1 = tensor.dim %t, %c1 : tensor<?x?xf32>
+ %0 = tensor.expand_shape %t [[0, 1], [2]] output_shape [%sz, %sz0, %sz1]
+ : tensor<?x?xf32> into tensor<?x?x?xf32>
+ %1 = tensor.insert_slice %0 into %d[%x, %y, 0, 0][%sz, 1, %sz0, %sz1][1, 1, 1, 1]
+ : tensor<?x?x?xf32> into tensor<?x?x?x?xf32>
+ return %1 : tensor<?x?x?x?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @parallel_insert_of_padding_expand_shape(
+// CHECK-SAME: %[[t:.*]]: tensor<?x?xf32>
+// CHECK-SAME: %[[d:.*]]: tensor<?x?x?x?xf32>
+// CHECK-SAME: %[[x:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[y:[a-zA-Z0-9_]+]]: index
+// CHECK: tensor.parallel_insert_slice %[[t]] into %{{.*}}[%{{.*}}, %{{.*}}, 0, 0] [1, %{{.*}}, 1, %{{.*}}] [1, 1, 1, 1] : tensor<?x?xf32> into tensor<?x?x?x?xf32>
+func.func @parallel_insert_of_padding_expand_shape(
+ %t: tensor<?x?xf32>, %d: tensor<?x?x?x?xf32>, %x: index, %y: index)
+ -> tensor<?x?x?x?xf32> {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %sz0 = tensor.dim %t, %c0 : tensor<?x?xf32>
+ %sz1 = tensor.dim %t, %c1 : tensor<?x?xf32>
+ %0 = tensor.expand_shape %t [[0, 1], [2, 3]] output_shape [1, %sz0, 1, %sz1]
+ : tensor<?x?xf32> into tensor<1x?x1x?xf32>
+ %1 = scf.forall (%i, %j) in (%x, %y) shared_outs(%o = %d) -> (tensor<?x?x?x?xf32>) {
+ scf.forall.in_parallel {
+ tensor.parallel_insert_slice %0 into %o[%i, %j, 0, 0][1, %sz0, 1, %sz1][1, 1, 1, 1]
+ : tensor<1x?x1x?xf32> into tensor<?x?x?x?xf32>
+ }
+ }
+ return %1 : tensor<?x?x?x?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @parallel_insert_of_non_padding_expand_shape(
+// CHECK-SAME: %[[t:.*]]: tensor<?x?xf32>
+// CHECK-SAME: %[[d:.*]]: tensor<?x?x?x?xf32>
+// CHECK-SAME: %[[x:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[y:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[sz:[a-zA-Z0-9_]+]]: index
+// CHECK: %[[expand:.*]] = tensor.expand_shape %[[t]] {{\[}}[0, 1], [2]] output_shape [%[[sz]], %{{.*}}, %{{.*}}] : tensor<?x?xf32> into tensor<?x?x?xf32>
+// CHECK: tensor.parallel_insert_slice %[[expand]] into %{{.*}}[%{{.*}}, %{{.*}}, 0, 0] [%[[sz]], 1, %{{.*}}, %{{.*}}] [1, 1, 1, 1] : tensor<?x?x?xf32> into tensor<?x?x?x?xf32>
+func.func @parallel_insert_of_non_padding_expand_shape(
+ %t: tensor<?x?xf32>, %d: tensor<?x?x?x?xf32>, %x: index, %y: index, %sz: index)
+ -> tensor<?x?x?x?xf32> {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %sz0 = tensor.dim %t, %c0 : tensor<?x?xf32>
+ %sz1 = tensor.dim %t, %c1 : tensor<?x?xf32>
+ %0 = tensor.expand_shape %t [[0, 1], [2]] output_shape [%sz, %sz0, %sz1]
+ : tensor<?x?xf32> into tensor<?x?x?xf32>
+ %1 = scf.forall (%i, %j) in (%x, %y) shared_outs(%o = %d) -> (tensor<?x?x?x?xf32>) {
+ scf.forall.in_parallel {
+ tensor.parallel_insert_slice %0 into %o[%i, %j, 0, 0][%sz, 1, %sz0, %sz1][1, 1, 1, 1]
+ : tensor<?x?x?xf32> into tensor<?x?x?x?xf32>
+ }
+ }
+ return %1 : tensor<?x?x?x?xf32>
+}
diff --git a/mlir/test/Dialect/Tensor/simplify-pack-unpack.mlir b/mlir/test/Dialect/Tensor/simplify-pack-unpack.mlir
index 5a2eade..f9e51ae 100644
--- a/mlir/test/Dialect/Tensor/simplify-pack-unpack.mlir
+++ b/mlir/test/Dialect/Tensor/simplify-pack-unpack.mlir
@@ -266,3 +266,131 @@ func.func @unpack_16x1x1x2_to_32x1(%arg0 : tensor<16x1x1x2xf32>) -> tensor<32x1x
: tensor<16x1x1x2xf32> -> tensor<32x1xf32>
return %unpack : tensor<32x1xf32>
}
+
+// -----
+
+// CHECK-LABEL: func.func @pad_like_pack(
+// CHECK-SAME: %[[ARG0:.+]]: tensor<32x64xf32>)
+// CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1, 2], [3]] output_shape [1, 1, 32, 64] : tensor<32x64xf32> into tensor<1x1x32x64xf32>
+// CHECK: return %[[EXPANDED]] : tensor<1x1x32x64xf32>
+func.func @pad_like_pack(%arg0: tensor<32x64xf32>) -> tensor<1x1x32x64xf32> {
+ %empty = tensor.empty() : tensor<1x1x32x64xf32>
+ %0 = tensor.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %empty : tensor<32x64xf32> -> tensor<1x1x32x64xf32>
+ return %0 : tensor<1x1x32x64xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func.func @pad_like_pack_with_outer_dims_perm(
+// CHECK-SAME: %[[ARG0:.+]]: tensor<32x64xf32>)
+// CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1, 2], [3]] output_shape [1, 1, 32, 64] : tensor<32x64xf32> into tensor<1x1x32x64xf32>
+// CHECK: return %[[EXPANDED]] : tensor<1x1x32x64xf32>
+func.func @pad_like_pack_with_outer_dims_perm(%arg0: tensor<32x64xf32>) -> tensor<1x1x32x64xf32> {
+ %empty = tensor.empty() : tensor<1x1x32x64xf32>
+ %0 = tensor.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %empty : tensor<32x64xf32> -> tensor<1x1x32x64xf32>
+ return %0 : tensor<1x1x32x64xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func.func @inner_pad_like_pack(
+// CHECK-SAME: %[[ARG0:.+]]: tensor<32x64xf32>)
+// CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0], [1, 2]] output_shape [32, 1, 64] : tensor<32x64xf32> into tensor<32x1x64xf32>
+// CHECK: return %[[EXPANDED]] : tensor<32x1x64xf32>
+func.func @inner_pad_like_pack(%arg0: tensor<32x64xf32>) -> tensor<32x1x64xf32> {
+ %empty = tensor.empty() : tensor<32x1x64xf32>
+ %0 = tensor.pack %arg0 inner_dims_pos = [1] inner_tiles = [64] into %empty : tensor<32x64xf32> -> tensor<32x1x64xf32>
+ return %0 : tensor<32x1x64xf32>
+}
+
+// -----
+
+// Do not simplify pack with inner dimension shuffling.
+// CHECK-LABEL: func.func @pad_and_inner_dim_shuffle_pack(
+// CHECK-SAME: %[[ARG0:.+]]: tensor<32x64xf32>)
+// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<1x1x64x32xf32>
+// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] inner_dims_pos = [1, 0] inner_tiles = [64, 32] into %[[EMPTY]] : tensor<32x64xf32> -> tensor<1x1x64x32xf32>
+// CHECK: return %[[PACK]] : tensor<1x1x64x32xf32>
+func.func @pad_and_inner_dim_shuffle_pack(%arg0: tensor<32x64xf32>) -> tensor<1x1x64x32xf32> {
+ %empty = tensor.empty() : tensor<1x1x64x32xf32>
+ %0 = tensor.pack %arg0 inner_dims_pos = [1, 0] inner_tiles = [64, 32] into %empty : tensor<32x64xf32> -> tensor<1x1x64x32xf32>
+ return %0 : tensor<1x1x64x32xf32>
+}
+
+// -----
+
+// Do not simplify pack with inner dimension transpose.
+// CHECK-LABEL: func.func @pad_like_pack_with_transpose(
+// CHECK-SAME: %[[ARG0:.+]]: tensor<32x64x16xf32>)
+// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<32x1x16x64xf32>
+// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] inner_dims_pos = [1] inner_tiles = [64] into %[[EMPTY]] : tensor<32x64x16xf32> -> tensor<32x1x16x64xf32>
+// CHECK: return %[[PACK]] : tensor<32x1x16x64xf32>
+func.func @pad_like_pack_with_transpose(%arg0: tensor<32x64x16xf32>) -> tensor<32x1x16x64xf32> {
+ %empty = tensor.empty() : tensor<32x1x16x64xf32>
+ %0 = tensor.pack %arg0 inner_dims_pos = [1] inner_tiles = [64] into %empty : tensor<32x64x16xf32> -> tensor<32x1x16x64xf32>
+ return %0 : tensor<32x1x16x64xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func.func @unpad_like_unpack(
+// CHECK-SAME: %[[ARG0:.+]]: tensor<1x1x32x64xf32>)
+// CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[ARG0]] {{\[}}[0, 1, 2], [3]] : tensor<1x1x32x64xf32> into tensor<32x64xf32>
+// CHECK: return %[[COLLAPSED]] : tensor<32x64xf32>
+func.func @unpad_like_unpack(%arg0: tensor<1x1x32x64xf32>) -> tensor<32x64xf32> {
+ %empty = tensor.empty() : tensor<32x64xf32>
+ %0 = tensor.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %empty : tensor<1x1x32x64xf32> -> tensor<32x64xf32>
+ return %0 : tensor<32x64xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func.func @unpad_like_unpack_with_outer_dims_perm(
+// CHECK-SAME: %[[ARG0:.+]]: tensor<1x1x32x64xf32>)
+// CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[ARG0]] {{\[}}[0, 1, 2], [3]] : tensor<1x1x32x64xf32> into tensor<32x64xf32>
+// CHECK: return %[[COLLAPSED]] : tensor<32x64xf32>
+func.func @unpad_like_unpack_with_outer_dims_perm(%arg0: tensor<1x1x32x64xf32>) -> tensor<32x64xf32> {
+ %empty = tensor.empty() : tensor<32x64xf32>
+ %0 = tensor.unpack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %empty : tensor<1x1x32x64xf32> -> tensor<32x64xf32>
+ return %0 : tensor<32x64xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func.func @inner_unpad_like_unpack(
+// CHECK-SAME: %[[ARG0:.+]]: tensor<32x1x64xf32>)
+// CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[ARG0]] {{\[}}[0], [1, 2]] : tensor<32x1x64xf32> into tensor<32x64xf32>
+// CHECK: return %[[COLLAPSED]] : tensor<32x64xf32>
+func.func @inner_unpad_like_unpack(%arg0: tensor<32x1x64xf32>) -> tensor<32x64xf32> {
+ %empty = tensor.empty() : tensor<32x64xf32>
+ %0 = tensor.unpack %arg0 inner_dims_pos = [1] inner_tiles = [64] into %empty : tensor<32x1x64xf32> -> tensor<32x64xf32>
+ return %0 : tensor<32x64xf32>
+}
+
+// -----
+
+// Do not simplify unpack with inner dimension shuffling.
+// CHECK-LABEL: func.func @unpad_and_inner_dim_shuffle_pack(
+// CHECK-SAME: %[[ARG0:.+]]: tensor<1x1x32x64xf32>)
+// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<64x32xf32>
+// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] inner_dims_pos = [1, 0] inner_tiles = [32, 64] into %[[EMPTY]] : tensor<1x1x32x64xf32> -> tensor<64x32xf32>
+// CHECK: return %[[UNPACK]] : tensor<64x32xf32>
+func.func @unpad_and_inner_dim_shuffle_pack(%arg0: tensor<1x1x32x64xf32>) -> tensor<64x32xf32> {
+ %empty = tensor.empty() : tensor<64x32xf32>
+ %0 = tensor.unpack %arg0 inner_dims_pos = [1, 0] inner_tiles = [32, 64] into %empty : tensor<1x1x32x64xf32> -> tensor<64x32xf32>
+ return %0 : tensor<64x32xf32>
+}
+
+// -----
+
+// Do not simplify unpack with inner dimension transpose.
+// CHECK-LABEL: func.func @unpad_like_unpack_with_transpose(
+// CHECK-SAME: %[[ARG0:.+]]: tensor<32x1x16x64xf32>)
+// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<32x64x16xf32>
+// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] inner_dims_pos = [1] inner_tiles = [64] into %[[EMPTY]] : tensor<32x1x16x64xf32> -> tensor<32x64x16xf32>
+// CHECK: return %[[UNPACK]] : tensor<32x64x16xf32>
+func.func @unpad_like_unpack_with_transpose(%arg0: tensor<32x1x16x64xf32>) -> tensor<32x64x16xf32> {
+ %empty = tensor.empty() : tensor<32x64x16xf32>
+ %0 = tensor.unpack %arg0 inner_dims_pos = [1] inner_tiles = [64] into %empty : tensor<32x1x16x64xf32> -> tensor<32x64x16xf32>
+ return %0 : tensor<32x64x16xf32>
+}
diff --git a/mlir/test/Dialect/Vector/invalid.mlir b/mlir/test/Dialect/Vector/invalid.mlir
index c9f7e9c..1516f51f 100644
--- a/mlir/test/Dialect/Vector/invalid.mlir
+++ b/mlir/test/Dialect/Vector/invalid.mlir
@@ -1798,3 +1798,59 @@ func.func @invalid_outerproduct1(%src : memref<?xf32>) {
// expected-error @+1 {{'vector.outerproduct' op expected 1-d vector for operand #1}}
%op = vector.outerproduct %0, %1 : vector<[4]x[4]xf32>, vector<[4]xf32>
}
+
+// -----
+
+func.func @deinterleave_zero_dim_fail(%vec : vector<f32>) {
+ // expected-error @+1 {{'vector.deinterleave' op operand #0 must be vector of any type values, but got 'vector<f32>}}
+ %0, %1 = vector.deinterleave %vec : vector<f32> -> vector<f32>
+ return
+}
+
+// -----
+
+func.func @deinterleave_one_dim_fail(%vec : vector<1xf32>) {
+ // expected-error @+1 {{'vector.deinterleave' op failed to verify that the trailing dimension of the source vector has an even number of elements}}
+ %0, %1 = vector.deinterleave %vec : vector<1xf32> -> vector<1xf32>
+ return
+}
+
+// -----
+
+func.func @deinterleave_oversized_output_fail(%vec : vector<4xf32>) {
+ // expected-error @+1 {{'vector.deinterleave' op failed to verify that the trailing dimension of the results is half the width of source trailing dimension}}
+ %0, %1 = "vector.deinterleave" (%vec) : (vector<4xf32>) -> (vector<8xf32>, vector<8xf32>)
+ return
+}
+
+// -----
+
+func.func @deinterleave_output_dim_size_mismatch(%vec : vector<4xf32>) {
+ // expected-error @+1 {{'vector.deinterleave' op failed to verify that the trailing dimension of the results is half the width of source trailing dimension}}
+ %0, %1 = "vector.deinterleave" (%vec) : (vector<4xf32>) -> (vector<4xf32>, vector<2xf32>)
+ return
+}
+
+// -----
+
+func.func @deinterleave_n_dim_rank_fail(%vec : vector<2x3x4xf32>) {
+ // expected-error @+1 {{'vector.deinterleave' op failed to verify that the trailing dimension of the results is half the width of source trailing dimension}}
+ %0, %1 = "vector.deinterleave" (%vec) : (vector<2x3x4xf32>) -> (vector<2x3x4xf32>, vector<2x3x2xf32>)
+ return
+}
+
+// -----
+
+func.func @deinterleave_scalable_dim_size_fail(%vec : vector<2x[4]xf32>) {
+ // expected-error @+1 {{'vector.deinterleave' op failed to verify that all of {res1, res2} have same type}}
+ %0, %1 = "vector.deinterleave" (%vec) : (vector<2x[4]xf32>) -> (vector<2x[2]xf32>, vector<2x[1]xf32>)
+ return
+}
+
+// -----
+
+func.func @deinterleave_scalable_rank_fail(%vec : vector<2x[4]xf32>) {
+ // expected-error @+1 {{'vector.deinterleave' op failed to verify that all of {res1, res2} have same type}}
+ %0, %1 = "vector.deinterleave" (%vec) : (vector<2x[4]xf32>) -> (vector<2x[2]xf32>, vector<[2]xf32>)
+ return
+}
diff --git a/mlir/test/Dialect/Vector/ops.mlir b/mlir/test/Dialect/Vector/ops.mlir
index 79a80be..9d8101d 100644
--- a/mlir/test/Dialect/Vector/ops.mlir
+++ b/mlir/test/Dialect/Vector/ops.mlir
@@ -1116,3 +1116,45 @@ func.func @interleave_2d_scalable(%a: vector<2x[2]xf64>, %b: vector<2x[2]xf64>)
%0 = vector.interleave %a, %b : vector<2x[2]xf64>
return %0 : vector<2x[4]xf64>
}
+
+// CHECK-LABEL: @deinterleave_1d
+func.func @deinterleave_1d(%arg: vector<4xf32>) -> (vector<2xf32>, vector<2xf32>) {
+ // CHECK: vector.deinterleave %{{.*}} : vector<4xf32> -> vector<2xf32>
+ %0, %1 = vector.deinterleave %arg : vector<4xf32> -> vector<2xf32>
+ return %0, %1 : vector<2xf32>, vector<2xf32>
+}
+
+// CHECK-LABEL: @deinterleave_1d_scalable
+func.func @deinterleave_1d_scalable(%arg: vector<[4]xf32>) -> (vector<[2]xf32>, vector<[2]xf32>) {
+ // CHECK: vector.deinterleave %{{.*}} : vector<[4]xf32> -> vector<[2]xf32>
+ %0, %1 = vector.deinterleave %arg : vector<[4]xf32> -> vector<[2]xf32>
+ return %0, %1 : vector<[2]xf32>, vector<[2]xf32>
+}
+
+// CHECK-LABEL: @deinterleave_2d
+func.func @deinterleave_2d(%arg: vector<3x4xf32>) -> (vector<3x2xf32>, vector<3x2xf32>) {
+ // CHECK: vector.deinterleave %{{.*}} : vector<3x4xf32> -> vector<3x2xf32>
+ %0, %1 = vector.deinterleave %arg : vector<3x4xf32> -> vector<3x2xf32>
+ return %0, %1 : vector<3x2xf32>, vector<3x2xf32>
+}
+
+// CHECK-LABEL: @deinterleave_2d_scalable
+func.func @deinterleave_2d_scalable(%arg: vector<3x[4]xf32>) -> (vector<3x[2]xf32>, vector<3x[2]xf32>) {
+ // CHECK: vector.deinterleave %{{.*}} : vector<3x[4]xf32> -> vector<3x[2]xf32>
+ %0, %1 = vector.deinterleave %arg : vector<3x[4]xf32> -> vector<3x[2]xf32>
+ return %0, %1 : vector<3x[2]xf32>, vector<3x[2]xf32>
+}
+
+// CHECK-LABEL: @deinterleave_nd
+func.func @deinterleave_nd(%arg: vector<2x3x4x6xf32>) -> (vector<2x3x4x3xf32>, vector<2x3x4x3xf32>) {
+ // CHECK: vector.deinterleave %{{.*}} : vector<2x3x4x6xf32> -> vector<2x3x4x3xf32>
+ %0, %1 = vector.deinterleave %arg : vector<2x3x4x6xf32> -> vector<2x3x4x3xf32>
+ return %0, %1 : vector<2x3x4x3xf32>, vector<2x3x4x3xf32>
+}
+
+// CHECK-LABEL: @deinterleave_nd_scalable
+func.func @deinterleave_nd_scalable(%arg:vector<2x3x4x[6]xf32>) -> (vector<2x3x4x[3]xf32>, vector<2x3x4x[3]xf32>) {
+ // CHECK: vector.deinterleave %{{.*}} : vector<2x3x4x[6]xf32> -> vector<2x3x4x[3]xf32>
+ %0, %1 = vector.deinterleave %arg : vector<2x3x4x[6]xf32> -> vector<2x3x4x[3]xf32>
+ return %0, %1 : vector<2x3x4x[3]xf32>, vector<2x3x4x[3]xf32>
+}
diff --git a/mlir/test/IR/parser.mlir b/mlir/test/IR/parser.mlir
index 020942e..bcc146e 100644
--- a/mlir/test/IR/parser.mlir
+++ b/mlir/test/IR/parser.mlir
@@ -597,7 +597,7 @@ func.func @funcattrwithblock() -> ()
return
}
-// CHECK-label func @funcsimplemap
+// CHECK-LABEL: func @funcsimplemap
#map_simple0 = affine_map<()[] -> (10)>
#map_simple1 = affine_map<()[s0] -> (s0)>
#map_non_simple0 = affine_map<(d0)[] -> (d0)>
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
index 2e59b72..391fda8 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
@@ -6,15 +6,6 @@
// RUN: --entry-point-result=void \
// RUN: | FileCheck %s
-// Basic PTX check to make sure we are generating the right instructions.
-
-// CHECK-PTX: mbarrier.init.shared.b64
-// CHECK-PTX: mbarrier.arrive.expect_tx.shared.b64
-// CHECK-PTX: cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes
-// CHECK-PTX: cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes
-// CHECK-PTX: mbarrier.arrive.expect_tx.shared.b64
-// CHECK-PTX: mbarrier.try_wait.parity.shared.b64
-
// RUN: mlir-opt %s --convert-nvgpu-to-nvvm \
// RUN: -gpu-kernel-outlining \
// RUN: -convert-nvvm-to-llvm \
diff --git a/mlir/test/Pass/ir-printing-file-tree.mlir b/mlir/test/Pass/ir-printing-file-tree.mlir
new file mode 100644
index 0000000..b00d77d
--- /dev/null
+++ b/mlir/test/Pass/ir-printing-file-tree.mlir
@@ -0,0 +1,41 @@
+// Test filtering by "before"
+// RUN: rm -rf %t || true
+// RUN: mlir-opt %s -mlir-print-ir-tree-dir=%t \
+// RUN: -pass-pipeline='builtin.module(builtin.module(func.func(cse,canonicalize)))' \
+// RUN: -mlir-print-ir-before=cse
+// RUN: test -f %t/builtin_module_outer/builtin_module_inner/func_func_symB/0_0_0_cse.mlir
+// RUN: test ! -f %t/builtin_module_outer/builtin_module_inner/func_func_symB/0_0_1_canonicalize.mlir
+// RUN: test -f %t/builtin_module_outer/builtin_module_inner/func_func_symC/0_0_0_cse.mlir
+// RUN: test ! -f %t/builtin_module_outer/builtin_module_inner/func_func_symC/0_0_1_canonicalize.mlir
+
+// Test printing after all and the counter mechanism.
+// RUN: rm -rf %t || true
+// RUN: mlir-opt %s -mlir-print-ir-tree-dir=%t \
+// RUN: -pass-pipeline='builtin.module(canonicalize,canonicalize,func.func(cse),builtin.module(canonicalize,func.func(cse,canonicalize),cse),cse)' \
+// RUN: -mlir-print-ir-after-all
+// RUN: test -f %t/builtin_module_outer/0_canonicalize.mlir
+// RUN: test -f %t/builtin_module_outer/1_canonicalize.mlir
+// RUN: test -f %t/builtin_module_outer/func_func_symA/1_0_cse.mlir
+// RUN: test -f %t/builtin_module_outer/builtin_module_inner/1_0_canonicalize.mlir
+// RUN: test -f %t/builtin_module_outer/builtin_module_inner/func_func_symB/1_0_0_cse.mlir
+// RUN: test -f %t/builtin_module_outer/builtin_module_inner/func_func_symB/1_0_1_canonicalize.mlir
+// RUN: test -f %t/builtin_module_outer/builtin_module_inner/func_func_symC/1_0_0_cse.mlir
+// RUN: test -f %t/builtin_module_outer/builtin_module_inner/func_func_symC/1_0_1_canonicalize.mlir
+// RUN: test -f %t/builtin_module_outer/builtin_module_inner/1_1_cse.mlir
+// RUN: test -f %t/builtin_module_outer/2_cse.mlir
+
+builtin.module @outer {
+
+ func.func @symA() {
+ return
+ }
+
+ builtin.module @inner {
+ func.func @symB() {
+ return
+ }
+ func.func @symC() {
+ return
+ }
+ }
+}
diff --git a/mlir/test/Target/LLVMIR/Import/global-variables.ll b/mlir/test/Target/LLVMIR/Import/global-variables.ll
index 9d97340..902f77b 100644
--- a/mlir/test/Target/LLVMIR/Import/global-variables.ll
+++ b/mlir/test/Target/LLVMIR/Import/global-variables.ll
@@ -36,7 +36,7 @@
; CHECK-DAG: %[[ADDR:[0-9]+]] = llvm.mlir.addressof @global_int : !llvm.ptr
; CHECK-DAG: %[[IDX:[0-9]+]] = llvm.mlir.constant(2 : i32) : i32
; CHECK-DAG: %[[GEP:[0-9]+]] = llvm.getelementptr %[[ADDR]][%[[IDX]]] : (!llvm.ptr, i32) -> !llvm.ptr
-; CHECK-DAG llvm.return %[[GEP]] : !llvm.ptr
+; CHECK-DAG: llvm.return %[[GEP]] : !llvm.ptr
@global_gep_const_expr = internal constant ptr getelementptr (i32, ptr @global_int, i32 2)
; // -----
diff --git a/mlir/test/Target/LLVMIR/Import/metadata-loop.ll b/mlir/test/Target/LLVMIR/Import/metadata-loop.ll
index 3516101..20431a7 100644
--- a/mlir/test/Target/LLVMIR/Import/metadata-loop.ll
+++ b/mlir/test/Target/LLVMIR/Import/metadata-loop.ll
@@ -324,7 +324,7 @@ end:
; // -----
; Verify the unused access group is not imported.
-; CHECK-COUNT1: #llvm.access_group
+; CHECK-COUNT-1: #llvm.access_group
; CHECK-LABEL: @unused_parallel_access
define void @unused_parallel_access(ptr %arg) {
diff --git a/mlir/test/Target/LLVMIR/llvmir-debug.mlir b/mlir/test/Target/LLVMIR/llvmir-debug.mlir
index 1cb94bc..2792f13 100644
--- a/mlir/test/Target/LLVMIR/llvmir-debug.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir-debug.mlir
@@ -234,7 +234,7 @@ llvm.func @func_with_inlined_dbg_value(%arg0: i32) -> (i32) {
// CHECK-DAG: ![[LEXICAL_BLOCK_FILE:.*]] = distinct !DILexicalBlockFile(scope: ![[INNER_FUNC]], file: ![[FILE]], discriminator: 0)
// CHECK-DAG: ![[VAR_LOC0]] = !DILocalVariable(name: "a", scope: ![[OUTER_FUNC]], file: ![[FILE]]
// CHECK-DAG: ![[VAR_LOC1]] = !DILocalVariable(name: "b", scope: ![[LEXICAL_BLOCK_FILE]], file: ![[FILE]]
-// CHECK-DAG ![[LABEL]] = !DILabel(scope: ![[LEXICAL_BLOCK_FILE]], name: "label", file: ![[FILE]], line: 42)
+// CHECK-DAG: ![[LABEL]] = !DILabel(scope: ![[LEXICAL_BLOCK_FILE]], name: "label", file: ![[FILE]], line: 42)
// -----
diff --git a/mlir/test/Target/LLVMIR/omptarget-array-sectioning-host.mlir b/mlir/test/Target/LLVMIR/omptarget-array-sectioning-host.mlir
index 08ccbf0..0016a1f 100644
--- a/mlir/test/Target/LLVMIR/omptarget-array-sectioning-host.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-array-sectioning-host.mlir
@@ -42,7 +42,7 @@ module attributes {omp.is_target_device = false} {
// CHECK: @.offload_sizes = private unnamed_addr constant [2 x i64] [i64 36, i64 108]
// CHECK: @.offload_maptypes = private unnamed_addr constant [2 x i64] [i64 35, i64 35]
-// CHECKL: @.offload_mapnames = private constant [2 x ptr] [ptr @0, ptr @1]
+// CHECK: @.offload_mapnames = private constant [2 x ptr] [ptr @0, ptr @1]
// CHECK: define void @_3d_target_array_section()
diff --git a/mlir/test/Transforms/test-convert-func-op.mlir b/mlir/test/Transforms/test-convert-func-op.mlir
new file mode 100644
index 0000000..6e96703
--- /dev/null
+++ b/mlir/test/Transforms/test-convert-func-op.mlir
@@ -0,0 +1,12 @@
+// RUN: mlir-opt %s -test-convert-func-op | FileCheck %s
+
+// CHECK-LABEL: llvm.func @add
+func.func @add(%arg0: i32, %arg1: i32) -> i32 attributes { llvm.emit_c_interface } {
+ %res = arith.addi %arg0, %arg1 : i32
+ return %res : i32
+}
+// CHECK-LABEL: llvm.func @_mlir_ciface_add
+// CHECK-SAME: [[ARG0:%[a-zA-Z0-9_]+]]: i32
+// CHECK-SAME: [[ARG1:%[a-zA-Z0-9_]+]]: i32
+// CHECK-NEXT: [[RES:%.*]] = llvm.call @add([[ARG0]], [[ARG1]])
+// CHECK-NEXT: llvm.return [[RES]]
diff --git a/mlir/test/lib/Analysis/CMakeLists.txt b/mlir/test/lib/Analysis/CMakeLists.txt
index d168888..7c6b31a 100644
--- a/mlir/test/lib/Analysis/CMakeLists.txt
+++ b/mlir/test/lib/Analysis/CMakeLists.txt
@@ -10,6 +10,7 @@ add_mlir_library(MLIRTestAnalysis
TestMemRefDependenceCheck.cpp
TestMemRefStrideCalculation.cpp
TestSlice.cpp
+ TestTopologicalSort.cpp
DataFlow/TestDeadCodeAnalysis.cpp
DataFlow/TestDenseBackwardDataFlowAnalysis.cpp
diff --git a/mlir/test/lib/Analysis/TestSlice.cpp b/mlir/test/lib/Analysis/TestSlice.cpp
index b445feb..7e8320d 100644
--- a/mlir/test/lib/Analysis/TestSlice.cpp
+++ b/mlir/test/lib/Analysis/TestSlice.cpp
@@ -1,4 +1,4 @@
-//===------------- TestSlice.cpp - Test slice related analisis ------------===//
+//===- TestSlice.cpp - Test slice related analysis ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,13 +6,15 @@
//
//===----------------------------------------------------------------------===//
-#include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/Analysis/TopologicalSortUtils.h"
+#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/SymbolTable.h"
#include "mlir/Pass/Pass.h"
using namespace mlir;
-static const StringLiteral kOrderMarker = "__test_sort_original_idx__";
+static const StringLiteral kToSortMark = "test_to_sort";
+static const StringLiteral kOrderIndex = "test_sort_index";
namespace {
@@ -23,23 +25,20 @@ struct TestTopologicalSortPass
StringRef getArgument() const final { return "test-print-topological-sort"; }
StringRef getDescription() const final {
- return "Print operations in topological order";
+ return "Sorts operations topologically and attaches attributes with their "
+ "corresponding index in the ordering to them";
}
void runOnOperation() override {
- std::map<int, Operation *> ops;
- getOperation().walk([&ops](Operation *op) {
- if (auto originalOrderAttr = op->getAttrOfType<IntegerAttr>(kOrderMarker))
- ops[originalOrderAttr.getInt()] = op;
+ SetVector<Operation *> toSort;
+ getOperation().walk([&](Operation *op) {
+ if (op->hasAttrOfType<UnitAttr>(kToSortMark))
+ toSort.insert(op);
});
- SetVector<Operation *> sortedOp;
- for (auto op : ops)
- sortedOp.insert(op.second);
- sortedOp = topologicalSort(sortedOp);
- llvm::errs() << "Testing : " << getOperation().getName() << "\n";
- for (Operation *op : sortedOp) {
- op->print(llvm::errs());
- llvm::errs() << "\n";
- }
+
+ auto i32Type = IntegerType::get(&getContext(), 32);
+ SetVector<Operation *> sortedOps = topologicalSort(toSort);
+ for (auto [index, op] : llvm::enumerate(sortedOps))
+ op->setAttr(kOrderIndex, IntegerAttr::get(i32Type, index));
}
};
diff --git a/mlir/test/lib/Transforms/TestTopologicalSort.cpp b/mlir/test/lib/Analysis/TestTopologicalSort.cpp
index 3b110c7..c7e0206 100644
--- a/mlir/test/lib/Transforms/TestTopologicalSort.cpp
+++ b/mlir/test/lib/Analysis/TestTopologicalSort.cpp
@@ -6,10 +6,10 @@
//
//===----------------------------------------------------------------------===//
+#include "mlir/Analysis/TopologicalSortUtils.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/TopologicalSortUtils.h"
using namespace mlir;
diff --git a/mlir/test/lib/Conversion/FuncToLLVM/CMakeLists.txt b/mlir/test/lib/Conversion/FuncToLLVM/CMakeLists.txt
index 45ba62d..d3dbc94 100644
--- a/mlir/test/lib/Conversion/FuncToLLVM/CMakeLists.txt
+++ b/mlir/test/lib/Conversion/FuncToLLVM/CMakeLists.txt
@@ -1,6 +1,7 @@
# Exclude tests from libMLIR.so
add_mlir_library(MLIRTestFuncToLLVM
TestConvertCallOp.cpp
+ TestConvertFuncOp.cpp
EXCLUDE_FROM_LIBMLIR
diff --git a/mlir/test/lib/Conversion/FuncToLLVM/TestConvertFuncOp.cpp b/mlir/test/lib/Conversion/FuncToLLVM/TestConvertFuncOp.cpp
new file mode 100644
index 0000000..e25e890
--- /dev/null
+++ b/mlir/test/lib/Conversion/FuncToLLVM/TestConvertFuncOp.cpp
@@ -0,0 +1,93 @@
+//===- TestConvertFuncOp.cpp - Test LLVM Conversion of Func FuncOp --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "TestDialect.h"
+
+#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h"
+#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
+#include "mlir/Conversion/LLVMCommon/Pattern.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Pass/Pass.h"
+
+using namespace mlir;
+
+namespace {
+
+/// Test helper conversion pattern that calls `convertFuncOpToLLVMFuncOp`
+/// directly to verify that this utility covers the full conversion.
+struct FuncOpConversion : public ConvertOpToLLVMPattern<func::FuncOp> {
+ FuncOpConversion(const LLVMTypeConverter &converter)
+ : ConvertOpToLLVMPattern(converter) {}
+
+ LogicalResult
+ matchAndRewrite(func::FuncOp funcOp, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ FailureOr<LLVM::LLVMFuncOp> newFuncOp = mlir::convertFuncOpToLLVMFuncOp(
+ cast<FunctionOpInterface>(funcOp.getOperation()), rewriter,
+ *getTypeConverter());
+ if (failed(newFuncOp))
+ return rewriter.notifyMatchFailure(funcOp, "Could not convert funcop");
+
+ rewriter.eraseOp(funcOp);
+ return success();
+ }
+};
+
+struct ReturnOpConversion : public ConvertOpToLLVMPattern<func::ReturnOp> {
+ ReturnOpConversion(const LLVMTypeConverter &converter)
+ : ConvertOpToLLVMPattern(converter) {}
+
+ LogicalResult
+ matchAndRewrite(func::ReturnOp returnOp, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ rewriter.replaceOpWithNewOp<LLVM::ReturnOp>(returnOp,
+ returnOp->getOperands());
+ return success();
+ }
+};
+
+struct TestConvertFuncOp
+ : public PassWrapper<TestConvertFuncOp, OperationPass<ModuleOp>> {
+ MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestConvertFuncOp)
+
+ void getDependentDialects(DialectRegistry &registry) const final {
+ registry.insert<LLVM::LLVMDialect>();
+ }
+
+ StringRef getArgument() const final { return "test-convert-func-op"; }
+
+ StringRef getDescription() const final {
+ return "Tests conversion of `func.func` to `llvm.func` for different "
+ "attributes";
+ }
+
+ void runOnOperation() override {
+ MLIRContext *ctx = &getContext();
+
+ LowerToLLVMOptions options(ctx);
+ // Populate type conversions.
+ LLVMTypeConverter typeConverter(ctx, options);
+
+ RewritePatternSet patterns(ctx);
+ patterns.add<FuncOpConversion>(typeConverter);
+ patterns.add<ReturnOpConversion>(typeConverter);
+
+ LLVMConversionTarget target(getContext());
+ if (failed(applyPartialConversion(getOperation(), target,
+ std::move(patterns))))
+ signalPassFailure();
+ }
+};
+
+} // namespace
+
+namespace mlir::test {
+void registerConvertFuncOpPass() { PassRegistration<TestConvertFuncOp>(); }
+} // namespace mlir::test
diff --git a/mlir/test/lib/Dialect/Test/TestOpDefs.cpp b/mlir/test/lib/Dialect/Test/TestOpDefs.cpp
index bfee039..b058a8e 100644
--- a/mlir/test/lib/Dialect/Test/TestOpDefs.cpp
+++ b/mlir/test/lib/Dialect/Test/TestOpDefs.cpp
@@ -706,11 +706,20 @@ void TestReflectBoundsOp::inferResultRanges(
const ConstantIntRanges &range = argRanges[0];
MLIRContext *ctx = getContext();
Builder b(ctx);
- auto intTy = getType();
- setUminAttr(b.getIntegerAttr(intTy, range.umin()));
- setUmaxAttr(b.getIntegerAttr(intTy, range.umax()));
- setSminAttr(b.getIntegerAttr(intTy, range.smin()));
- setSmaxAttr(b.getIntegerAttr(intTy, range.smax()));
+ Type sIntTy, uIntTy;
+ // For plain `IntegerType`s, we can derive the appropriate signed and unsigned
+  // types for the attributes.
+ if (auto intTy = llvm::dyn_cast<IntegerType>(getType())) {
+ unsigned bitwidth = intTy.getWidth();
+ sIntTy = b.getIntegerType(bitwidth, /*isSigned=*/true);
+ uIntTy = b.getIntegerType(bitwidth, /*isSigned=*/false);
+ } else
+ sIntTy = uIntTy = getType();
+
+ setUminAttr(b.getIntegerAttr(uIntTy, range.umin()));
+ setUmaxAttr(b.getIntegerAttr(uIntTy, range.umax()));
+ setSminAttr(b.getIntegerAttr(sIntTy, range.smin()));
+ setSmaxAttr(b.getIntegerAttr(sIntTy, range.smax()));
setResultRanges(getResult(), range);
}
diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td
index c5d0341..faf70ad 100644
--- a/mlir/test/lib/Dialect/Test/TestOps.td
+++ b/mlir/test/lib/Dialect/Test/TestOps.td
@@ -1697,6 +1697,12 @@ def : Pat<
ConstantStrAttr<StrAttr, "MatchVariadic">)>;
def : Pat<
+ (MixedVOperandOp5 $input1a, $input1b, $input2, $attr1,
+ ConstantStrAttr<StrAttr, "MatchInverseVariadic">),
+ (MixedVOperandOp3 $input2, (variadic $input1b), (variadic $input1a),
+ ConstantAttr<I32Attr, "1">:$attr1)>;
+
+def : Pat<
(MixedVOperandOp4 (variadic (MixedVOperandInOutI32Op $input1a),
(MixedVOperandInOutI32Op $input1b)),
$input2, ConstantAttr<I32Attr, "1">:$attr1),
diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt
index a849b7e..975a41a 100644
--- a/mlir/test/lib/Transforms/CMakeLists.txt
+++ b/mlir/test/lib/Transforms/CMakeLists.txt
@@ -26,7 +26,6 @@ add_mlir_library(MLIRTestTransforms
TestInlining.cpp
TestIntRangeInference.cpp
TestMakeIsolatedFromAbove.cpp
- TestTopologicalSort.cpp
${MLIRTestTransformsPDLSrc}
EXCLUDE_FROM_LIBMLIR
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
index ea6d9ae..9ed3a2e 100644
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -245,7 +245,7 @@ def have_host_jit_feature_support(feature_name):
if have_host_jit_feature_support("jit"):
config.available_features.add("host-supports-jit")
-if config.run_cuda_tests:
+if config.run_nvptx_tests:
config.available_features.add("host-supports-nvptx")
if config.run_rocm_tests:
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
index c0fa1b8..4f5186d 100644
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -25,7 +25,7 @@ config.mlir_cmake_dir = "@MLIR_CMAKE_DIR@"
config.mlir_lib_dir = "@MLIR_LIB_DIR@"
config.build_examples = @LLVM_BUILD_EXAMPLES@
-config.run_cuda_tests = @MLIR_ENABLE_CUDA_CONVERSIONS@
+config.run_nvptx_tests = @LLVM_HAS_NVPTX_TARGET@
config.enable_cuda_runner = @MLIR_ENABLE_CUDA_RUNNER@
config.run_rocm_tests = @MLIR_ENABLE_ROCM_CONVERSIONS@
config.enable_rocm_runner = @MLIR_ENABLE_ROCM_RUNNER@
diff --git a/mlir/test/mlir-tblgen/op-decl-and-defs.td b/mlir/test/mlir-tblgen/op-decl-and-defs.td
index 499e3ce..836ddca 100644
--- a/mlir/test/mlir-tblgen/op-decl-and-defs.td
+++ b/mlir/test/mlir-tblgen/op-decl-and-defs.td
@@ -58,7 +58,8 @@ def NS_AOp : NS_Op<"a_op", [IsolatedFromAbove, IsolatedFromAbove]> {
// CHECK: namespace detail {
// CHECK: class AOpGenericAdaptorBase {
// CHECK: public:
-// CHECK: AOpGenericAdaptorBase(AOp{{[[:space:]]}}
+// CHECK: AOpGenericAdaptorBase(::mlir::DictionaryAttr attrs = {}, const ::mlir::EmptyProperties &properties = {}, ::mlir::RegionRange regions = {}) : odsAttrs(attrs), odsRegions(regions)
+// CHECK: AOpGenericAdaptorBase(::mlir::Operation *op) : odsAttrs(op->getRawDictionaryAttrs()), odsOpName(op->getName()), odsRegions(op->getRegions()) {}
// CHECK: ::mlir::IntegerAttr getAttr1Attr();
// CHECK: uint32_t getAttr1();
// CHECK: ::mlir::FloatAttr getSomeAttr2Attr();
@@ -128,15 +129,8 @@ def NS_AOp : NS_Op<"a_op", [IsolatedFromAbove, IsolatedFromAbove]> {
// DEFS-LABEL: NS::AOp definitions
-// DEFS: AOpGenericAdaptorBase::AOpGenericAdaptorBase(::mlir::DictionaryAttr attrs, const ::mlir::EmptyProperties &properties, ::mlir::RegionRange regions) : odsAttrs(attrs), odsRegions(regions)
-
// Check that `getAttrDictionary()` is used when not using properties.
-// DEFS: AOpGenericAdaptorBase::AOpGenericAdaptorBase(AOp op)
-// DEFS-SAME: op->getAttrDictionary()
-// DEFS-SAME: p.getProperties()
-// DEFS-SAME: op->getRegions()
-
// DECLS: ::mlir::RegionRange AOpGenericAdaptorBase::getSomeRegions()
// DECLS-NEXT: return odsRegions.drop_front(1);
// DECLS: ::mlir::RegionRange AOpGenericAdaptorBase::getRegions()
@@ -344,12 +338,11 @@ def NS_NOp : NS_Op<"op_with_properties", []> {
let arguments = (ins Property<"unsigned">:$value);
}
-// Check that `getDiscardableAttrDictionary()` is used with properties.
-
-// DEFS: NOpGenericAdaptorBase::NOpGenericAdaptorBase(NOp op) : NOpGenericAdaptorBase(
-// DEFS-SAME: op->getDiscardableAttrDictionary()
-// DEFS-SAME: op.getProperties()
-// DEFS-SAME: op->getRegions()
+// DEFS: NOpGenericAdaptorBase::NOpGenericAdaptorBase(NOp op) :
+// DEFS-SAME: odsAttrs(op->getRawDictionaryAttrs())
+// DEFS-SAME: odsOpName(op->getName())
+// DEFS-SAME: properties(op.getProperties())
+// DEFS-SAME: odsRegions(op->getRegions())
// Test that type defs have the proper namespaces when used as a constraint.
// ---
diff --git a/mlir/test/mlir-tblgen/op-operand.td b/mlir/test/mlir-tblgen/op-operand.td
index a749708..a2fa1f7 100644
--- a/mlir/test/mlir-tblgen/op-operand.td
+++ b/mlir/test/mlir-tblgen/op-operand.td
@@ -15,9 +15,6 @@ def OpA : NS_Op<"one_normal_operand_op", []> {
// CHECK-LABEL: OpA definitions
-// CHECK: OpAGenericAdaptorBase::OpAGenericAdaptorBase
-// CHECK-SAME: odsAttrs(attrs)
-
// CHECK: void OpA::build
// CHECK: ::mlir::Value input
// CHECK: odsState.addOperands(input);
diff --git a/mlir/test/mlir-tblgen/pattern.mlir b/mlir/test/mlir-tblgen/pattern.mlir
index 7f9c450..5ff8710 100644
--- a/mlir/test/mlir-tblgen/pattern.mlir
+++ b/mlir/test/mlir-tblgen/pattern.mlir
@@ -527,6 +527,14 @@ func.func @testMatchVariadic(%arg0: i32, %arg1: i32, %arg2: i32, %arg3: i32) ->
return
}
+// CHECK-LABEL: @testReplaceVariadic
+func.func @testReplaceVariadic(%arg0: i32, %arg1: i32, %arg2: i32, %arg3: i32) -> () {
+ // CHECK: "test.mixed_variadic_in3"(%arg2, %arg1, %arg0) <{count = 1 : i32}>
+ "test.mixed_variadic_in5"(%arg0, %arg1, %arg2) <{attr1 = 0 : i32, pattern_name = "MatchInverseVariadic"}> : (i32, i32, i32) -> ()
+
+ return
+}
+
// CHECK-LABEL: @testMatchVariadicSubDag
func.func @testMatchVariadicSubDag(%arg0: i32, %arg1: i32, %arg2: i32) -> () {
// CHECK: %[[IN0:.*]] = "test.mixed_variadic_in_out_i32"(%arg0) : (i32) -> i32
diff --git a/mlir/test/mlir-vulkan-runner/iaddcarry_extended.mlir b/mlir/test/mlir-vulkan-runner/addui_extended.mlir
index 9b1f196..9b1f196 100644
--- a/mlir/test/mlir-vulkan-runner/iaddcarry_extended.mlir
+++ b/mlir/test/mlir-vulkan-runner/addui_extended.mlir
diff --git a/mlir/test/python/dialects/transform_structured_ext.py b/mlir/test/python/dialects/transform_structured_ext.py
index 935534e..f97017b 100644
--- a/mlir/test/python/dialects/transform_structured_ext.py
+++ b/mlir/test/python/dialects/transform_structured_ext.py
@@ -443,7 +443,7 @@ def testTileExplicitLoopTypeAll(target):
structured.TileUsingForOp(types, target, sizes=[2, 3, 4])
# CHECK-LABEL: TEST: testTileExplicitLoopTypeAll
# CHECK: = transform.structured.tile
- # CHECK-SAME : (!transform.any_op) -> (!transform.any_op, !transform.op<"scf.for">,
+ # CHECK-SAME: (!transform.any_op) -> (!transform.any_op, !transform.op<"scf.for">,
# CHECK-SAME: !transform.op<"scf.parallel">, !transform.op<"scf.forall">
diff --git a/mlir/test/python/ir/array_attributes.py b/mlir/test/python/ir/array_attributes.py
index 9251588..2bc403a 100644
--- a/mlir/test/python/ir/array_attributes.py
+++ b/mlir/test/python/ir/array_attributes.py
@@ -51,6 +51,87 @@ def testGetDenseElementsUnSupportedTypeOkIfExplicitTypeProvided():
################################################################################
+# Tests of the list of attributes .get() factory method
+################################################################################
+
+
+# CHECK-LABEL: TEST: testGetDenseElementsFromList
+@run
+def testGetDenseElementsFromList():
+ with Context(), Location.unknown():
+ attrs = [FloatAttr.get(F64Type.get(), 1.0), FloatAttr.get(F64Type.get(), 2.0)]
+ attr = DenseElementsAttr.get(attrs)
+
+ # CHECK: dense<[1.000000e+00, 2.000000e+00]> : tensor<2xf64>
+ print(attr)
+
+
+# CHECK-LABEL: TEST: testGetDenseElementsFromListWithExplicitType
+@run
+def testGetDenseElementsFromListWithExplicitType():
+ with Context(), Location.unknown():
+ attrs = [FloatAttr.get(F64Type.get(), 1.0), FloatAttr.get(F64Type.get(), 2.0)]
+ shaped_type = ShapedType(Type.parse("tensor<2xf64>"))
+ attr = DenseElementsAttr.get(attrs, shaped_type)
+
+ # CHECK: dense<[1.000000e+00, 2.000000e+00]> : tensor<2xf64>
+ print(attr)
+
+
+# CHECK-LABEL: TEST: testGetDenseElementsFromListEmptyList
+@run
+def testGetDenseElementsFromListEmptyList():
+ with Context(), Location.unknown():
+ attrs = []
+
+ try:
+ attr = DenseElementsAttr.get(attrs)
+ except ValueError as e:
+ # CHECK: Attributes list must be non-empty
+ print(e)
+
+
+# CHECK-LABEL: TEST: testGetDenseElementsFromListNonAttributeType
+@run
+def testGetDenseElementsFromListNonAttributeType():
+ with Context(), Location.unknown():
+ attrs = [1.0]
+
+ try:
+ attr = DenseElementsAttr.get(attrs)
+ except RuntimeError as e:
+ # CHECK: Invalid attribute when attempting to create an ArrayAttribute
+ print(e)
+
+
+# CHECK-LABEL: TEST: testGetDenseElementsFromListMismatchedType
+@run
+def testGetDenseElementsFromListMismatchedType():
+ with Context(), Location.unknown():
+ attrs = [FloatAttr.get(F64Type.get(), 1.0), FloatAttr.get(F64Type.get(), 2.0)]
+ shaped_type = ShapedType(Type.parse("tensor<2xf32>"))
+
+ try:
+ attr = DenseElementsAttr.get(attrs, shaped_type)
+ except ValueError as e:
+ # CHECK: All attributes must be of the same type and match the type parameter
+ print(e)
+
+
+# CHECK-LABEL: TEST: testGetDenseElementsFromListMixedTypes
+@run
+def testGetDenseElementsFromListMixedTypes():
+ with Context(), Location.unknown():
+ attrs = [FloatAttr.get(F64Type.get(), 1.0), FloatAttr.get(F32Type.get(), 2.0)]
+
+ try:
+ attr = DenseElementsAttr.get(attrs)
+ except ValueError as e:
+ # CHECK: All attributes must be of the same type and match the type parameter
+ print(e)
+
+
+################################################################################
# Splats.
################################################################################
@@ -205,6 +286,7 @@ def testGetDenseElementsBoolSplat():
### float and double arrays.
+
# CHECK-LABEL: TEST: testGetDenseElementsF16
@run
def testGetDenseElementsF16():
diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
index 1dfc5d1..0e8b161 100644
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -71,6 +71,7 @@ namespace test {
void registerTestCompositePass();
void registerCommutativityUtils();
void registerConvertCallOpPass();
+void registerConvertFuncOpPass();
void registerInliner();
void registerMemRefBoundCheck();
void registerPatternsTestPass();
@@ -199,6 +200,7 @@ void registerTestPasses() {
mlir::test::registerTestCompositePass();
mlir::test::registerCommutativityUtils();
mlir::test::registerConvertCallOpPass();
+ mlir::test::registerConvertFuncOpPass();
mlir::test::registerInliner();
mlir::test::registerMemRefBoundCheck();
mlir::test::registerPatternsTestPass();
diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
index e013cca..adda7ce 100644
--- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
@@ -4101,7 +4101,8 @@ OpOperandAdaptorEmitter::OpOperandAdaptorEmitter(
"{}");
}
paramList.emplace_back("::mlir::RegionRange", "regions", "{}");
- auto *baseConstructor = genericAdaptorBase.addConstructor(paramList);
+ auto *baseConstructor =
+ genericAdaptorBase.addConstructor<Method::Inline>(paramList);
baseConstructor->addMemberInitializer("odsAttrs", "attrs");
if (useProperties)
baseConstructor->addMemberInitializer("properties", "properties");
@@ -4163,14 +4164,24 @@ OpOperandAdaptorEmitter::OpOperandAdaptorEmitter(
// and the value range from the parameter.
{
// Base class is in the cpp file and can simply access the members of the op
- // class to initialize the template independent fields.
- auto *constructor = genericAdaptorBase.addConstructor(
- MethodParameter(op.getCppClassName(), "op"));
- constructor->addMemberInitializer(
- genericAdaptorBase.getClassName(),
- llvm::Twine(!useProperties ? "op->getAttrDictionary()"
- : "op->getDiscardableAttrDictionary()") +
- ", op.getProperties(), op->getRegions()");
+ // class to initialize the template independent fields. If the op doesn't
+ // have properties, we can emit a generic constructor inline. Otherwise,
+ // emit it out-of-line because we need the op to be defined.
+ Constructor *constructor;
+ if (useProperties) {
+ constructor = genericAdaptorBase.addConstructor(
+ MethodParameter(op.getCppClassName(), "op"));
+ } else {
+ constructor = genericAdaptorBase.addConstructor<Method::Inline>(
+ MethodParameter("::mlir::Operation *", "op"));
+ }
+ constructor->addMemberInitializer("odsAttrs",
+ "op->getRawDictionaryAttrs()");
+ // Retrieve the operation name from the op directly.
+ constructor->addMemberInitializer("odsOpName", "op->getName()");
+ if (useProperties)
+ constructor->addMemberInitializer("properties", "op.getProperties()");
+ constructor->addMemberInitializer("odsRegions", "op->getRegions()");
// Generic adaptor is templated and therefore defined inline in the header.
// We cannot use the Op class here as it is an incomplete type (we have a
diff --git a/mlir/tools/mlir-tblgen/RewriterGen.cpp b/mlir/tools/mlir-tblgen/RewriterGen.cpp
index e63a065..d8e16d9 100644
--- a/mlir/tools/mlir-tblgen/RewriterGen.cpp
+++ b/mlir/tools/mlir-tblgen/RewriterGen.cpp
@@ -159,6 +159,10 @@ private:
// Returns the symbol of the old value serving as the replacement.
StringRef handleReplaceWithValue(DagNode tree);
+ // Emits the C++ statement to replace the matched DAG with an array of
+ // matched values.
+ std::string handleVariadic(DagNode tree, int depth);
+
// Trailing directives are used at the end of DAG node argument lists to
// specify additional behaviour for op matchers and creators, etc.
struct TrailingDirectives {
@@ -1241,6 +1245,9 @@ std::string PatternEmitter::handleResultPattern(DagNode resultTree,
if (resultTree.isReplaceWithValue())
return handleReplaceWithValue(resultTree).str();
+ if (resultTree.isVariadic())
+ return handleVariadic(resultTree, depth);
+
// Normal op creation.
auto symbol = handleOpCreation(resultTree, resultIndex, depth);
if (resultTree.getSymbol().empty()) {
@@ -1251,6 +1258,26 @@ std::string PatternEmitter::handleResultPattern(DagNode resultTree,
return symbol;
}
+std::string PatternEmitter::handleVariadic(DagNode tree, int depth) {
+ assert(tree.isVariadic());
+
+ auto name = std::string(formatv("tblgen_variadic_values_{0}", nextValueId++));
+ symbolInfoMap.bindValue(name);
+ os << "::llvm::SmallVector<::mlir::Value, 4> " << name << ";\n";
+ for (int i = 0, e = tree.getNumArgs(); i != e; ++i) {
+ if (auto child = tree.getArgAsNestedDag(i)) {
+ os << name << ".push_back(" << handleResultPattern(child, i, depth + 1)
+ << ");\n";
+ } else {
+ os << name << ".push_back("
+ << handleOpArgument(tree.getArgAsLeaf(i), tree.getArgName(i))
+ << ");\n";
+ }
+ }
+
+ return name;
+}
+
StringRef PatternEmitter::handleReplaceWithValue(DagNode tree) {
assert(tree.isReplaceWithValue());
diff --git a/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp b/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp
index cea4935..a8fe20d 100644
--- a/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp
+++ b/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp
@@ -30,7 +30,7 @@
using namespace mlir;
// Skip the test if the NVPTX target was not built.
-#if MLIR_ENABLE_CUDA_CONVERSIONS
+#if LLVM_HAS_NVPTX_TARGET
#define SKIP_WITHOUT_NVPTX(x) x
#else
#define SKIP_WITHOUT_NVPTX(x) DISABLED_##x
diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt
index c3dcebf..ef90dc9 100644
--- a/offload/CMakeLists.txt
+++ b/offload/CMakeLists.txt
@@ -143,9 +143,31 @@ set(LIBOMPTARGET_PLUGINS_TO_BUILD "all" CACHE STRING
if(LIBOMPTARGET_PLUGINS_TO_BUILD STREQUAL "all")
set(LIBOMPTARGET_PLUGINS_TO_BUILD ${LIBOMPTARGET_ALL_PLUGIN_TARGETS})
endif()
+
+if(NOT CMAKE_SYSTEM_NAME MATCHES "Linux" AND
+ "host" IN_LIST LIBOMPTARGET_PLUGINS_TO_BUILD)
+ message(STATUS "Not building host plugin: only Linux systems are supported")
+ list(REMOVE_ITEM LIBOMPTARGET_PLUGINS_TO_BUILD "host")
+endif()
+if(NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$"
+ AND CMAKE_SYSTEM_NAME MATCHES "Linux"))
+ if("amdgpu" IN_LIST LIBOMPTARGET_PLUGINS_TO_BUILD)
+ message(STATUS "Not building AMDGPU plugin: only support AMDGPU in "
+ "Linux x86_64, ppc64le, or aarch64 hosts")
+ list(REMOVE_ITEM LIBOMPTARGET_PLUGINS_TO_BUILD "amdgpu")
+ endif()
+ if("nvptx" IN_LIST LIBOMPTARGET_PLUGINS_TO_BUILD)
+ message(STATUS "Not building CUDA plugin: only support AMDGPU in "
+ "Linux x86_64, ppc64le, or aarch64 hosts")
+ list(REMOVE_ITEM LIBOMPTARGET_PLUGINS_TO_BUILD "cuda")
+ endif()
+endif()
message(STATUS "Building the offload library with support for "
"the \"${LIBOMPTARGET_PLUGINS_TO_BUILD}\" plugins")
+set(LIBOMPTARGET_DLOPEN_PLUGINS "${LIBOMPTARGET_PLUGINS_TO_BUILD}" CACHE STRING
+ "Semicolon-separated list of plugins to use 'dlopen' for runtime linking")
+
set(LIBOMPTARGET_ENUM_PLUGIN_TARGETS "")
foreach(plugin IN LISTS LIBOMPTARGET_PLUGINS_TO_BUILD)
set(LIBOMPTARGET_ENUM_PLUGIN_TARGETS
diff --git a/offload/DeviceRTL/include/Utils.h b/offload/DeviceRTL/include/Utils.h
index 4ab0aea..d43b7f5 100644
--- a/offload/DeviceRTL/include/Utils.h
+++ b/offload/DeviceRTL/include/Utils.h
@@ -25,6 +25,8 @@ int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane);
int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width);
+uint64_t ballotSync(uint64_t Mask, int32_t Pred);
+
/// Return \p LowBits and \p HighBits packed into a single 64 bit value.
uint64_t pack(uint32_t LowBits, uint32_t HighBits);
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
index b2028a8..4f39d2a 100644
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ b/offload/DeviceRTL/src/Mapping.cpp
@@ -364,4 +364,8 @@ _TGT_KERNEL_LANGUAGE(block_id, getBlockIdInKernel)
_TGT_KERNEL_LANGUAGE(block_dim, getNumberOfThreadsInBlock)
_TGT_KERNEL_LANGUAGE(grid_dim, getNumberOfBlocksInKernel)
+extern "C" uint64_t ompx_ballot_sync(uint64_t mask, int pred) {
+ return utils::ballotSync(mask, pred);
+}
+
#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Utils.cpp b/offload/DeviceRTL/src/Utils.cpp
index d07ac0f..606e3be 100644
--- a/offload/DeviceRTL/src/Utils.cpp
+++ b/offload/DeviceRTL/src/Utils.cpp
@@ -37,6 +37,8 @@ int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane);
int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
int32_t Width);
+uint64_t ballotSync(uint64_t Mask, int32_t Pred);
+
/// AMDGCN Implementation
///
///{
@@ -57,6 +59,10 @@ int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
}
+uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
+ return Mask & __builtin_amdgcn_ballot_w64(Pred);
+}
+
bool isSharedMemPtr(const void *Ptr) {
return __builtin_amdgcn_is_shared(
(const __attribute__((address_space(0))) void *)Ptr);
@@ -80,6 +86,10 @@ int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) {
return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
}
+uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
+ return __nvvm_vote_ballot_sync(static_cast<uint32_t>(Mask), Pred);
+}
+
bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); }
#pragma omp end declare variant
@@ -103,6 +113,10 @@ int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta,
return impl::shuffleDown(Mask, Var, Delta, Width);
}
+uint64_t utils::ballotSync(uint64_t Mask, int32_t Pred) {
+ return impl::ballotSync(Mask, Pred);
+}
+
bool utils::isSharedMemPtr(void *Ptr) { return impl::isSharedMemPtr(Ptr); }
extern "C" {
diff --git a/offload/cmake/Modules/LibomptargetGetDependencies.cmake b/offload/cmake/Modules/LibomptargetGetDependencies.cmake
index e37b86b..c296f7e 100644
--- a/offload/cmake/Modules/LibomptargetGetDependencies.cmake
+++ b/offload/cmake/Modules/LibomptargetGetDependencies.cmake
@@ -3,7 +3,6 @@
#
# libffi : required to launch target kernels given function and argument
# pointers.
-# CUDA : required to control offloading to NVIDIA GPUs.
include (FindPackageHandleStandardArgs)
@@ -44,13 +43,6 @@ find_package(FFI QUIET)
set(LIBOMPTARGET_DEP_LIBFFI_FOUND ${FFI_FOUND})
################################################################################
-# Looking for CUDA...
-################################################################################
-
-find_package(CUDAToolkit QUIET)
-set(LIBOMPTARGET_DEP_CUDA_FOUND ${CUDAToolkit_FOUND})
-
-################################################################################
# Looking for NVIDIA GPUs...
################################################################################
set(LIBOMPTARGET_DEP_CUDA_ARCH "sm_35")
diff --git a/offload/plugins-nextgen/amdgpu/CMakeLists.txt b/offload/plugins-nextgen/amdgpu/CMakeLists.txt
index 2f4057c..47cd2fe 100644
--- a/offload/plugins-nextgen/amdgpu/CMakeLists.txt
+++ b/offload/plugins-nextgen/amdgpu/CMakeLists.txt
@@ -1,11 +1,6 @@
# As of rocm-3.7, hsa is installed with cmake packages and kmt is found via hsa
find_package(hsa-runtime64 QUIET 1.2.0 HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm)
-if(NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux"))
- message(STATUS "Not building AMDGPU NextGen plugin: only support AMDGPU in Linux x86_64, ppc64le, or aarch64 hosts")
- return()
-endif()
-
# Create the library and add the default arguments.
add_target_library(omptarget.rtl.amdgpu AMDGPU)
@@ -13,8 +8,7 @@ target_sources(omptarget.rtl.amdgpu PRIVATE src/rtl.cpp)
target_include_directories(omptarget.rtl.amdgpu PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/utils)
-option(LIBOMPTARGET_FORCE_DLOPEN_LIBHSA "Build with dlopened libhsa" OFF)
-if(hsa-runtime64_FOUND AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBHSA)
+if(hsa-runtime64_FOUND AND NOT "amdgpu" IN_LIST LIBOMPTARGET_DLOPEN_PLUGINS)
message(STATUS "Building AMDGPU plugin linked against libhsa")
target_link_libraries(omptarget.rtl.amdgpu PRIVATE hsa-runtime64::hsa-runtime64)
else()
diff --git a/offload/plugins-nextgen/common/include/JIT.h b/offload/plugins-nextgen/common/include/JIT.h
index b22197b..4414926 100644
--- a/offload/plugins-nextgen/common/include/JIT.h
+++ b/offload/plugins-nextgen/common/include/JIT.h
@@ -55,10 +55,6 @@ struct JITEngine {
process(const __tgt_device_image &Image,
target::plugin::GenericDeviceTy &Device);
- /// Return true if \p Image is a bitcode image that can be JITed for the given
- /// architecture.
- Expected<bool> checkBitcodeImage(StringRef Buffer) const;
-
private:
/// Compile the bitcode image \p Image and generate the binary image that can
/// be loaded to the target device of the triple \p Triple architecture \p
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 83f6e8d..eda6a4f 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -1052,6 +1052,10 @@ struct GenericPluginTy {
/// given target. Returns true if the \p Image is compatible with the plugin.
Expected<bool> checkELFImage(StringRef Image) const;
+ /// Return true if the \p Image can be compiled to run on the platform's
+ /// target architecture.
+ Expected<bool> checkBitcodeImage(StringRef Image) const;
+
/// Indicate if an image is compatible with the plugin devices. Notice that
/// this function may be called before actually initializing the devices. So
/// we could not move this function into GenericDeviceTy.
@@ -1066,8 +1070,11 @@ protected:
public:
// TODO: This plugin interface needs to be cleaned up.
+ /// Returns true if the plugin has been initialized.
+ int32_t is_initialized() const;
+
/// Returns non-zero if the provided \p Image can be executed by the runtime.
- int32_t is_valid_binary(__tgt_device_image *Image);
+ int32_t is_valid_binary(__tgt_device_image *Image, bool Initialized = true);
/// Initialize the device inside of the plugin.
int32_t init_device(int32_t DeviceId);
@@ -1187,6 +1194,9 @@ public:
void **KernelPtr);
private:
+ /// Indicates if the platform runtime has been fully initialized.
+ bool Initialized = false;
+
/// Number of devices available for the plugin.
int32_t NumDevices = 0;
diff --git a/offload/plugins-nextgen/common/src/JIT.cpp b/offload/plugins-nextgen/common/src/JIT.cpp
index 9d58e60..9dbba14 100644
--- a/offload/plugins-nextgen/common/src/JIT.cpp
+++ b/offload/plugins-nextgen/common/src/JIT.cpp
@@ -323,19 +323,3 @@ JITEngine::process(const __tgt_device_image &Image,
return &Image;
}
-
-Expected<bool> JITEngine::checkBitcodeImage(StringRef Buffer) const {
- TimeTraceScope TimeScope("Check bitcode image");
-
- assert(identify_magic(Buffer) == file_magic::bitcode &&
- "Input is not bitcode");
-
- LLVMContext Context;
- auto ModuleOrErr = getLazyBitcodeModule(MemoryBufferRef(Buffer, ""), Context,
- /*ShouldLazyLoadMetadata=*/true);
- if (!ModuleOrErr)
- return ModuleOrErr.takeError();
- Module &M = **ModuleOrErr;
-
- return Triple(M.getTargetTriple()).getArch() == TT.getArch();
-}
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 550ebc9..913721a 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -24,6 +24,7 @@
#include "omp-tools.h"
#endif
+#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/JSON.h"
@@ -1495,6 +1496,7 @@ Error GenericPluginTy::init() {
if (!NumDevicesOrErr)
return NumDevicesOrErr.takeError();
+ Initialized = true;
NumDevices = *NumDevicesOrErr;
if (NumDevices == 0)
return Plugin::success();
@@ -1578,14 +1580,27 @@ Expected<bool> GenericPluginTy::checkELFImage(StringRef Image) const {
if (!MachineOrErr)
return MachineOrErr.takeError();
- if (!*MachineOrErr)
+ return MachineOrErr;
+}
+
+Expected<bool> GenericPluginTy::checkBitcodeImage(StringRef Image) const {
+ if (identify_magic(Image) != file_magic::bitcode)
return false;
- // Perform plugin-dependent checks for the specific architecture if needed.
- return isELFCompatible(Image);
+ LLVMContext Context;
+ auto ModuleOrErr = getLazyBitcodeModule(MemoryBufferRef(Image, ""), Context,
+ /*ShouldLazyLoadMetadata=*/true);
+ if (!ModuleOrErr)
+ return ModuleOrErr.takeError();
+ Module &M = **ModuleOrErr;
+
+ return Triple(M.getTargetTriple()).getArch() == getTripleArch();
}
-int32_t GenericPluginTy::is_valid_binary(__tgt_device_image *Image) {
+int32_t GenericPluginTy::is_initialized() const { return Initialized; }
+
+int32_t GenericPluginTy::is_valid_binary(__tgt_device_image *Image,
+ bool Initialized) {
StringRef Buffer(reinterpret_cast<const char *>(Image->ImageStart),
target::getPtrDiff(Image->ImageEnd, Image->ImageStart));
@@ -1603,10 +1618,17 @@ int32_t GenericPluginTy::is_valid_binary(__tgt_device_image *Image) {
auto MatchOrErr = checkELFImage(Buffer);
if (Error Err = MatchOrErr.takeError())
return HandleError(std::move(Err));
- return *MatchOrErr;
+ if (!Initialized || !*MatchOrErr)
+ return *MatchOrErr;
+
+ // Perform plugin-dependent checks for the specific architecture if needed.
+ auto CompatibleOrErr = isELFCompatible(Buffer);
+ if (Error Err = CompatibleOrErr.takeError())
+ return HandleError(std::move(Err));
+ return *CompatibleOrErr;
}
case file_magic::bitcode: {
- auto MatchOrErr = getJIT().checkBitcodeImage(Buffer);
+ auto MatchOrErr = checkBitcodeImage(Buffer);
if (Error Err = MatchOrErr.takeError())
return HandleError(std::move(Err));
return *MatchOrErr;
diff --git a/offload/plugins-nextgen/cuda/CMakeLists.txt b/offload/plugins-nextgen/cuda/CMakeLists.txt
index 10ff61284..5fdfb8f 100644
--- a/offload/plugins-nextgen/cuda/CMakeLists.txt
+++ b/offload/plugins-nextgen/cuda/CMakeLists.txt
@@ -1,17 +1,10 @@
-if (NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux"))
- message(STATUS "Not building CUDA NextGen offloading plugin: only support CUDA in Linux x86_64, ppc64le, or aarch64 hosts.")
- return()
-endif()
-
-message(STATUS "Building CUDA NextGen offloading plugin.")
-
# Create the library and add the default arguments.
add_target_library(omptarget.rtl.cuda CUDA)
target_sources(omptarget.rtl.cuda PRIVATE src/rtl.cpp)
-option(LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA "Build with dlopened libcuda" OFF)
-if(LIBOMPTARGET_DEP_CUDA_FOUND AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA)
+find_package(CUDAToolkit QUIET)
+if(CUDAToolkit_FOUND AND NOT "cuda" IN_LIST LIBOMPTARGET_DLOPEN_PLUGINS)
message(STATUS "Building CUDA plugin linked against libcuda")
target_link_libraries(omptarget.rtl.cuda PRIVATE CUDA::cuda_driver)
else()
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index 32031c2..d65e5cf 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -16,6 +16,15 @@
#include <cstddef>
#include <cstdint>
+#define cuDeviceTotalMem cuDeviceTotalMem_v2
+#define cuModuleGetGlobal cuModuleGetGlobal_v2
+#define cuMemGetInfo cuMemGetInfo_v2
+#define cuMemAlloc cuMemAlloc_v2
+#define cuMemFree cuMemFree_v2
+#define cuMemAllocHost cuMemAllocHost_v2
+#define cuDevicePrimaryCtxRelease cuDevicePrimaryCtxRelease_v2
+#define cuDevicePrimaryCtxSetFlags cuDevicePrimaryCtxSetFlags_v2
+
typedef int CUdevice;
typedef uintptr_t CUdeviceptr;
typedef struct CUmod_st *CUmodule;
diff --git a/offload/plugins-nextgen/host/CMakeLists.txt b/offload/plugins-nextgen/host/CMakeLists.txt
index 9c6aa27..817d128 100644
--- a/offload/plugins-nextgen/host/CMakeLists.txt
+++ b/offload/plugins-nextgen/host/CMakeLists.txt
@@ -1,7 +1,3 @@
-if(NOT CMAKE_SYSTEM_NAME MATCHES "Linux")
- return()
-endif()
-
set(supported_targets x86_64 aarch64 ppc64 ppc64le s390x)
if(NOT ${CMAKE_SYSTEM_PROCESSOR} IN_LIST supported_targets)
message(STATUS "Not building ${machine} NextGen offloading plugin")
diff --git a/offload/src/PluginManager.cpp b/offload/src/PluginManager.cpp
index 191afa3..f720078 100644
--- a/offload/src/PluginManager.cpp
+++ b/offload/src/PluginManager.cpp
@@ -34,15 +34,8 @@ void PluginManager::init() {
// Attempt to create an instance of each supported plugin.
#define PLUGIN_TARGET(Name) \
do { \
- auto Plugin = std::unique_ptr<GenericPluginTy>(createPlugin_##Name()); \
- if (auto Err = Plugin->init()) { \
- [[maybe_unused]] std::string InfoMsg = toString(std::move(Err)); \
- DP("Failed to init plugin: %s\n", InfoMsg.c_str()); \
- } else { \
- DP("Registered plugin %s with %d visible device(s)\n", \
- Plugin->getName(), Plugin->number_of_devices()); \
- Plugins.emplace_back(std::move(Plugin)); \
- } \
+ Plugins.emplace_back( \
+ std::unique_ptr<GenericPluginTy>(createPlugin_##Name())); \
} while (false);
#include "Shared/Targets.def"
@@ -160,6 +153,27 @@ void PluginManager::registerLib(__tgt_bin_desc *Desc) {
if (Entry.flags == OMP_REGISTER_REQUIRES)
PM->addRequirements(Entry.data);
+ // Initialize all the plugins that have associated images.
+ for (auto &Plugin : Plugins) {
+ if (Plugin->is_initialized())
+ continue;
+
+    // Initialize the plugin if it can handle any of the executable images.
+ for (int32_t i = 0; i < Desc->NumDeviceImages; ++i) {
+ if (!Plugin->is_valid_binary(&Desc->DeviceImages[i],
+ /*Initialized=*/false))
+ continue;
+
+ if (auto Err = Plugin->init()) {
+ [[maybe_unused]] std::string InfoMsg = toString(std::move(Err));
+ DP("Failed to init plugin: %s\n", InfoMsg.c_str());
+ } else {
+ DP("Registered plugin %s with %d visible device(s)\n",
+ Plugin->getName(), Plugin->number_of_devices());
+ }
+ }
+ }
+
// Extract the exectuable image and extra information if availible.
for (int32_t i = 0; i < Desc->NumDeviceImages; ++i)
PM->addDeviceImage(*Desc, Desc->DeviceImages[i]);
@@ -177,7 +191,7 @@ void PluginManager::registerLib(__tgt_bin_desc *Desc) {
if (!R.number_of_devices())
continue;
- if (!R.is_valid_binary(Img)) {
+ if (!R.is_valid_binary(Img, /*Initialized=*/true)) {
DP("Image " DPxMOD " is NOT compatible with RTL %s!\n",
DPxPTR(Img->ImageStart), R.getName());
continue;
diff --git a/offload/test/offloading/dynamic_module.c b/offload/test/offloading/dynamic_module.c
index f1e9862..9dcf3a1 100644
--- a/offload/test/offloading/dynamic_module.c
+++ b/offload/test/offloading/dynamic_module.c
@@ -2,6 +2,8 @@
// RUN: %libomptarget-compile-generic %t.so && %libomptarget-run-generic 2>&1 | %fcheck-generic
// RUN: %libomptarget-compileopt-generic -DSHARED -fPIC -shared -o %t.so && \
// RUN: %libomptarget-compileopt-generic %t.so && %libomptarget-run-generic 2>&1 | %fcheck-generic
+//
+// UNSUPPORTED: x86_64-pc-linux-gnu
#ifdef SHARED
void foo() {}
diff --git a/offload/test/offloading/fortran/dump_map_tables.f90 b/offload/test/offloading/fortran/dump_map_tables.f90
new file mode 100644
index 0000000..cb66ef3
--- /dev/null
+++ b/offload/test/offloading/fortran/dump_map_tables.f90
@@ -0,0 +1,38 @@
+! Offloading test with a runtime call to ompx_dump_mapping_tables.
+! A Fortran array writes some values, and the variable mapped to the device
+! correctly receives the updates made on the device.
+! REQUIRES: flang
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+
+program map_dump_example
+ INTERFACE
+ SUBROUTINE ompx_dump_mapping_tables() BIND(C)
+ END SUBROUTINE ompx_dump_mapping_tables
+ END INTERFACE
+
+ integer i,j,k,N
+ integer async_q(4)
+ real :: A(5000000)
+ N=5000000
+ do i=1, N
+ A(i)=0
+ enddo
+! clang-format off
+! CHECK: omptarget device 0 info: OpenMP Host-Device pointer mappings after block
+! CHECK-NEXT: omptarget device 0 info: Host Ptr Target Ptr Size (B) DynRefCount HoldRefCount Declaration
+! CHECK-NEXT: omptarget device 0 info: {{(0x[0-9a-f]{16})}} {{(0x[0-9a-f]{16})}} 20000000 1 0 {{.*}} at a(:n):21:11
+! clang-format on
+!$omp target enter data map(to:A(:N))
+ call ompx_dump_mapping_tables()
+!$omp target parallel do
+ do i=1, N
+ A(i)=A(i)*2
+ enddo
+!$omp target exit data map(from:A)
+end program
diff --git a/offload/test/offloading/ompx_bare_ballot_sync.c b/offload/test/offloading/ompx_bare_ballot_sync.c
new file mode 100644
index 0000000..d8e1769
--- /dev/null
+++ b/offload/test/offloading/ompx_bare_ballot_sync.c
@@ -0,0 +1,45 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+//
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#if defined __AMDGCN_WAVEFRONT_SIZE && __AMDGCN_WAVEFRONT_SIZE == 64
+#define MASK 0xaaaaaaaaaaaaaaaa
+#else
+#define MASK 0xaaaaaaaa
+#endif
+
+#include <assert.h>
+#include <ompx.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+int main(int argc, char *argv[]) {
+ const int num_blocks = 1;
+ const int block_size = 256;
+ const int N = num_blocks * block_size;
+ uint64_t *data = (uint64_t *)malloc(N * sizeof(uint64_t));
+
+ for (int i = 0; i < N; ++i)
+ data[i] = i & 0x1;
+
+#pragma omp target teams ompx_bare num_teams(num_blocks) thread_limit(block_size) map(tofrom: data[0:N])
+ {
+ int tid = ompx_thread_id_x();
+ uint64_t mask = ompx_ballot_sync(~0U, data[tid]);
+ data[tid] += mask;
+ }
+
+ for (int i = 0; i < N; ++i)
+ assert(data[i] == ((i & 0x1) + MASK));
+
+ // CHECK: PASS
+ printf("PASS\n");
+
+ return 0;
+}
diff --git a/openmp/cmake/OpenMPTesting.cmake b/openmp/cmake/OpenMPTesting.cmake
index ab2348ae..c67ad8b 100644
--- a/openmp/cmake/OpenMPTesting.cmake
+++ b/openmp/cmake/OpenMPTesting.cmake
@@ -58,7 +58,7 @@ if (${OPENMP_STANDALONE_BUILD})
set(DEFAULT_LIT_ARGS "${DEFAULT_LIT_ARGS} --no-progress-bar")
endif()
if (${CMAKE_SYSTEM_NAME} MATCHES "AIX")
- set(DEFAULT_LIT_ARGS "${DEFAULT_LIT_ARGS} --time-tests --timeout=1800")
+ set(DEFAULT_LIT_ARGS "${DEFAULT_LIT_ARGS} --time-tests --timeout=3000")
endif()
set(OPENMP_LIT_ARGS "${DEFAULT_LIT_ARGS}" CACHE STRING "Options for lit.")
separate_arguments(OPENMP_LIT_ARGS)
diff --git a/openmp/docs/SupportAndFAQ.rst b/openmp/docs/SupportAndFAQ.rst
index 9e6974d..a158422 100644
--- a/openmp/docs/SupportAndFAQ.rst
+++ b/openmp/docs/SupportAndFAQ.rst
@@ -454,6 +454,15 @@ Q: What command line options can I use for OpenMP?
We recommend taking a look at the OpenMP
:doc:`command line argument reference <CommandLineArgumentReference>` page.
+Q: Can I build the offloading runtimes without CUDA or HSA?
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+By default, the offloading runtime will load the associated vendor runtime
+during initialization rather than directly linking against them. This allows the
+program to be built and run on many machine. If you wish to directly link
+against these libraries, use the ``LIBOMPTARGET_DLOPEN_PLUGINS=""`` option to
+suppress it for each plugin. The default value is every plugin enabled with
+``LIBOMPTARGET_PLUGINS_TO_BUILD``.
+
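As a rough illustration (not part of the patch), a build that links the CUDA
plugin directly against the CUDA driver while keeping the other plugins
dlopen-based could be configured as follows; the runtime list, build path, and
plugin names are assumptions based on the options added above:

  $ cmake ../llvm -DLLVM_ENABLE_RUNTIMES="openmp;offload" \
      -DLIBOMPTARGET_PLUGINS_TO_BUILD="cuda;amdgpu;host" \
      -DLIBOMPTARGET_DLOPEN_PLUGINS="amdgpu;host"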
Q: Why is my build taking a long time?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
When installing OpenMP and other LLVM components, the build time on multicore
diff --git a/openmp/docs/remarks/OMP121.rst b/openmp/docs/remarks/OMP121.rst
index 88561b8..f3ceeac 100644
--- a/openmp/docs/remarks/OMP121.rst
+++ b/openmp/docs/remarks/OMP121.rst
@@ -1,6 +1,6 @@
.. _omp121:
-Value has potential side effects preventing SPMD-mode execution. Add `__attribute__((assume(\"ompx_spmd_amenable\")))` to the called function to override. [OMP121]
+Value has potential side effects preventing SPMD-mode execution. Add `[[omp::assume(\"ompx_spmd_amenable\")]]` to the called function to override. [OMP121]
===================================================================================================================================================================
This analysis remarks indicates that a potential side-effect that cannot be
@@ -42,7 +42,7 @@ or operations that cannot be executed in SPMD-mode.
$ clang++ -fopenmp -fopenmp-targets=nvptx64 -O2 -Rpass-analysis=openmp-opt omp121.cpp
omp121.cpp:8:13: remark: Value has potential side effects preventing SPMD-mode
- execution. Add `__attribute__((assume("ompx_spmd_amenable")))` to the called function
+ execution. Add `[[omp::assume("ompx_spmd_amenable")]]` to the called function
to override. [OMP121]
int x = work();
^
@@ -53,7 +53,7 @@ contain any code that prevents SPMD-mode execution.
.. code-block:: c++
- __attribute__((assume("ompx_spmd_amenable"))) extern int work();
+ [[omp::assume("ompx_spmd_amenable")]] extern int work();
void use(int x);
diff --git a/openmp/docs/remarks/OMP133.rst b/openmp/docs/remarks/OMP133.rst
index f025352d..5a73447 100644
--- a/openmp/docs/remarks/OMP133.rst
+++ b/openmp/docs/remarks/OMP133.rst
@@ -1,4 +1,4 @@
-Call may contain unknown parallel regions. Use `__attribute__((assume("omp_no_parallelism")))` to override. [OMP133]
+Call may contain unknown parallel regions. Use `[[omp::assume("omp_no_parallelism")]]` to override. [OMP133]
====================================================================================================================
.. _omp133:
@@ -33,7 +33,7 @@ regions. This is typically coupled with the :ref:`OMP132 <omp132>` remark.
$ clang++ -fopenmp -fopenmp-targets=nvptx64 -O2 -Rpass-analysis=openmp-opt omp133.cpp
omp133.cpp:6:5: remark: Call may contain unknown parallel regions. Use
- `__attribute__((assume("omp_no_parallelism")))` to override. [OMP133]
+ `[[omp::assume("omp_no_parallelism")]]` to override. [OMP133]
setup();
^
@@ -43,7 +43,7 @@ specialized state machine.
.. code-block:: c++
- __attribute__((assume("omp_no_parallelism"))) extern void setup();
+ [[omp::assume("omp_no_parallelism")]] extern void setup();
void foo() {
diff --git a/openmp/docs/remarks/OptimizationRemarks.rst b/openmp/docs/remarks/OptimizationRemarks.rst
index a29dce6..2c683a4 100644
--- a/openmp/docs/remarks/OptimizationRemarks.rst
+++ b/openmp/docs/remarks/OptimizationRemarks.rst
@@ -81,7 +81,7 @@ OpenMP Remarks
* - :ref:`OMP121 <omp121>`
- Analysis
- Value has potential side effects preventing SPMD-mode execution. Add
- `__attribute__((assume(\"ompx_spmd_amenable\")))` to the called function
+ `[[omp::assume(\"ompx_spmd_amenable\")]]` to the called function
to override.
* - :ref:`OMP130 <omp130>`
- Optimization
@@ -96,7 +96,7 @@ OpenMP Remarks
* - :ref:`OMP133 <omp133>`
- Analysis
- Call may contain unknown parallel regions. Use
- `__attribute__((assume("omp_no_parallelism")))` to override.
+ `[[omp::assume("omp_no_parallelism")]]` to override.
* - :ref:`OMP140 <omp140>`
- Analysis
- Could not internalize function. Some optimizations may not be possible.
diff --git a/openmp/runtime/src/include/ompx.h.var b/openmp/runtime/src/include/ompx.h.var
index 579d31a..1985188 100644
--- a/openmp/runtime/src/include/ompx.h.var
+++ b/openmp/runtime/src/include/ompx.h.var
@@ -9,6 +9,8 @@
#ifndef __OMPX_H
#define __OMPX_H
+typedef unsigned long uint64_t;
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -81,6 +83,10 @@ _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C(void, sync_block_divergent, int Ordering,
#undef _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C
///}
+static inline uint64_t ompx_ballot_sync(uint64_t mask, int pred) {
+ __builtin_trap();
+}
+
#pragma omp end declare variant
/// ompx_{sync_block}_{,divergent}
@@ -109,6 +115,8 @@ _TGT_KERNEL_LANGUAGE_DECL_GRID_C(grid_dim)
#undef _TGT_KERNEL_LANGUAGE_DECL_GRID_C
///}
+uint64_t ompx_ballot_sync(uint64_t mask, int pred);
+
#ifdef __cplusplus
}
#endif
@@ -160,6 +168,10 @@ _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_CXX(void, sync_block_divergent,
#undef _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_CXX
///}
+static inline uint64_t ballot_sync(uint64_t mask, int pred) {
+ return ompx_ballot_sync(mask, pred);
+}
+
} // namespace ompx
#endif
diff --git a/openmp/runtime/test/lit.cfg b/openmp/runtime/test/lit.cfg
index e8f7f34..14c7468 100644
--- a/openmp/runtime/test/lit.cfg
+++ b/openmp/runtime/test/lit.cfg
@@ -171,10 +171,14 @@ config.substitutions.append(("%libomp-c99-compile-and-run", \
"%libomp-c99-compile && %libomp-run"))
config.substitutions.append(("%libomp-cxx-compile-and-run", \
"%libomp-cxx-compile && %libomp-run"))
+config.substitutions.append(("%libomp-cxx20-compile-and-run", \
+ "%libomp-cxx20-compile && %libomp-run"))
config.substitutions.append(("%libomp-cxx-compile-c", \
"%clangXX %openmp_flags %flags -std=c++17 -x c++ %s -o %t" + libs))
config.substitutions.append(("%libomp-cxx-compile", \
"%clangXX %openmp_flags %flags -std=c++17 %s -o %t" + libs))
+config.substitutions.append(("%libomp-cxx20-compile", \
+ "%clangXX %openmp_flags %flags -std=c++20 %s -o %t" + libs))
config.substitutions.append(("%libomp-compile", \
"%clang %openmp_flags %flags %s -o %t" + libs))
config.substitutions.append(("%libomp-irbuilder-compile", \
diff --git a/openmp/runtime/test/transform/tile/foreach.cpp b/openmp/runtime/test/transform/tile/foreach.cpp
new file mode 100644
index 0000000..4fb3595
--- /dev/null
+++ b/openmp/runtime/test/transform/tile/foreach.cpp
@@ -0,0 +1,228 @@
+// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdarg>
+#include <cstdio>
+#include <vector>
+
+struct Reporter {
+ const char *name;
+
+ Reporter(const char *name) : name(name) { print("ctor"); }
+
+ Reporter() : name("<anon>") { print("ctor"); }
+
+ Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); }
+
+ Reporter(Reporter &&that) : name(that.name) { print("move ctor"); }
+
+ ~Reporter() { print("dtor"); }
+
+ const Reporter &operator=(const Reporter &that) {
+ print("copy assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ const Reporter &operator=(Reporter &&that) {
+ print("move assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ struct Iterator {
+ const Reporter *owner;
+ int pos;
+
+ Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {}
+
+ Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator copy ctor");
+ }
+
+ Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator move ctor");
+ }
+
+ ~Iterator() { owner->print("iterator dtor"); }
+
+ const Iterator &operator=(const Iterator &that) {
+ owner->print("iterator copy assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ const Iterator &operator=(Iterator &&that) {
+ owner->print("iterator move assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ bool operator==(const Iterator &that) const {
+ owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos);
+ return this->pos == that.pos;
+ }
+
+ Iterator &operator++() {
+ owner->print("iterator prefix ++");
+ pos -= 1;
+ return *this;
+ }
+
+ Iterator operator++(int) {
+ owner->print("iterator postfix ++");
+ auto result = *this;
+ pos -= 1;
+ return result;
+ }
+
+ int operator*() const {
+ int result = 2 - pos;
+ owner->print("iterator deref: %i", result);
+ return result;
+ }
+
+ size_t operator-(const Iterator &that) const {
+ int result = (2 - this->pos) - (2 - that.pos);
+ owner->print("iterator distance: %d", result);
+ return result;
+ }
+
+ Iterator operator+(int steps) const {
+ owner->print("iterator advance: %i += %i", 2 - this->pos, steps);
+ return Iterator(owner, pos - steps);
+ }
+
+ void print(const char *msg) const { owner->print(msg); }
+ };
+
+ Iterator begin() const {
+ print("begin()");
+ return Iterator(this, 2);
+ }
+
+ Iterator end() const {
+ print("end()");
+ return Iterator(this, -1);
+ }
+
+ void print(const char *msg, ...) const {
+ va_list args;
+ va_start(args, msg);
+ printf("[%s] ", name);
+ vprintf(msg, args);
+ printf("\n");
+ va_end(args);
+ }
+};
+
+int main() {
+ printf("do\n");
+#pragma omp tile sizes(2, 2)
+ for (Reporter c{"C"}; auto &&v : Reporter("A"))
+ for (Reporter d{"D"}; auto &&w : Reporter("B"))
+ printf("v=%d w=%d\n", v, w);
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK-NEXT: [C] ctor
+// CHECK-NEXT: [A] ctor
+// CHECK-NEXT: [A] end()
+// CHECK-NEXT: [A] begin()
+// CHECK-NEXT: [A] begin()
+// CHECK-NEXT: [A] iterator distance: 3
+// CHECK-NEXT: [D] ctor
+// CHECK-NEXT: [B] ctor
+// CHECK-NEXT: [B] end()
+// CHECK-NEXT: [B] begin()
+// CHECK-NEXT: [B] begin()
+// CHECK-NEXT: [B] iterator distance: 3
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: v=0 w=0
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: v=0 w=1
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: v=1 w=0
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: v=1 w=1
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: v=0 w=2
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: v=1 w=2
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: v=2 w=0
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: v=2 w=1
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: v=2 w=2
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] dtor
+// CHECK-NEXT: [D] dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] dtor
+// CHECK-NEXT: [C] dtor
+// CHECK-NEXT: done
diff --git a/openmp/runtime/test/transform/tile/iterfor.cpp b/openmp/runtime/test/transform/tile/iterfor.cpp
new file mode 100644
index 0000000..1261354
--- /dev/null
+++ b/openmp/runtime/test/transform/tile/iterfor.cpp
@@ -0,0 +1,233 @@
+// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdarg>
+#include <cstdio>
+#include <vector>
+
+struct Reporter {
+ const char *name;
+
+ Reporter(const char *name) : name(name) { print("ctor"); }
+
+ Reporter() : name("<anon>") { print("ctor"); }
+
+ Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); }
+
+ Reporter(Reporter &&that) : name(that.name) { print("move ctor"); }
+
+ ~Reporter() { print("dtor"); }
+
+ const Reporter &operator=(const Reporter &that) {
+ print("copy assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ const Reporter &operator=(Reporter &&that) {
+ print("move assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ struct Iterator {
+ const Reporter *owner;
+ int pos;
+
+ Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {}
+
+ Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator copy ctor");
+ }
+
+ Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator move ctor");
+ }
+
+ ~Iterator() { owner->print("iterator dtor"); }
+
+ const Iterator &operator=(const Iterator &that) {
+ owner->print("iterator copy assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ const Iterator &operator=(Iterator &&that) {
+ owner->print("iterator move assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ bool operator==(const Iterator &that) const {
+ owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos);
+ return this->pos == that.pos;
+ }
+
+ bool operator!=(const Iterator &that) const {
+ owner->print("iterator %d != %d", 2 - this->pos, 2 - that.pos);
+    return this->pos != that.pos;
+ }
+
+ Iterator &operator++() {
+ owner->print("iterator prefix ++");
+ pos -= 1;
+ return *this;
+ }
+
+ Iterator operator++(int) {
+ owner->print("iterator postfix ++");
+ auto result = *this;
+ pos -= 1;
+ return result;
+ }
+
+ int operator*() const {
+ int result = 2 - pos;
+ owner->print("iterator deref: %i", result);
+ return result;
+ }
+
+ size_t operator-(const Iterator &that) const {
+ int result = (2 - this->pos) - (2 - that.pos);
+ owner->print("iterator distance: %d", result);
+ return result;
+ }
+
+ Iterator operator+(int steps) const {
+ owner->print("iterator advance: %i += %i", 2 - this->pos, steps);
+ return Iterator(owner, pos - steps);
+ }
+ };
+
+ Iterator begin() const {
+ print("begin()");
+ return Iterator(this, 2);
+ }
+
+ Iterator end() const {
+ print("end()");
+ return Iterator(this, -1);
+ }
+
+ void print(const char *msg, ...) const {
+ va_list args;
+ va_start(args, msg);
+ printf("[%s] ", name);
+ vprintf(msg, args);
+ printf("\n");
+ va_end(args);
+ }
+};
+
+int main() {
+ printf("do\n");
+ {
+ Reporter A("A"), B("B");
+#pragma omp tile sizes(2, 2)
+ for (auto it = A.begin(); it != A.end(); ++it)
+ for (auto jt = B.begin(); jt != B.end(); ++jt)
+ printf("i=%d j=%d\n", *it, *jt);
+ }
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK-NEXT: [A] ctor
+// CHECK-NEXT: [B] ctor
+// CHECK-NEXT: [A] begin()
+// CHECK-NEXT: [A] begin()
+// CHECK-NEXT: [A] end()
+// CHECK-NEXT: [A] iterator distance: 3
+// CHECK-NEXT: [B] begin()
+// CHECK-NEXT: [B] begin()
+// CHECK-NEXT: [B] end()
+// CHECK-NEXT: [B] iterator distance: 3
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: i=0 j=0
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: i=0 j=1
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: i=1 j=0
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: i=1 j=1
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: i=0 j=2
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: i=1 j=2
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: i=2 j=0
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: i=2 j=1
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: i=2 j=2
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] dtor
+// CHECK-NEXT: [A] dtor
+// CHECK-NEXT: done
diff --git a/openmp/runtime/test/transform/tile/parallel-wsloop-collapse-foreach.cpp b/openmp/runtime/test/transform/tile/parallel-wsloop-collapse-foreach.cpp
new file mode 100644
index 0000000..b1f4d98
--- /dev/null
+++ b/openmp/runtime/test/transform/tile/parallel-wsloop-collapse-foreach.cpp
@@ -0,0 +1,366 @@
+// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdarg>
+#include <cstdio>
+#include <vector>
+
+struct Reporter {
+ const char *name;
+
+ Reporter(const char *name) : name(name) { print("ctor"); }
+
+ Reporter() : name("<anon>") { print("ctor"); }
+
+ Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); }
+
+ Reporter(Reporter &&that) : name(that.name) { print("move ctor"); }
+
+ ~Reporter() { print("dtor"); }
+
+ const Reporter &operator=(const Reporter &that) {
+ print("copy assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ const Reporter &operator=(Reporter &&that) {
+ print("move assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ struct Iterator {
+ const Reporter *owner;
+ int pos;
+
+ Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {}
+
+ Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator copy ctor");
+ }
+
+ Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator move ctor");
+ }
+
+ ~Iterator() { owner->print("iterator dtor"); }
+
+ const Iterator &operator=(const Iterator &that) {
+ owner->print("iterator copy assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ const Iterator &operator=(Iterator &&that) {
+ owner->print("iterator move assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ bool operator==(const Iterator &that) const {
+ owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos);
+ return this->pos == that.pos;
+ }
+
+ Iterator &operator++() {
+ owner->print("iterator prefix ++");
+ pos -= 1;
+ return *this;
+ }
+
+ Iterator operator++(int) {
+ owner->print("iterator postfix ++");
+ auto result = *this;
+ pos -= 1;
+ return result;
+ }
+
+ int operator*() const {
+ int result = 2 - pos;
+ owner->print("iterator deref: %i", result);
+ return result;
+ }
+
+ size_t operator-(const Iterator &that) const {
+ int result = (2 - this->pos) - (2 - that.pos);
+ owner->print("iterator distance: %d", result);
+ return result;
+ }
+
+ Iterator operator+(int steps) const {
+ owner->print("iterator advance: %i += %i", 2 - this->pos, steps);
+ return Iterator(owner, pos - steps);
+ }
+ };
+
+ Iterator begin() const {
+ print("begin()");
+ return Iterator(this, 2);
+ }
+
+ Iterator end() const {
+ print("end()");
+ return Iterator(this, -1);
+ }
+
+ void print(const char *msg, ...) const {
+ va_list args;
+ va_start(args, msg);
+ printf("[%s] ", name);
+ vprintf(msg, args);
+ printf("\n");
+ va_end(args);
+ }
+};
+
+int main() {
+ printf("do\n");
+#pragma omp parallel for collapse(3) num_threads(1)
+ for (int i = 0; i < 3; ++i)
+#pragma omp tile sizes(2, 2)
+ for (Reporter c{"C"}; auto &&v : Reporter("A"))
+ for (Reporter d{"D"}; auto &&w : Reporter("B"))
+ printf("i=%d v=%d w=%d\n", i, v, w);
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK-NEXT: [C] ctor
+// CHECK-NEXT: [A] ctor
+// CHECK-NEXT: [A] end()
+// CHECK-NEXT: [A] begin()
+// CHECK-NEXT: [A] begin()
+// CHECK-NEXT: [A] iterator distance: 3
+// CHECK-NEXT: [D] ctor
+// CHECK-NEXT: [B] ctor
+// CHECK-NEXT: [B] end()
+// CHECK-NEXT: [B] begin()
+// CHECK-NEXT: [B] begin()
+// CHECK-NEXT: [B] iterator distance: 3
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: i=0 v=0 w=0
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: i=0 v=0 w=1
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: i=0 v=1 w=0
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: i=0 v=1 w=1
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: i=0 v=0 w=2
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: i=0 v=1 w=2
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: i=0 v=2 w=0
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: i=0 v=2 w=1
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: i=0 v=2 w=2
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: i=1 v=0 w=0
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: i=1 v=0 w=1
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: i=1 v=1 w=0
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: i=1 v=1 w=1
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: i=1 v=0 w=2
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: i=1 v=1 w=2
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: i=1 v=2 w=0
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: i=1 v=2 w=1
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: i=1 v=2 w=2
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: i=2 v=0 w=0
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: i=2 v=0 w=1
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: i=2 v=1 w=0
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: i=2 v=1 w=1
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: i=2 v=0 w=2
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: i=2 v=1 w=2
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: i=2 v=2 w=0
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: i=2 v=2 w=1
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: i=2 v=2 w=2
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] dtor
+// CHECK-NEXT: [D] dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] dtor
+// CHECK-NEXT: [C] dtor
+// CHECK-NEXT: done
diff --git a/openmp/runtime/test/transform/unroll/factor_foreach.cpp b/openmp/runtime/test/transform/unroll/factor_foreach.cpp
new file mode 100644
index 0000000..29fef7c
--- /dev/null
+++ b/openmp/runtime/test/transform/unroll/factor_foreach.cpp
@@ -0,0 +1,162 @@
+// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdarg>
+#include <cstdio>
+#include <vector>
+
+struct Reporter {
+ const char *name;
+
+ Reporter(const char *name) : name(name) { print("ctor"); }
+
+ Reporter() : name("<anon>") { print("ctor"); }
+
+ Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); }
+
+ Reporter(Reporter &&that) : name(that.name) { print("move ctor"); }
+
+ ~Reporter() { print("dtor"); }
+
+ const Reporter &operator=(const Reporter &that) {
+ print("copy assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ const Reporter &operator=(Reporter &&that) {
+ print("move assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ struct Iterator {
+ const Reporter *owner;
+ int pos;
+
+ Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {}
+
+ Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator copy ctor");
+ }
+
+ Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator move ctor");
+ }
+
+ ~Iterator() { owner->print("iterator dtor"); }
+
+ const Iterator &operator=(const Iterator &that) {
+ owner->print("iterator copy assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ const Iterator &operator=(Iterator &&that) {
+ owner->print("iterator move assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ bool operator==(const Iterator &that) const {
+ owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos);
+ return this->pos == that.pos;
+ }
+
+ bool operator!=(const Iterator &that) const {
+ owner->print("iterator %d != %d", 2 - this->pos, 2 - that.pos);
+ return this->pos != that.pos;
+ }
+
+ Iterator &operator++() {
+ owner->print("iterator prefix ++");
+ pos -= 1;
+ return *this;
+ }
+
+ Iterator operator++(int) {
+ owner->print("iterator postfix ++");
+ auto result = *this;
+ pos -= 1;
+ return result;
+ }
+
+ int operator*() const {
+ int result = 2 - pos;
+ owner->print("iterator deref: %i", result);
+ return result;
+ }
+
+ size_t operator-(const Iterator &that) const {
+ int result = (2 - this->pos) - (2 - that.pos);
+ owner->print("iterator distance: %d", result);
+ return result;
+ }
+
+ Iterator operator+(int steps) const {
+ owner->print("iterator advance: %i += %i", 2 - this->pos, steps);
+ return Iterator(owner, pos - steps);
+ }
+
+ void print(const char *msg) const { owner->print(msg); }
+ };
+
+ Iterator begin() const {
+ print("begin()");
+ return Iterator(this, 2);
+ }
+
+ Iterator end() const {
+ print("end()");
+ return Iterator(this, -1);
+ }
+
+ void print(const char *msg, ...) const {
+ va_list args;
+ va_start(args, msg);
+ printf("[%s] ", name);
+ vprintf(msg, args);
+ printf("\n");
+ va_end(args);
+ }
+};
+
+int main() {
+ printf("do\n");
+#pragma omp unroll partial(2)
+ for (Reporter c{"init-stmt"}; auto &&v : Reporter("range"))
+ printf("v=%d\n", v);
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK-NEXT: [init-stmt] ctor
+// CHECK-NEXT: [range] ctor
+// CHECK-NEXT: [range] begin()
+// CHECK-NEXT: [range] end()
+// CHECK-NEXT: [range] iterator 0 != 3
+// CHECK-NEXT: [range] iterator deref: 0
+// CHECK-NEXT: v=0
+// CHECK-NEXT: [range] iterator prefix ++
+// CHECK-NEXT: [range] iterator 1 != 3
+// CHECK-NEXT: [range] iterator deref: 1
+// CHECK-NEXT: v=1
+// CHECK-NEXT: [range] iterator prefix ++
+// CHECK-NEXT: [range] iterator 2 != 3
+// CHECK-NEXT: [range] iterator deref: 2
+// CHECK-NEXT: v=2
+// CHECK-NEXT: [range] iterator prefix ++
+// CHECK-NEXT: [range] iterator 3 != 3
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] dtor
+// CHECK-NEXT: [init-stmt] dtor
+// CHECK-NEXT: done
diff --git a/openmp/runtime/test/transform/unroll/factor_intfor.c b/openmp/runtime/test/transform/unroll/factor_intfor.c
new file mode 100644
index 0000000..42ebeb4
--- /dev/null
+++ b/openmp/runtime/test/transform/unroll/factor_intfor.c
@@ -0,0 +1,25 @@
+// RUN: %libomp-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <stdlib.h>
+#include <stdio.h>
+
+int main() {
+ printf("do\n");
+#pragma omp unroll partial(2)
+ for (int i = 7; i < 19; i += 3)
+ printf("i=%d\n", i);
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK-NEXT: i=7
+// CHECK-NEXT: i=10
+// CHECK-NEXT: i=13
+// CHECK-NEXT: i=16
+// CHECK-NEXT: done
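
For reference (illustrative only, not part of the test above): `partial(2)` conceptually doubles the loop step and emits the body twice with a remainder guard, so the printed sequence is unchanged. A sketch of the transformed loop:

    #include <stdio.h>

    int main(void) {
      /* Conceptual effect of '#pragma omp unroll partial(2)' on the loop
         above: two body copies per iteration, guarded so the trip values
         (i = 7, 10, 13, 16) stay the same. */
      for (int i = 7; i < 19; i += 6) { /* 2 x the original step of 3 */
        printf("i=%d\n", i);
        if (i + 3 < 19)
          printf("i=%d\n", i + 3);
      }
      return 0;
    }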
diff --git a/openmp/runtime/test/transform/unroll/factor_iterfor.cpp b/openmp/runtime/test/transform/unroll/factor_iterfor.cpp
new file mode 100644
index 0000000..0298477
--- /dev/null
+++ b/openmp/runtime/test/transform/unroll/factor_iterfor.cpp
@@ -0,0 +1,169 @@
+// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdarg>
+#include <cstdio>
+#include <vector>
+
+struct Reporter {
+ const char *name;
+
+ Reporter(const char *name) : name(name) { print("ctor"); }
+
+ Reporter() : name("<anon>") { print("ctor"); }
+
+ Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); }
+
+ Reporter(Reporter &&that) : name(that.name) { print("move ctor"); }
+
+ ~Reporter() { print("dtor"); }
+
+ const Reporter &operator=(const Reporter &that) {
+ print("copy assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ const Reporter &operator=(Reporter &&that) {
+ print("move assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ struct Iterator {
+ const Reporter *owner;
+ int pos;
+
+ Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {}
+
+ Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) {
+ print("iterator copy ctor");
+ }
+
+ Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) {
+ print("iterator move ctor");
+ }
+
+ ~Iterator() { print("iterator dtor"); }
+
+ const Iterator &operator=(const Iterator &that) {
+ print("iterator copy assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ const Iterator &operator=(Iterator &&that) {
+ print("iterator move assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ bool operator==(const Iterator &that) const {
+ owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos);
+ return this->pos == that.pos;
+ }
+
+ bool operator!=(const Iterator &that) const {
+ owner->print("iterator %d != %d", 2 - this->pos, 2 - that.pos);
+ return this->pos != that.pos;
+ }
+
+ Iterator &operator++() {
+ print("iterator prefix ++");
+ pos -= 1;
+ return *this;
+ }
+
+ Iterator operator++(int) {
+ print("iterator postfix ++");
+ auto result = *this;
+ pos -= 1;
+ return result;
+ }
+
+ int operator*() const {
+ int result = 2 - pos;
+ owner->print("iterator deref: %i", result);
+ return result;
+ }
+
+ size_t operator-(const Iterator &that) const {
+ int result = (2 - this->pos) - (2 - that.pos);
+ owner->print("iterator distance: %d", result);
+ return result;
+ }
+
+ Iterator operator+(int steps) const {
+ owner->print("iterator advance: %i += %i", 2 - this->pos, steps);
+ return Iterator(owner, pos - steps);
+ }
+
+ void print(const char *msg) const { owner->print(msg); }
+ };
+
+ Iterator begin() const {
+ print("begin()");
+ return Iterator(this, 2);
+ }
+
+ Iterator end() const {
+ print("end()");
+ return Iterator(this, -1);
+ }
+
+ void print(const char *msg, ...) const {
+ va_list args;
+ va_start(args, msg);
+ printf("[%s] ", name);
+ vprintf(msg, args);
+ printf("\n");
+ va_end(args);
+ }
+};
+
+int main() {
+ printf("do\n");
+ {
+ Reporter range("range");
+#pragma omp unroll partial(2)
+ for (auto it = range.begin(); it != range.end(); ++it)
+ printf("v=%d\n", *it);
+ }
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK-NEXT: [range] ctor
+// CHECK-NEXT: [range] begin()
+// CHECK-NEXT: [range] end()
+// CHECK-NEXT: [range] iterator 0 != 3
+// CHECK-NEXT: [range] iterator deref: 0
+// CHECK-NEXT: v=0
+// CHECK-NEXT: [range] iterator prefix ++
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] end()
+// CHECK-NEXT: [range] iterator 1 != 3
+// CHECK-NEXT: [range] iterator deref: 1
+// CHECK-NEXT: v=1
+// CHECK-NEXT: [range] iterator prefix ++
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] end()
+// CHECK-NEXT: [range] iterator 2 != 3
+// CHECK-NEXT: [range] iterator deref: 2
+// CHECK-NEXT: v=2
+// CHECK-NEXT: [range] iterator prefix ++
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] end()
+// CHECK-NEXT: [range] iterator 3 != 3
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] dtor
+// CHECK-NEXT: done
diff --git a/openmp/runtime/test/transform/unroll/factor_parallel-wsloop-collapse-foreach.cpp b/openmp/runtime/test/transform/unroll/factor_parallel-wsloop-collapse-foreach.cpp
new file mode 100644
index 0000000..71567fa
--- /dev/null
+++ b/openmp/runtime/test/transform/unroll/factor_parallel-wsloop-collapse-foreach.cpp
@@ -0,0 +1,199 @@
+// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdarg>
+#include <cstdio>
+#include <vector>
+
+struct Reporter {
+ const char *name;
+
+ Reporter(const char *name) : name(name) { print("ctor"); }
+
+ Reporter() : name("<anon>") { print("ctor"); }
+
+ Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); }
+
+ Reporter(Reporter &&that) : name(that.name) { print("move ctor"); }
+
+ ~Reporter() { print("dtor"); }
+
+ const Reporter &operator=(const Reporter &that) {
+ print("copy assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ const Reporter &operator=(Reporter &&that) {
+ print("move assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ struct Iterator {
+ const Reporter *owner;
+ int pos;
+
+ Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {}
+
+ Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator copy ctor");
+ }
+
+ Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator move ctor");
+ }
+
+ ~Iterator() { owner->print("iterator dtor"); }
+
+ const Iterator &operator=(const Iterator &that) {
+ owner->print("iterator copy assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ const Iterator &operator=(Iterator &&that) {
+ owner->print("iterator move assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ bool operator==(const Iterator &that) const {
+ owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos);
+ return this->pos == that.pos;
+ }
+
+ bool operator!=(const Iterator &that) const {
+ owner->print("iterator %d != %d", 2 - this->pos, 2 - that.pos);
+ return this->pos != that.pos;
+ }
+
+ Iterator &operator++() {
+ owner->print("iterator prefix ++");
+ pos -= 1;
+ return *this;
+ }
+
+ Iterator operator++(int) {
+ owner->print("iterator postfix ++");
+ auto result = *this;
+ pos -= 1;
+ return result;
+ }
+
+ int operator*() const {
+ int result = 2 - pos;
+ owner->print("iterator deref: %i", result);
+ return result;
+ }
+
+ size_t operator-(const Iterator &that) const {
+ int result = (2 - this->pos) - (2 - that.pos);
+ owner->print("iterator distance: %d", result);
+ return result;
+ }
+
+ Iterator operator+(int steps) const {
+ owner->print("iterator advance: %i += %i", 2 - this->pos, steps);
+ return Iterator(owner, pos - steps);
+ }
+
+ void print(const char *msg) const { owner->print(msg); }
+ };
+
+ Iterator begin() const {
+ print("begin()");
+ return Iterator(this, 2);
+ }
+
+ Iterator end() const {
+ print("end()");
+ return Iterator(this, -1);
+ }
+
+ void print(const char *msg, ...) const {
+ va_list args;
+ va_start(args, msg);
+ printf("[%s] ", name);
+ vprintf(msg, args);
+ printf("\n");
+ va_end(args);
+ }
+};
+
+int main() {
+ printf("do\n");
+#pragma omp parallel for collapse(2) num_threads(1)
+ for (int i = 0; i < 3; ++i)
+#pragma omp unroll partial(2)
+ for (Reporter c{"init-stmt"}; auto &&v : Reporter("range"))
+ printf("i=%d v=%d\n", i, v);
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK-NEXT: [init-stmt] ctor
+// CHECK-NEXT: [range] ctor
+// CHECK-NEXT: [range] end()
+// CHECK-NEXT: [range] begin()
+// CHECK-NEXT: [range] begin()
+// CHECK-NEXT: [range] iterator distance: 3
+// CHECK-NEXT: [range] iterator advance: 0 += 0
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 0
+// CHECK-NEXT: i=0 v=0
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 1
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 1
+// CHECK-NEXT: i=0 v=1
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 2
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 2
+// CHECK-NEXT: i=0 v=2
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 0
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 0
+// CHECK-NEXT: i=1 v=0
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 1
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 1
+// CHECK-NEXT: i=1 v=1
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 2
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 2
+// CHECK-NEXT: i=1 v=2
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 0
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 0
+// CHECK-NEXT: i=2 v=0
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 1
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 1
+// CHECK-NEXT: i=2 v=1
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 2
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 2
+// CHECK-NEXT: i=2 v=2
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] dtor
+// CHECK-NEXT: [init-stmt] dtor
+// CHECK-NEXT: done
diff --git a/openmp/runtime/test/transform/unroll/factor_parallel-wsloop-collapse-intfor.cpp b/openmp/runtime/test/transform/unroll/factor_parallel-wsloop-collapse-intfor.cpp
new file mode 100644
index 0000000..0a31f8d
--- /dev/null
+++ b/openmp/runtime/test/transform/unroll/factor_parallel-wsloop-collapse-intfor.cpp
@@ -0,0 +1,32 @@
+// RUN: %libomp-cxx-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdio>
+
+int main() {
+ printf("do\n");
+#pragma omp parallel for collapse(2) num_threads(1)
+ for (int i = 0; i < 3; ++i)
+#pragma omp unroll partial(2)
+ for (int j = 0; j < 3; ++j)
+ printf("i=%d j=%d\n", i, j);
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK-NEXT: i=0 j=0
+// CHECK-NEXT: i=0 j=1
+// CHECK-NEXT: i=0 j=2
+// CHECK-NEXT: i=1 j=0
+// CHECK-NEXT: i=1 j=1
+// CHECK-NEXT: i=1 j=2
+// CHECK-NEXT: i=2 j=0
+// CHECK-NEXT: i=2 j=1
+// CHECK-NEXT: i=2 j=2
+// CHECK-NEXT: done
diff --git a/openmp/runtime/test/transform/unroll/full_intfor.c b/openmp/runtime/test/transform/unroll/full_intfor.c
new file mode 100644
index 0000000..0814511
--- /dev/null
+++ b/openmp/runtime/test/transform/unroll/full_intfor.c
@@ -0,0 +1,25 @@
+// RUN: %libomp-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <stdlib.h>
+#include <stdio.h>
+
+int main() {
+ printf("do\n");
+#pragma omp unroll full
+ for (int i = 7; i < 19; i += 3)
+ printf("i=%d\n", i);
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK-NEXT: i=7
+// CHECK-NEXT: i=10
+// CHECK-NEXT: i=13
+// CHECK-NEXT: i=16
+// CHECK-NEXT: done
diff --git a/openmp/runtime/test/transform/unroll/heuristic_intfor.c b/openmp/runtime/test/transform/unroll/heuristic_intfor.c
new file mode 100644
index 0000000..b07bec7
--- /dev/null
+++ b/openmp/runtime/test/transform/unroll/heuristic_intfor.c
@@ -0,0 +1,25 @@
+// RUN: %libomp-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <stdlib.h>
+#include <stdio.h>
+
+int main() {
+ printf("do\n");
+#pragma omp unroll
+ for (int i = 7; i < 19; i += 3)
+ printf("i=%d\n", i);
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK-NEXT: i=7
+// CHECK-NEXT: i=10
+// CHECK-NEXT: i=13
+// CHECK-NEXT: i=16
+// CHECK-NEXT: done
diff --git a/openmp/runtime/test/transform/unroll/partial_intfor.c b/openmp/runtime/test/transform/unroll/partial_intfor.c
new file mode 100644
index 0000000..2ede94e
--- /dev/null
+++ b/openmp/runtime/test/transform/unroll/partial_intfor.c
@@ -0,0 +1,25 @@
+// RUN: %libomp-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <stdlib.h>
+#include <stdio.h>
+
+int main() {
+ printf("do\n");
+#pragma omp unroll partial
+ for (int i = 7; i < 19; i += 3)
+ printf("i=%d\n", i);
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK-NEXT: i=7
+// CHECK-NEXT: i=10
+// CHECK-NEXT: i=13
+// CHECK-NEXT: i=16
+// CHECK-NEXT: done
diff --git a/polly/test/CodeGen/20100617.ll b/polly/test/CodeGen/20100617.ll
index 71a889f..7229a6e 100644
--- a/polly/test/CodeGen/20100617.ll
+++ b/polly/test/CodeGen/20100617.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define void @init_array() nounwind {
diff --git a/polly/test/CodeGen/20100622.ll b/polly/test/CodeGen/20100622.ll
index 872d6a0..bed7377 100644
--- a/polly/test/CodeGen/20100622.ll
+++ b/polly/test/CodeGen/20100622.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-codegen -disable-output < %s
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | not FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s | not FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
diff --git a/polly/test/CodeGen/20100707.ll b/polly/test/CodeGen/20100707.ll
index 3381980..ee0422e 100644
--- a/polly/test/CodeGen/20100707.ll
+++ b/polly/test/CodeGen/20100707.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define void @clause_SetSplitField(i32 %Length) nounwind inlinehint {
diff --git a/polly/test/CodeGen/20100707_2.ll b/polly/test/CodeGen/20100707_2.ll
index df784c6..a4cd76a 100644
--- a/polly/test/CodeGen/20100707_2.ll
+++ b/polly/test/CodeGen/20100707_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
@win193 = external global [4 x [36 x double]], align 32 ; <ptr> [#uses=3]
diff --git a/polly/test/CodeGen/20100708.ll b/polly/test/CodeGen/20100708.ll
index 50b8e38..9080451 100644
--- a/polly/test/CodeGen/20100708.ll
+++ b/polly/test/CodeGen/20100708.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-detect < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define fastcc void @execute() nounwind {
diff --git a/polly/test/CodeGen/20100708_2.ll b/polly/test/CodeGen/20100708_2.ll
index 2f4807d..51dc9d3 100644
--- a/polly/test/CodeGen/20100708_2.ll
+++ b/polly/test/CodeGen/20100708_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define void @init_array() nounwind {
diff --git a/polly/test/CodeGen/20100713.ll b/polly/test/CodeGen/20100713.ll
index edd352a..a836795 100644
--- a/polly/test/CodeGen/20100713.ll
+++ b/polly/test/CodeGen/20100713.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define void @fft_float(i32 %NumSamples) nounwind {
diff --git a/polly/test/CodeGen/20100713_2.ll b/polly/test/CodeGen/20100713_2.ll
index 92f8959..28b984b 100644
--- a/polly/test/CodeGen/20100713_2.ll
+++ b/polly/test/CodeGen/20100713_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define hidden void @luaD_callhook() nounwind {
diff --git a/polly/test/CodeGen/20100717.ll b/polly/test/CodeGen/20100717.ll
index a400eea..51c453c 100644
--- a/polly/test/CodeGen/20100717.ll
+++ b/polly/test/CodeGen/20100717.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define void @matrixTranspose(ptr %A) nounwind {
diff --git a/polly/test/CodeGen/20100718-DomInfo-2.ll b/polly/test/CodeGen/20100718-DomInfo-2.ll
index 512b4c5..fdac75f 100644
--- a/polly/test/CodeGen/20100718-DomInfo-2.ll
+++ b/polly/test/CodeGen/20100718-DomInfo-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -verify-dom-info -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-dom-info -disable-output < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define void @getNonAffNeighbour() nounwind {
diff --git a/polly/test/CodeGen/20100718-DomInfo.ll b/polly/test/CodeGen/20100718-DomInfo.ll
index e123343..da68eb0 100644
--- a/polly/test/CodeGen/20100718-DomInfo.ll
+++ b/polly/test/CodeGen/20100718-DomInfo.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -verify-dom-info -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-dom-info -disable-output < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define void @intrapred_luma_16x16(i32 %predmode) nounwind {
diff --git a/polly/test/CodeGen/20100720-MultipleConditions.ll b/polly/test/CodeGen/20100720-MultipleConditions.ll
index 9f22687..3dece4ef 100644
--- a/polly/test/CodeGen/20100720-MultipleConditions.ll
+++ b/polly/test/CodeGen/20100720-MultipleConditions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-ast -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s
;int bar1();
;int bar2();
diff --git a/polly/test/CodeGen/20100809-IndependentBlock.ll b/polly/test/CodeGen/20100809-IndependentBlock.ll
index 8d59668..f45b654 100644
--- a/polly/test/CodeGen/20100809-IndependentBlock.ll
+++ b/polly/test/CodeGen/20100809-IndependentBlock.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define void @cfft2(ptr %x) nounwind {
entry:
diff --git a/polly/test/CodeGen/20100811-ScalarDependencyBetweenBrAndCnd.ll b/polly/test/CodeGen/20100811-ScalarDependencyBetweenBrAndCnd.ll
index 261a205..82da9d2 100644
--- a/polly/test/CodeGen/20100811-ScalarDependencyBetweenBrAndCnd.ll
+++ b/polly/test/CodeGen/20100811-ScalarDependencyBetweenBrAndCnd.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s
target datalayout =
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/CodeGen/20101030-Overflow.ll b/polly/test/CodeGen/20101030-Overflow.ll
index caaa4851..fecdb9d 100644
--- a/polly/test/CodeGen/20101030-Overflow.ll
+++ b/polly/test/CodeGen/20101030-Overflow.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define void @compdecomp() nounwind {
diff --git a/polly/test/CodeGen/20101103-Overflow3.ll b/polly/test/CodeGen/20101103-Overflow3.ll
index b2faf14..f1503e2 100644
--- a/polly/test/CodeGen/20101103-Overflow3.ll
+++ b/polly/test/CodeGen/20101103-Overflow3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define void @Reflection_coefficients(ptr %r) nounwind {
bb20:
diff --git a/polly/test/CodeGen/20101103-signmissmatch.ll b/polly/test/CodeGen/20101103-signmissmatch.ll
index e157d29..3d0c929 100644
--- a/polly/test/CodeGen/20101103-signmissmatch.ll
+++ b/polly/test/CodeGen/20101103-signmissmatch.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define void @CleanNet() nounwind {
diff --git a/polly/test/CodeGen/20110226-Ignore-Dead-Code.ll b/polly/test/CodeGen/20110226-Ignore-Dead-Code.ll
index c792d8c..0e62e67 100644
--- a/polly/test/CodeGen/20110226-Ignore-Dead-Code.ll
+++ b/polly/test/CodeGen/20110226-Ignore-Dead-Code.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define void @main() nounwind {
diff --git a/polly/test/CodeGen/20110226-PHI-Node-removed.ll b/polly/test/CodeGen/20110226-PHI-Node-removed.ll
index 3458d75..32b018f 100644
--- a/polly/test/CodeGen/20110226-PHI-Node-removed.ll
+++ b/polly/test/CodeGen/20110226-PHI-Node-removed.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/CodeGen/20120316-InvalidCast.ll b/polly/test/CodeGen/20120316-InvalidCast.ll
index 8355cc5..b87a3dc 100644
--- a/polly/test/CodeGen/20120316-InvalidCast.ll
+++ b/polly/test/CodeGen/20120316-InvalidCast.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
; CHECK: polly.start
diff --git a/polly/test/CodeGen/20120403-RHS-type-mismatch.ll b/polly/test/CodeGen/20120403-RHS-type-mismatch.ll
index 1d629e3..dac78bf 100644
--- a/polly/test/CodeGen/20120403-RHS-type-mismatch.ll
+++ b/polly/test/CodeGen/20120403-RHS-type-mismatch.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
; We just check that this compilation does not crash.
diff --git a/polly/test/CodeGen/20130221.ll b/polly/test/CodeGen/20130221.ll
index 4541467..5728a76 100644
--- a/polly/test/CodeGen/20130221.ll
+++ b/polly/test/CodeGen/20130221.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
define void @list_sequence(ptr %A) {
diff --git a/polly/test/CodeGen/20150328-SCEVExpanderIntroducesNewIV.ll b/polly/test/CodeGen/20150328-SCEVExpanderIntroducesNewIV.ll
index d54be5c..cafd68e 100644
--- a/polly/test/CodeGen/20150328-SCEVExpanderIntroducesNewIV.ll
+++ b/polly/test/CodeGen/20150328-SCEVExpanderIntroducesNewIV.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/Intrinsics/llvm-expect.ll b/polly/test/CodeGen/Intrinsics/llvm-expect.ll
index 84057e2..47fd4f07 100644
--- a/polly/test/CodeGen/Intrinsics/llvm-expect.ll
+++ b/polly/test/CodeGen/Intrinsics/llvm-expect.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
;
; Check that we generate code without crashing.
;
diff --git a/polly/test/CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll b/polly/test/CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll
index b043195..2853124 100644
--- a/polly/test/CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll
+++ b/polly/test/CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll
@@ -1,6 +1,6 @@
; This test checks that we do not accidentally mutate the debug info when
; inserting loop parallel metadata.
-; RUN: opt %loadPolly < %s -S -polly -polly-codegen -polly-ast-detect-parallel | FileCheck %s
+; RUN: opt %loadNPMPolly < %s -S -polly -passes=polly-codegen -polly-ast-detect-parallel | FileCheck %s
; CHECK-NOT: !7 = !{!7}
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/LoopParallelMD/loop_nest_param_parallel.ll b/polly/test/CodeGen/LoopParallelMD/loop_nest_param_parallel.ll
index 7b131c5..9bb086f 100644
--- a/polly/test/CodeGen/LoopParallelMD/loop_nest_param_parallel.ll
+++ b/polly/test/CodeGen/LoopParallelMD/loop_nest_param_parallel.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-ast-detect-parallel -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-ast-detect-parallel -S < %s | FileCheck %s
;
; Check that we mark multiple parallel loops correctly including the memory instructions.
;
diff --git a/polly/test/CodeGen/LoopParallelMD/single_loop_param_parallel.ll b/polly/test/CodeGen/LoopParallelMD/single_loop_param_parallel.ll
index ec927ac..96b50ce 100644
--- a/polly/test/CodeGen/LoopParallelMD/single_loop_param_parallel.ll
+++ b/polly/test/CodeGen/LoopParallelMD/single_loop_param_parallel.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s -check-prefix=SEQUENTIAL
-; RUN: opt %loadPolly -polly-codegen -polly-ast-detect-parallel -S < %s | FileCheck %s -check-prefix=PARALLEL
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=SEQUENTIAL
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-ast-detect-parallel -S < %s | FileCheck %s -check-prefix=PARALLEL
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
; This is a trivially parallel loop. We just use it to ensure that we actually
diff --git a/polly/test/CodeGen/MemAccess/bad_alignment.ll b/polly/test/CodeGen/MemAccess/bad_alignment.ll
index 32f3cfe..82fff27 100644
--- a/polly/test/CodeGen/MemAccess/bad_alignment.ll
+++ b/polly/test/CodeGen/MemAccess/bad_alignment.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -disable-output 2>&1 < %s | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly -passes=polly-import-jscop -disable-output 2>&1 < %s | FileCheck %s
;
; Check that we do not allow to access elements not accessed before because the
; alignment information would become invalid.
diff --git a/polly/test/CodeGen/MemAccess/codegen_address_space.ll b/polly/test/CodeGen/MemAccess/codegen_address_space.ll
index 7c9b12d..3ce363e 100644
--- a/polly/test/CodeGen/MemAccess/codegen_address_space.ll
+++ b/polly/test/CodeGen/MemAccess/codegen_address_space.ll
@@ -1,4 +1,4 @@
-;RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-codegen < %s -S | FileCheck %s
+;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed < %s -S | FileCheck %s
;int A[100];
;
diff --git a/polly/test/CodeGen/MemAccess/codegen_constant_offset.ll b/polly/test/CodeGen/MemAccess/codegen_constant_offset.ll
index e008a78..0563ca8 100644
--- a/polly/test/CodeGen/MemAccess/codegen_constant_offset.ll
+++ b/polly/test/CodeGen/MemAccess/codegen_constant_offset.ll
@@ -1,4 +1,4 @@
-;RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-codegen < %s -S | FileCheck %s
+;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed < %s -S | FileCheck %s
;int A[100];
;
diff --git a/polly/test/CodeGen/MemAccess/codegen_simple.ll b/polly/test/CodeGen/MemAccess/codegen_simple.ll
index 5ba6f32..ee0187f 100644
--- a/polly/test/CodeGen/MemAccess/codegen_simple.ll
+++ b/polly/test/CodeGen/MemAccess/codegen_simple.ll
@@ -1,4 +1,4 @@
-;RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-codegen < %s -S | FileCheck %s
+;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed < %s -S | FileCheck %s
;int A[100];
;
diff --git a/polly/test/CodeGen/MemAccess/codegen_simple_float.ll b/polly/test/CodeGen/MemAccess/codegen_simple_float.ll
index cf8913f..6970565 100644
--- a/polly/test/CodeGen/MemAccess/codegen_simple_float.ll
+++ b/polly/test/CodeGen/MemAccess/codegen_simple_float.ll
@@ -1,4 +1,4 @@
-;RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-codegen < %s -S | FileCheck %s
+;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed < %s -S | FileCheck %s
;
;float A[100];
;
diff --git a/polly/test/CodeGen/MemAccess/codegen_simple_md.ll b/polly/test/CodeGen/MemAccess/codegen_simple_md.ll
index e4afcc8..f0896e2 100644
--- a/polly/test/CodeGen/MemAccess/codegen_simple_md.ll
+++ b/polly/test/CodeGen/MemAccess/codegen_simple_md.ll
@@ -1,5 +1,5 @@
-;RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed+withconst -polly-codegen < %s -S | FileCheck -check-prefix=WITHCONST %s
-;RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed+withoutconst -polly-codegen < %s -S | FileCheck -check-prefix=WITHOUTCONST %s
+;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed+withconst < %s -S | FileCheck -check-prefix=WITHCONST %s
+;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed+withoutconst < %s -S | FileCheck -check-prefix=WITHOUTCONST %s
;int A[1040];
;
diff --git a/polly/test/CodeGen/MemAccess/codegen_simple_md_float.ll b/polly/test/CodeGen/MemAccess/codegen_simple_md_float.ll
index c9913f3..99fc369 100644
--- a/polly/test/CodeGen/MemAccess/codegen_simple_md_float.ll
+++ b/polly/test/CodeGen/MemAccess/codegen_simple_md_float.ll
@@ -1,5 +1,5 @@
-;RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed+withconst -polly-codegen < %s -S | FileCheck -check-prefix=WITHCONST %s
-;RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed+withoutconst -polly-codegen < %s -S | FileCheck -check-prefix=WITHOUTCONST %s
+;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed+withconst < %s -S | FileCheck -check-prefix=WITHCONST %s
+;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed+withoutconst < %s -S | FileCheck -check-prefix=WITHOUTCONST %s
;
;float A[1040];
;
diff --git a/polly/test/CodeGen/MemAccess/different_types.ll b/polly/test/CodeGen/MemAccess/different_types.ll
index 624de62..5371819 100644
--- a/polly/test/CodeGen/MemAccess/different_types.ll
+++ b/polly/test/CodeGen/MemAccess/different_types.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-import-jscop \
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
; RUN: \
-; RUN: -polly-codegen -S < %s | FileCheck %s
+; RUN: -S < %s | FileCheck %s
;
; void foo(float A[], float B[]) {
; for (long i = 0; i < 100; i++)
diff --git a/polly/test/CodeGen/MemAccess/generate-all.ll b/polly/test/CodeGen/MemAccess/generate-all.ll
index 6f92ba1..d1f695d 100644
--- a/polly/test/CodeGen/MemAccess/generate-all.ll
+++ b/polly/test/CodeGen/MemAccess/generate-all.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-codegen -polly-codegen-generate-expressions=false \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-codegen-generate-expressions=false \
; RUN: -S < %s | FileCheck %s -check-prefix=SCEV
-; RUN: opt %loadPolly -polly-codegen -polly-codegen-generate-expressions=true \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-codegen-generate-expressions=true \
; RUN: -S < %s | FileCheck %s -check-prefix=ASTEXPR
;
; void foo(float A[]) {
diff --git a/polly/test/CodeGen/MemAccess/invariant_base_ptr.ll b/polly/test/CodeGen/MemAccess/invariant_base_ptr.ll
index a6d1de0..5c926ac 100644
--- a/polly/test/CodeGen/MemAccess/invariant_base_ptr.ll
+++ b/polly/test/CodeGen/MemAccess/invariant_base_ptr.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-import-jscop \
-; RUN: -polly-codegen -polly-invariant-load-hoisting -S \
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
+; RUN: -polly-invariant-load-hoisting -S \
; RUN: 2>&1 < %s | FileCheck %s
; Setting new access functions where the base pointer of the array that is newly
diff --git a/polly/test/CodeGen/MemAccess/multiple_types.ll b/polly/test/CodeGen/MemAccess/multiple_types.ll
index 1793bd3..7848977 100644
--- a/polly/test/CodeGen/MemAccess/multiple_types.ll
+++ b/polly/test/CodeGen/MemAccess/multiple_types.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop \
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,polly-codegen' \
; RUN: -polly-allow-differing-element-types \
-; RUN: -polly-codegen -S < %s | FileCheck %s
+; RUN: -S < %s | FileCheck %s
;
; // Check that accessing one array with different types works.
; void multiple_types(char *Short, char *Float, char *Double) {
diff --git a/polly/test/CodeGen/MemAccess/simple.ll b/polly/test/CodeGen/MemAccess/simple.ll
index 39e8a2c..5077e1a 100644
--- a/polly/test/CodeGen/MemAccess/simple.ll
+++ b/polly/test/CodeGen/MemAccess/simple.ll
@@ -1,4 +1,4 @@
-;RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -stats < %s 2>&1 | FileCheck %s
+;RUN: opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -stats < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
;int A[100];
diff --git a/polly/test/CodeGen/MemAccess/update_access_functions.ll b/polly/test/CodeGen/MemAccess/update_access_functions.ll
index 05d2087..51fa97a 100644
--- a/polly/test/CodeGen/MemAccess/update_access_functions.ll
+++ b/polly/test/CodeGen/MemAccess/update_access_functions.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-import-jscop \
-; RUN: -polly-import-jscop-postfix=transformed -polly-codegen \
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
+; RUN: -polly-import-jscop-postfix=transformed \
; RUN: < %s -S | FileCheck %s
; CHECK-LABEL: polly.stmt.loop1:
diff --git a/polly/test/CodeGen/OpenMP/alias-metadata.ll b/polly/test/CodeGen/OpenMP/alias-metadata.ll
index 07d7963..b80b18f 100644
--- a/polly/test/CodeGen/OpenMP/alias-metadata.ll
+++ b/polly/test/CodeGen/OpenMP/alias-metadata.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-parallel -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -S < %s | FileCheck %s
;
; void foo(float *A, float *B) {
; for (long i = 0; i < 1000; i++)
diff --git a/polly/test/CodeGen/OpenMP/floord-as-argument-to-subfunction.ll b/polly/test/CodeGen/OpenMP/floord-as-argument-to-subfunction.ll
index eb9dfcd..9eb7f5f 100644
--- a/polly/test/CodeGen/OpenMP/floord-as-argument-to-subfunction.ll
+++ b/polly/test/CodeGen/OpenMP/floord-as-argument-to-subfunction.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-opt-max-coefficient=-1 -polly-parallel -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-opt-isl,polly-codegen' -polly-opt-max-coefficient=-1 -polly-parallel -S < %s | FileCheck %s
;
; Check that we do not crash but generate parallel code
;
diff --git a/polly/test/CodeGen/OpenMP/inlineasm.ll b/polly/test/CodeGen/OpenMP/inlineasm.ll
index 69b1b0a..82a7378 100644
--- a/polly/test/CodeGen/OpenMP/inlineasm.ll
+++ b/polly/test/CodeGen/OpenMP/inlineasm.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-parallel -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-opt-isl,polly-codegen' -polly-parallel -S < %s | FileCheck %s
; llvm.org/PR51960
; CHECK-LABEL: define internal void @foo_polly_subfn
diff --git a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded.ll b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded.ll
index 30beef5..b4c61d1 100644
--- a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded.ll
+++ b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -polly-parallel \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-parallel \
; RUN: -polly-parallel-force -S < %s | FileCheck %s
;
; Test to verify that we hand down the preloaded A[0] to the OpenMP subfunction.
diff --git a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_different_bb.ll b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_different_bb.ll
index fe5d2ab..8cf6148 100644
--- a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_different_bb.ll
+++ b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_different_bb.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -polly-parallel \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-parallel \
; RUN: -polly-parallel-force -S < %s | FileCheck %s
;
; Test to verify that we hand down the preloaded A[0] to the OpenMP subfunction.
diff --git a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_pass_only_needed.ll b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_pass_only_needed.ll
index 49b9321..823e5ca 100644
--- a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_pass_only_needed.ll
+++ b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_pass_only_needed.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -polly-parallel \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-parallel \
; RUN: -polly-parallel-force -S < %s | FileCheck %s
;
; Test to verify that we hand down the preloaded A[0] to the OpenMP subfunction but
diff --git a/polly/test/CodeGen/OpenMP/invariant_base_pointers_preloaded.ll b/polly/test/CodeGen/OpenMP/invariant_base_pointers_preloaded.ll
index 06c4cda..5557839 100644
--- a/polly/test/CodeGen/OpenMP/invariant_base_pointers_preloaded.ll
+++ b/polly/test/CodeGen/OpenMP/invariant_base_pointers_preloaded.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -polly-parallel \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-parallel \
; RUN: -polly-parallel-force -S < %s | FileCheck %s
;
; Test to verify that we hand down the preloaded A[0] to the OpenMP subfunction.
diff --git a/polly/test/CodeGen/OpenMP/loop-body-references-outer-iv.ll b/polly/test/CodeGen/OpenMP/loop-body-references-outer-iv.ll
index db58c3a..a987fac 100644
--- a/polly/test/CodeGen/OpenMP/loop-body-references-outer-iv.ll
+++ b/polly/test/CodeGen/OpenMP/loop-body-references-outer-iv.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
; This code has failed the scev based code generation as the scev in the scop
; contains an AddRecExpr of an outer loop. When generating code, we did not
diff --git a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-2.ll b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-2.ll
index c2ddc1e..b81e120 100644
--- a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-2.ll
+++ b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
; AST: #pragma simd
; AST: #pragma omp parallel for
diff --git a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-3.ll b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-3.ll
index 0f025bb..c4ad665 100644
--- a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-3.ll
+++ b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-3.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -basic-aa -polly-parallel -polly-parallel-force -polly-invariant-load-hoisting=true -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST
-; RUN: opt %loadPolly -basic-aa -polly-parallel -polly-parallel-force -polly-invariant-load-hoisting=true -polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-parallel -polly-parallel-force -polly-invariant-load-hoisting=true '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-parallel -polly-parallel-force -polly-invariant-load-hoisting=true -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
; The interesting part of this test case is the instruction:
; %tmp = bitcast i8* %call to i64**
diff --git a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values.ll b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values.ll
index f9612d7..07aae42 100644
--- a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values.ll
+++ b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-codegen -S < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=IR
; Make sure we correctly forward the reference to 'A' to the OpenMP subfunction.
;
diff --git a/polly/test/CodeGen/OpenMP/loop-bounds-reference-outer-ids.ll b/polly/test/CodeGen/OpenMP/loop-bounds-reference-outer-ids.ll
index da9da18..27e1bdd 100644
--- a/polly/test/CodeGen/OpenMP/loop-bounds-reference-outer-ids.ll
+++ b/polly/test/CodeGen/OpenMP/loop-bounds-reference-outer-ids.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-parallel -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST
-; RUN: opt %loadPolly -polly-parallel -polly-codegen -S < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-parallel '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST
+; RUN: opt %loadNPMPolly -polly-parallel -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=IR
;
; float A[100];
;
diff --git a/polly/test/CodeGen/OpenMP/mapped-phi-access.ll b/polly/test/CodeGen/OpenMP/mapped-phi-access.ll
index 1b84336..ac78b4e 100644
--- a/polly/test/CodeGen/OpenMP/mapped-phi-access.ll
+++ b/polly/test/CodeGen/OpenMP/mapped-phi-access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-parallel -polly-delicm -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-parallel '-passes=polly-delicm,polly-codegen' -S < %s | FileCheck %s
;
; Verify that -polly-parallel can handle mapped scalar MemoryAccesses.
;
diff --git a/polly/test/CodeGen/OpenMP/matmul-parallel.ll b/polly/test/CodeGen/OpenMP/matmul-parallel.ll
index 5ee9a7c..43326b2 100644
--- a/polly/test/CodeGen/OpenMP/matmul-parallel.ll
+++ b/polly/test/CodeGen/OpenMP/matmul-parallel.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-parallel -polly-opt-isl -polly-ast -disable-output -debug-only=polly-ast < %s 2>&1 | FileCheck --check-prefix=AST %s
-; RUN: opt %loadPolly -polly-parallel -polly-opt-isl -polly-codegen -S < %s | FileCheck --check-prefix=CODEGEN %s
+; RUN: opt %loadNPMPolly -polly-parallel '-passes=polly-opt-isl,print<polly-ast>' -disable-output -debug-only=polly-ast < %s 2>&1 | FileCheck --check-prefix=AST %s
+; RUN: opt %loadNPMPolly -polly-parallel '-passes=polly-opt-isl,polly-codegen' -S < %s | FileCheck --check-prefix=CODEGEN %s
; REQUIRES: asserts
; Parallelization of detected matrix-multiplication.
diff --git a/polly/test/CodeGen/OpenMP/recomputed-srem.ll b/polly/test/CodeGen/OpenMP/recomputed-srem.ll
index cfae8e9..67db35a 100644
--- a/polly/test/CodeGen/OpenMP/recomputed-srem.ll
+++ b/polly/test/CodeGen/OpenMP/recomputed-srem.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-codegen -polly-parallel \
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -passes=polly-codegen -polly-parallel \
; RUN: -polly-parallel-force -S < %s | FileCheck %s
;
; Test to verify that we pass %rem96 to the parallel subfunction.
diff --git a/polly/test/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll b/polly/test/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll
index f243c3a..96dc425 100644
--- a/polly/test/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll
+++ b/polly/test/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll
@@ -1,15 +1,15 @@
-; RUN: opt %loadPolly -polly-parallel \
-; RUN: -polly-parallel-force -polly-codegen \
+; RUN: opt %loadNPMPolly -polly-parallel \
+; RUN: -polly-parallel-force -passes=polly-codegen \
; RUN: -S -verify-dom-info < %s \
; RUN: | FileCheck %s -check-prefix=IR
-; RUN: opt %loadPolly -polly-parallel \
-; RUN: -polly-parallel-force -polly-codegen -polly-scheduling=runtime \
+; RUN: opt %loadNPMPolly -polly-parallel \
+; RUN: -polly-parallel-force -passes=polly-codegen -polly-scheduling=runtime \
; RUN: -S -verify-dom-info < %s \
; RUN: | FileCheck %s -check-prefix=IR
-; RUN: opt %loadPolly -polly-parallel \
-; RUN: -polly-parallel-force -polly-codegen -polly-omp-backend=LLVM \
+; RUN: opt %loadNPMPolly -polly-parallel \
+; RUN: -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM \
; RUN: -S -verify-dom-info < %s \
; RUN: | FileCheck %s -check-prefix=LIBOMP-IR
diff --git a/polly/test/CodeGen/OpenMP/reference-other-bb.ll b/polly/test/CodeGen/OpenMP/reference-other-bb.ll
index b7abdc2..dbfbd9a 100644
--- a/polly/test/CodeGen/OpenMP/reference-other-bb.ll
+++ b/polly/test/CodeGen/OpenMP/reference-other-bb.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
; IR: @foo_polly_subfn
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/OpenMP/reference-preceeding-loop.ll b/polly/test/CodeGen/OpenMP/reference-preceeding-loop.ll
index b88589f..ee43b8aa 100644
--- a/polly/test/CodeGen/OpenMP/reference-preceeding-loop.ll
+++ b/polly/test/CodeGen/OpenMP/reference-preceeding-loop.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
; - Test the case where scalar evolution references a loop that is outside
diff --git a/polly/test/CodeGen/OpenMP/reference_latest.ll b/polly/test/CodeGen/OpenMP/reference_latest.ll
index 54875c2..7a8cd77 100644
--- a/polly/test/CodeGen/OpenMP/reference_latest.ll
+++ b/polly/test/CodeGen/OpenMP/reference_latest.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-delicm -polly-simplify -polly-parallel -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-delicm,polly-simplify,polly-codegen' -polly-parallel -S < %s | FileCheck %s
;
; Test that parallel codegen handles scalars mapped to other arrays.
; After mapping "store double %add10" references the array "MemRef2".
diff --git a/polly/test/CodeGen/OpenMP/scev-rewriting.ll b/polly/test/CodeGen/OpenMP/scev-rewriting.ll
index 1b229fc..9b79f29 100644
--- a/polly/test/CodeGen/OpenMP/scev-rewriting.ll
+++ b/polly/test/CodeGen/OpenMP/scev-rewriting.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly < %s -polly-vectorizer=stripmine -polly-parallel -polly-parallel-force -polly-process-unprofitable -polly-codegen -S | FileCheck %s
+; RUN: opt %loadNPMPolly < %s -polly-vectorizer=stripmine -polly-parallel -polly-parallel-force -polly-process-unprofitable -passes=polly-codegen -S | FileCheck %s
; CHECK: define internal void @DoStringSort_polly_subfn
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-unknown-linux-gnueabi"
diff --git a/polly/test/CodeGen/OpenMP/single_loop.ll b/polly/test/CodeGen/OpenMP/single_loop.ll
index f79653a..e5aee84 100644
--- a/polly/test/CodeGen/OpenMP/single_loop.ll
+++ b/polly/test/CodeGen/OpenMP/single_loop.ll
@@ -1,14 +1,14 @@
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-import-jscop -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST-STRIDE4
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-import-jscop -polly-codegen -S < %s | FileCheck %s -check-prefix=IR-STRIDE4
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-import-jscop,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST-STRIDE4
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-import-jscop,polly-codegen' -S < %s | FileCheck %s -check-prefix=IR-STRIDE4
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-codegen -polly-omp-backend=LLVM -polly-scheduling=static -polly-scheduling-chunksize=43 -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-STATIC-CHUNKED
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-codegen -polly-omp-backend=LLVM -polly-scheduling=static -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-STATIC
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-codegen -polly-omp-backend=LLVM -polly-scheduling=dynamic -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-DYNAMIC
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-codegen -polly-omp-backend=LLVM -polly-scheduling=dynamic -polly-scheduling-chunksize=4 -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-DYNAMIC-FOUR
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-import-jscop -polly-codegen -polly-omp-backend=LLVM -S < %s | FileCheck %s -check-prefix=LIBOMP-IR-STRIDE4
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM -polly-scheduling=static -polly-scheduling-chunksize=43 -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-STATIC-CHUNKED
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM -polly-scheduling=static -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-STATIC
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM -polly-scheduling=dynamic -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-DYNAMIC
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM -polly-scheduling=dynamic -polly-scheduling-chunksize=4 -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-DYNAMIC-FOUR
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-import-jscop,polly-codegen' -polly-omp-backend=LLVM -S < %s | FileCheck %s -check-prefix=LIBOMP-IR-STRIDE4
; This extensive test case tests the creation of the full set of OpenMP calls
; as well as the subfunction creation using a trivial loop as example.
diff --git a/polly/test/CodeGen/OpenMP/single_loop_with_loop_invariant_baseptr.ll b/polly/test/CodeGen/OpenMP/single_loop_with_loop_invariant_baseptr.ll
index 50da5dd..c519bfd 100644
--- a/polly/test/CodeGen/OpenMP/single_loop_with_loop_invariant_baseptr.ll
+++ b/polly/test/CodeGen/OpenMP/single_loop_with_loop_invariant_baseptr.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -tbaa -polly-parallel -polly-parallel-force -polly-parallel-force -polly-invariant-load-hoisting=true -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST
-; RUN: opt %loadPolly -tbaa -polly-parallel -polly-parallel-force -polly-parallel-force -polly-invariant-load-hoisting=true -polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -polly-parallel -polly-parallel-force -polly-parallel-force -polly-invariant-load-hoisting=true '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST
+; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -polly-parallel -polly-parallel-force -polly-parallel-force -polly-invariant-load-hoisting=true -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
; #define N 1024
; float A[N];
diff --git a/polly/test/CodeGen/OpenMP/single_loop_with_param.ll b/polly/test/CodeGen/OpenMP/single_loop_with_param.ll
index d01b7a2..f6dfd62 100644
--- a/polly/test/CodeGen/OpenMP/single_loop_with_param.ll
+++ b/polly/test/CodeGen/OpenMP/single_loop_with_param.ll
@@ -1,15 +1,15 @@
-; RUN: opt %loadPolly -polly-parallel \
-; RUN: -polly-parallel-force -polly-codegen \
+; RUN: opt %loadNPMPolly -polly-parallel \
+; RUN: -polly-parallel-force -passes=polly-codegen \
; RUN: -S -verify-dom-info < %s \
; RUN: | FileCheck %s -check-prefix=IR
-; RUN: opt %loadPolly -polly-parallel \
-; RUN: -polly-parallel-force -polly-codegen -polly-omp-backend=LLVM \
+; RUN: opt %loadNPMPolly -polly-parallel \
+; RUN: -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM \
; RUN: -S -verify-dom-info < %s \
; RUN: | FileCheck %s -check-prefix=LIBOMP-IR
-; RUN: opt %loadPolly -polly-parallel \
-; RUN: -polly-parallel-force -polly-codegen -polly-omp-backend=LLVM \
+; RUN: opt %loadNPMPolly -polly-parallel \
+; RUN: -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM \
; RUN: -polly-scheduling=static \
; RUN: -S -verify-dom-info < %s \
; RUN: | FileCheck %s -check-prefix=LIBOMP-STATIC-IR
diff --git a/polly/test/CodeGen/OpenMP/two-parallel-loops-reference-outer-indvar.ll b/polly/test/CodeGen/OpenMP/two-parallel-loops-reference-outer-indvar.ll
index 05c6ed1..934e044 100644
--- a/polly/test/CodeGen/OpenMP/two-parallel-loops-reference-outer-indvar.ll
+++ b/polly/test/CodeGen/OpenMP/two-parallel-loops-reference-outer-indvar.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
; This test case verifies that we create correct code even if two OpenMP loops
; share common outer variables.
diff --git a/polly/test/CodeGen/PHIInExit.ll b/polly/test/CodeGen/PHIInExit.ll
index eadd605..3e0c9d6 100644
--- a/polly/test/CodeGen/PHIInExit.ll
+++ b/polly/test/CodeGen/PHIInExit.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
%struct..0__pthread_mutex_s = type { i32, i32, i32, i32, i32, i32, %struct.__pthread_list_t }
diff --git a/polly/test/CodeGen/RuntimeDebugBuilder/combine_different_values.ll b/polly/test/CodeGen/RuntimeDebugBuilder/combine_different_values.ll
index 84827dd..76b2fa9 100644
--- a/polly/test/CodeGen/RuntimeDebugBuilder/combine_different_values.ll
+++ b/polly/test/CodeGen/RuntimeDebugBuilder/combine_different_values.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
; RUN: -polly-codegen-add-debug-printing \
; RUN: -polly-ignore-aliasing < %s | FileCheck %s
diff --git a/polly/test/CodeGen/RuntimeDebugBuilder/stmt_tracing.ll b/polly/test/CodeGen/RuntimeDebugBuilder/stmt_tracing.ll
index 822eccc..4ffb7fd 100644
--- a/polly/test/CodeGen/RuntimeDebugBuilder/stmt_tracing.ll
+++ b/polly/test/CodeGen/RuntimeDebugBuilder/stmt_tracing.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen-trace-stmts -polly-codegen-trace-scalars -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-codegen-trace-stmts -polly-codegen-trace-scalars -passes=polly-codegen -S < %s | FileCheck %s
;
define void @func(i32 %n, ptr %A) {
diff --git a/polly/test/CodeGen/alias-check-multi-dim.ll b/polly/test/CodeGen/alias-check-multi-dim.ll
index d923a4c..0440bda 100644
--- a/polly/test/CodeGen/alias-check-multi-dim.ll
+++ b/polly/test/CodeGen/alias-check-multi-dim.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen \
+; RUN: opt %loadNPMPolly -passes=polly-codegen \
; RUN: -S < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/alias_metadata_too_many_arrays.ll b/polly/test/CodeGen/alias_metadata_too_many_arrays.ll
index 7c5ca012a..68c17a8 100644
--- a/polly/test/CodeGen/alias_metadata_too_many_arrays.ll
+++ b/polly/test/CodeGen/alias_metadata_too_many_arrays.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-ignore-aliasing -S < %s \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-ignore-aliasing -S < %s \
; RUN: | FileCheck %s
;
; void manyarrays(float A1[], float A2[], float A3[], float A4[], float A5[],
diff --git a/polly/test/CodeGen/aliasing_different_base_and_access_type.ll b/polly/test/CodeGen/aliasing_different_base_and_access_type.ll
index a087414..8e1fc3b 100644
--- a/polly/test/CodeGen/aliasing_different_base_and_access_type.ll
+++ b/polly/test/CodeGen/aliasing_different_base_and_access_type.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
;
; We have to cast %B to "short *" before we create RTCs.
;
diff --git a/polly/test/CodeGen/aliasing_different_pointer_types.ll b/polly/test/CodeGen/aliasing_different_pointer_types.ll
index 91f5eab6..e601c22 100644
--- a/polly/test/CodeGen/aliasing_different_pointer_types.ll
+++ b/polly/test/CodeGen/aliasing_different_pointer_types.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; Check that we cast the different pointer types correctly before we compare
; them in the RTC's. We use i8* as max pointer type.
diff --git a/polly/test/CodeGen/aliasing_multidimensional_access.ll b/polly/test/CodeGen/aliasing_multidimensional_access.ll
index 4876839..e1dae03 100644
--- a/polly/test/CodeGen/aliasing_multidimensional_access.ll
+++ b/polly/test/CodeGen/aliasing_multidimensional_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
;
; Check that we calculate the maximal access into array A correctly and track the overflow state.
;
diff --git a/polly/test/CodeGen/aliasing_parametric_simple_1.ll b/polly/test/CodeGen/aliasing_parametric_simple_1.ll
index 5422da4..a79ba25 100644
--- a/polly/test/CodeGen/aliasing_parametric_simple_1.ll
+++ b/polly/test/CodeGen/aliasing_parametric_simple_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; void jd(int *A, int *B, int c) {
; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/CodeGen/aliasing_parametric_simple_2.ll b/polly/test/CodeGen/aliasing_parametric_simple_2.ll
index de945d4..efe4af1 100644
--- a/polly/test/CodeGen/aliasing_parametric_simple_2.ll
+++ b/polly/test/CodeGen/aliasing_parametric_simple_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; void jd(int *A, int *B, int c) {
; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/CodeGen/aliasing_struct_element.ll b/polly/test/CodeGen/aliasing_struct_element.ll
index 2219ca9..3079e58 100644
--- a/polly/test/CodeGen/aliasing_struct_element.ll
+++ b/polly/test/CodeGen/aliasing_struct_element.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
;
; We should only access (or compute the address of) "the first element" of %S
; as it is a single struct not a struct array. The maximal access to S, thus
diff --git a/polly/test/CodeGen/alignment.ll b/polly/test/CodeGen/alignment.ll
index a94b1f7..e0f6a95 100644
--- a/polly/test/CodeGen/alignment.ll
+++ b/polly/test/CodeGen/alignment.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; Check that the special alignment information is kept
;
diff --git a/polly/test/CodeGen/annotated_alias_scopes.ll b/polly/test/CodeGen/annotated_alias_scopes.ll
index f8d14cd..b1777a1 100644
--- a/polly/test/CodeGen/annotated_alias_scopes.ll
+++ b/polly/test/CodeGen/annotated_alias_scopes.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s --check-prefix=SCOPES
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s --check-prefix=SCOPES
;
; Check that we create alias scopes that indicate the accesses to A, B and C cannot alias in any way.
;
diff --git a/polly/test/CodeGen/blas_sscal_simplified.ll b/polly/test/CodeGen/blas_sscal_simplified.ll
index a370fcf..99f2eae 100644
--- a/polly/test/CodeGen/blas_sscal_simplified.ll
+++ b/polly/test/CodeGen/blas_sscal_simplified.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
;
; Regression test for a bug in the runtime check generation.
diff --git a/polly/test/CodeGen/conflict-between-loop-invariant-code-hosting-and-escape-map-computation.ll b/polly/test/CodeGen/conflict-between-loop-invariant-code-hosting-and-escape-map-computation.ll
index e0f8c43..5dba933 100644
--- a/polly/test/CodeGen/conflict-between-loop-invariant-code-hosting-and-escape-map-computation.ll
+++ b/polly/test/CodeGen/conflict-between-loop-invariant-code-hosting-and-escape-map-computation.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-process-unprofitable -polly-codegen -disable-output < %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable -passes=polly-codegen -disable-output < %s
;
; CHECK: store i32 %tmp14_p_scalar_, ptr %tmp14.s2a
; CHECK: %tmp14.final_reload = load i32, ptr %tmp14.s2a
diff --git a/polly/test/CodeGen/constant_condition.ll b/polly/test/CodeGen/constant_condition.ll
index dad1f6c..905aa52 100644
--- a/polly/test/CodeGen/constant_condition.ll
+++ b/polly/test/CodeGen/constant_condition.ll
@@ -1,4 +1,4 @@
-;RUN: opt %loadPolly -polly-prepare -polly-print-ast -disable-output < %s | FileCheck %s
+;RUN: opt %loadNPMPolly '-passes=polly-prepare,scop(print<polly-ast>)' -disable-output < %s 2>&1 | FileCheck %s
;#include <string.h>
;int A[1];
diff --git a/polly/test/CodeGen/create-conditional-scop.ll b/polly/test/CodeGen/create-conditional-scop.ll
index f51a2dc..b8c9a81 100644
--- a/polly/test/CodeGen/create-conditional-scop.ll
+++ b/polly/test/CodeGen/create-conditional-scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-codegen -verify-loop-info < %s -S | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -verify-loop-info < %s -S | FileCheck %s
target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-a0:0-n32"
diff --git a/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_1.ll b/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_1.ll
index 991e3c8..6ffe6bf 100644
--- a/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_1.ll
+++ b/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
;
; Check we do not crash even though the dead %tmp8 is referenced by a parameter
; and we do not pre-load it (as it is dead).
diff --git a/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_2.ll b/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_2.ll
index 153f691..68c247a 100644
--- a/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_2.ll
+++ b/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
;
; Check we do not crash even though there is a dead load that is referenced by
; a parameter and we do not pre-load it (as it is dead).
diff --git a/polly/test/CodeGen/debug-intrinsics.ll b/polly/test/CodeGen/debug-intrinsics.ll
index 2feeb7c..25c63da 100644
--- a/polly/test/CodeGen/debug-intrinsics.ll
+++ b/polly/test/CodeGen/debug-intrinsics.ll
@@ -1,9 +1,9 @@
-; RUN: opt %loadPolly \
-; RUN: -polly-analyze-read-only-scalars=false -polly-codegen -S < %s | \
+; RUN: opt %loadNPMPolly \
+; RUN: -polly-analyze-read-only-scalars=false -passes=polly-codegen -S < %s | \
; RUN: FileCheck %s
-; RUN: opt %loadPolly \
-; RUN: -polly-analyze-read-only-scalars=true -polly-codegen -S < %s | \
+; RUN: opt %loadNPMPolly \
+; RUN: -polly-analyze-read-only-scalars=true -passes=polly-codegen -S < %s | \
; RUN: FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/dominance_problem_after_early_codegen_bailout.ll b/polly/test/CodeGen/dominance_problem_after_early_codegen_bailout.ll
index c9e006a..edc0333 100644
--- a/polly/test/CodeGen/dominance_problem_after_early_codegen_bailout.ll
+++ b/polly/test/CodeGen/dominance_problem_after_early_codegen_bailout.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s
;
; This caused dominance problems at some point as we do bail out during
; code generation. Just verify it runs through.
diff --git a/polly/test/CodeGen/empty_domain_in_context.ll b/polly/test/CodeGen/empty_domain_in_context.ll
index c67ace9..a2fe805 100644
--- a/polly/test/CodeGen/empty_domain_in_context.ll
+++ b/polly/test/CodeGen/empty_domain_in_context.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-optree -polly-opt-isl -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-optree,polly-opt-isl,polly-codegen' -S < %s | FileCheck %s
;
; llvm.org/PR35362
; isl codegen does not allow to generate isl_ast_expr from pw_aff which have an
diff --git a/polly/test/CodeGen/entry_with_trivial_phi.ll b/polly/test/CodeGen/entry_with_trivial_phi.ll
index b057690..f2c9da0 100644
--- a/polly/test/CodeGen/entry_with_trivial_phi.ll
+++ b/polly/test/CodeGen/entry_with_trivial_phi.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s
;
; The entry of this scop's simple region (entry.split => for.end) has an trivial
; PHI node. LCSSA may create such PHI nodes. This is a breakdown of this case in
diff --git a/polly/test/CodeGen/entry_with_trivial_phi_other_bb.ll b/polly/test/CodeGen/entry_with_trivial_phi_other_bb.ll
index 5673cc7..2f1ec1a 100644
--- a/polly/test/CodeGen/entry_with_trivial_phi_other_bb.ll
+++ b/polly/test/CodeGen/entry_with_trivial_phi_other_bb.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; The entry of this scop's simple region (entry.split => for.end) has an trivial
; PHI node that is used in a different of the scop region. LCSSA may create such
diff --git a/polly/test/CodeGen/error-stmt-in-non-affine-region.ll b/polly/test/CodeGen/error-stmt-in-non-affine-region.ll
index 9832afe..63b6bec 100644
--- a/polly/test/CodeGen/error-stmt-in-non-affine-region.ll
+++ b/polly/test/CodeGen/error-stmt-in-non-affine-region.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
; XFAIL: *
;
; CHECK-LABEL: polly.stmt.if.then:
diff --git a/polly/test/CodeGen/error_block_contains_invalid_memory_access.ll b/polly/test/CodeGen/error_block_contains_invalid_memory_access.ll
index 048847f..008e16c 100644
--- a/polly/test/CodeGen/error_block_contains_invalid_memory_access.ll
+++ b/polly/test/CodeGen/error_block_contains_invalid_memory_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/exprModDiv.ll b/polly/test/CodeGen/exprModDiv.ll
index 936b018..c9b419ab 100644
--- a/polly/test/CodeGen/exprModDiv.ll
+++ b/polly/test/CodeGen/exprModDiv.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadPolly -polly-import-jscop \
-; RUN: -polly-codegen -S < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-import-jscop \
-; RUN: -polly-codegen -polly-import-jscop-postfix=pow2 \
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
+; RUN: -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
+; RUN: -polly-import-jscop-postfix=pow2 \
; RUN: -S < %s | FileCheck %s -check-prefix=POW2
;
; void exprModDiv(float *A, float *B, float *C, long N, long p) {
diff --git a/polly/test/CodeGen/hoisted_load_escapes_through_phi.ll b/polly/test/CodeGen/hoisted_load_escapes_through_phi.ll
index d7588b3..1ca2413 100644
--- a/polly/test/CodeGen/hoisted_load_escapes_through_phi.ll
+++ b/polly/test/CodeGen/hoisted_load_escapes_through_phi.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -S -polly-codegen \
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen \
; RUN: -polly-invariant-load-hoisting=false < %s | FileCheck %s
-; RUN: opt %loadPolly -S -polly-codegen \
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen \
; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
;
; Check that we generate valid code even if the load of cont_STACKPOINTER is
diff --git a/polly/test/CodeGen/hoisting_1.ll b/polly/test/CodeGen/hoisting_1.ll
index 86b5663..1f065be 100644
--- a/polly/test/CodeGen/hoisting_1.ll
+++ b/polly/test/CodeGen/hoisting_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -tbaa -polly-codegen -polly-allow-differing-element-types -disable-output %s
+; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -passes=polly-codegen -polly-allow-differing-element-types -disable-output %s
;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/hoisting_2.ll b/polly/test/CodeGen/hoisting_2.ll
index 1f1be11..e76ee06 100644
--- a/polly/test/CodeGen/hoisting_2.ll
+++ b/polly/test/CodeGen/hoisting_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -tbaa -polly-codegen -polly-allow-differing-element-types -disable-output %s
+; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -passes=polly-codegen -polly-allow-differing-element-types -disable-output %s
;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/inner_scev_sdiv_1.ll b/polly/test/CodeGen/inner_scev_sdiv_1.ll
index 1a463fc..d210105 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_1.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s
;
; Excerpt from the test-suite's oggenc reduced using bugpoint.
;
diff --git a/polly/test/CodeGen/inner_scev_sdiv_2.ll b/polly/test/CodeGen/inner_scev_sdiv_2.ll
index 7613803..74b914d 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_2.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
;
; The SCEV expression in this test case refers to a sequence of sdiv
; instructions, which are part of different bbs in the SCoP. When code
diff --git a/polly/test/CodeGen/inner_scev_sdiv_3.ll b/polly/test/CodeGen/inner_scev_sdiv_3.ll
index 874ead1..3344045 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_3.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
;
; This test case has a inner SCEV sdiv that will escape the SCoP. Just check we
; do not crash and generate valid code.
diff --git a/polly/test/CodeGen/inner_scev_sdiv_in_lb.ll b/polly/test/CodeGen/inner_scev_sdiv_in_lb.ll
index 6514e18..31c14e8 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_in_lb.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_in_lb.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s --check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s --check-prefix=CODEGEN
;
; CHECK: [N] -> { Stmt_bb11[i0, i1] : i0 < N and i1 >= 0 and 3i1 <= -3 + i0 };
; CODEGEN: polly
diff --git a/polly/test/CodeGen/inner_scev_sdiv_in_lb_invariant.ll b/polly/test/CodeGen/inner_scev_sdiv_in_lb_invariant.ll
index 0329429..b42371b 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_in_lb_invariant.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_in_lb_invariant.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen \
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen \
; RUN: < %s | FileCheck %s
;
; Check that this will not crash our code generation.
diff --git a/polly/test/CodeGen/inner_scev_sdiv_in_rtc.ll b/polly/test/CodeGen/inner_scev_sdiv_in_rtc.ll
index f7292ca..45af634 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_in_rtc.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_in_rtc.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen \
+; RUN: opt %loadNPMPolly -passes=polly-codegen \
; RUN: -S < %s | FileCheck %s
;
; This will just check that we generate valid code here.
diff --git a/polly/test/CodeGen/intrinsics_lifetime.ll b/polly/test/CodeGen/intrinsics_lifetime.ll
index 6141b3a..6dca218 100644
--- a/polly/test/CodeGen/intrinsics_lifetime.ll
+++ b/polly/test/CodeGen/intrinsics_lifetime.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -S < %s | FileCheck %s
;
; Verify that we remove the lifetime markers from everywhere.
;
diff --git a/polly/test/CodeGen/intrinsics_misc.ll b/polly/test/CodeGen/intrinsics_misc.ll
index c0a52fe..8416489 100644
--- a/polly/test/CodeGen/intrinsics_misc.ll
+++ b/polly/test/CodeGen/intrinsics_misc.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -S < %s | FileCheck %s
;
; Verify that we remove the misc intrinsics from the optimized SCoP.
;
diff --git a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-2.ll b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-2.ll
index 6727247..e7cbf74 100644
--- a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-2.ll
+++ b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
;
; This crashed our codegen at some point, verify it runs through
diff --git a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-3.ll b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-3.ll
index a573049..24e9240 100644
--- a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-3.ll
+++ b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
;
; This crashed our codegen at some point, verify it runs through
diff --git a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order.ll b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order.ll
index e05ca99..d1d861e 100644
--- a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order.ll
+++ b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
;
; This crashed our codegen at some point, verify it runs through
diff --git a/polly/test/CodeGen/invariant-load-dimension.ll b/polly/test/CodeGen/invariant-load-dimension.ll
index 7793c3b..21e5305 100644
--- a/polly/test/CodeGen/invariant-load-dimension.ll
+++ b/polly/test/CodeGen/invariant-load-dimension.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-process-unprofitable -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOPS
-; RUN: opt %loadPolly -S < %s -polly-codegen -polly-process-unprofitable -polly-invariant-load-hoisting | FileCheck %s -check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly -polly-process-unprofitable -polly-invariant-load-hoisting '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCOPS
+; RUN: opt %loadNPMPolly -S < %s -passes=polly-codegen -polly-process-unprofitable -polly-invariant-load-hoisting | FileCheck %s -check-prefix=CODEGEN
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64"
diff --git a/polly/test/CodeGen/invariant-load-preload-base-pointer-origin-first.ll b/polly/test/CodeGen/invariant-load-preload-base-pointer-origin-first.ll
index 4741009..1fd9cb8 100644
--- a/polly/test/CodeGen/invariant-load-preload-base-pointer-origin-first.ll
+++ b/polly/test/CodeGen/invariant-load-preload-base-pointer-origin-first.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen -polly-invariant-load-hoisting=true < %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-invariant-load-hoisting=true < %s
;
; Check that we generate valid code as we did non preload the base pointer
; origin of %tmp4 at some point.
diff --git a/polly/test/CodeGen/invariant_cannot_handle_void.ll b/polly/test/CodeGen/invariant_cannot_handle_void.ll
index de5d13d..0859a4e 100644
--- a/polly/test/CodeGen/invariant_cannot_handle_void.ll
+++ b/polly/test/CodeGen/invariant_cannot_handle_void.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-invariant-load-hoisting=true -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP
-; RUN: opt %loadPolly -S -polly-codegen -polly-invariant-load-hoisting=true %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-invariant-load-hoisting=true %s | FileCheck %s
;
; The offset of the %tmp1 load wrt. to %buff (62 bytes) is not divisible
; by the type size (i32 = 4 bytes), thus we will have to represent %buff
diff --git a/polly/test/CodeGen/invariant_load.ll b/polly/test/CodeGen/invariant_load.ll
index be3f7a3..2d5e604 100644
--- a/polly/test/CodeGen/invariant_load.ll
+++ b/polly/test/CodeGen/invariant_load.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
;
; CHECK-LABEL: polly.preload.begin:
; CHECK-NEXT: %polly.access.B = getelementptr i32, ptr %B, i64 0
diff --git a/polly/test/CodeGen/invariant_load_address_space.ll b/polly/test/CodeGen/invariant_load_address_space.ll
index 7c611ad..3d1958e 100644
--- a/polly/test/CodeGen/invariant_load_address_space.ll
+++ b/polly/test/CodeGen/invariant_load_address_space.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
;
; CHECK-LABEL: polly.preload.begin:
; CHECK-NEXT: %polly.access.B = getelementptr i32, ptr addrspace(1) %B, i64 0
diff --git a/polly/test/CodeGen/invariant_load_alias_metadata.ll b/polly/test/CodeGen/invariant_load_alias_metadata.ll
index 5a82d82..2524633 100644
--- a/polly/test/CodeGen/invariant_load_alias_metadata.ll
+++ b/polly/test/CodeGen/invariant_load_alias_metadata.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true \
; RUN: -S < %s | FileCheck %s
;
; This test case checks whether Polly generates alias metadata in case of
diff --git a/polly/test/CodeGen/invariant_load_base_pointer.ll b/polly/test/CodeGen/invariant_load_base_pointer.ll
index eb07f83..d4ac433 100644
--- a/polly/test/CodeGen/invariant_load_base_pointer.ll
+++ b/polly/test/CodeGen/invariant_load_base_pointer.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
;
; CHECK-LABEL: polly.preload.begin:
; CHECK-NEXT: %polly.access.BPLoc = getelementptr ptr, ptr %BPLoc, i64 0
diff --git a/polly/test/CodeGen/invariant_load_base_pointer_conditional.ll b/polly/test/CodeGen/invariant_load_base_pointer_conditional.ll
index 538077b..06a9a93 100644
--- a/polly/test/CodeGen/invariant_load_base_pointer_conditional.ll
+++ b/polly/test/CodeGen/invariant_load_base_pointer_conditional.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
;
; CHECK-LABEL: polly.preload.begin:
; CHECK-NEXT: %0 = sext i32 %N to i64
diff --git a/polly/test/CodeGen/invariant_load_base_pointer_conditional_2.ll b/polly/test/CodeGen/invariant_load_base_pointer_conditional_2.ll
index 7c2fb3e..66ab9a3 100644
--- a/polly/test/CodeGen/invariant_load_base_pointer_conditional_2.ll
+++ b/polly/test/CodeGen/invariant_load_base_pointer_conditional_2.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-invariant-load-hoisting=true -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -S -polly-codegen -polly-invariant-load-hoisting=true < %s | FileCheck %s --check-prefix=IR
-; RUN: opt %loadPolly -S -polly-codegen -polly-invariant-load-hoisting=true --polly-overflow-tracking=always < %s | FileCheck %s --check-prefix=IRA
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-invariant-load-hoisting=true < %s | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-invariant-load-hoisting=true --polly-overflow-tracking=always < %s | FileCheck %s --check-prefix=IRA
;
; As (p + q) can overflow we have to check that we load from
; I[p + q] only if it does not.
diff --git a/polly/test/CodeGen/invariant_load_canonicalize_array_baseptrs.ll b/polly/test/CodeGen/invariant_load_canonicalize_array_baseptrs.ll
index dc5a4c8..fa904e9 100644
--- a/polly/test/CodeGen/invariant_load_canonicalize_array_baseptrs.ll
+++ b/polly/test/CodeGen/invariant_load_canonicalize_array_baseptrs.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s \
; RUN: -polly-invariant-load-hoisting \
; RUN: | FileCheck %s
diff --git a/polly/test/CodeGen/invariant_load_condition.ll b/polly/test/CodeGen/invariant_load_condition.ll
index edf0814..36e5883 100644
--- a/polly/test/CodeGen/invariant_load_condition.ll
+++ b/polly/test/CodeGen/invariant_load_condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-process-unprofitable -polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
;
; CHECK-LABEL: polly.preload.begin:
; CHECK-NEXT: %polly.access.C = getelementptr i32, ptr %C, i64 0
diff --git a/polly/test/CodeGen/invariant_load_different_sized_types.ll b/polly/test/CodeGen/invariant_load_different_sized_types.ll
index 5b91a19..2995bce 100644
--- a/polly/test/CodeGen/invariant_load_different_sized_types.ll
+++ b/polly/test/CodeGen/invariant_load_different_sized_types.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -S \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S \
; RUN: -polly-allow-differing-element-types < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/invariant_load_escaping.ll b/polly/test/CodeGen/invariant_load_escaping.ll
index efccdf4..416148b 100644
--- a/polly/test/CodeGen/invariant_load_escaping.ll
+++ b/polly/test/CodeGen/invariant_load_escaping.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
;
; int f(int *A, int *B) {
; // Possible aliasing between A and B but if not then *B would be
diff --git a/polly/test/CodeGen/invariant_load_escaping_second_scop.ll b/polly/test/CodeGen/invariant_load_escaping_second_scop.ll
index c0ea888..906bfc1 100644
--- a/polly/test/CodeGen/invariant_load_escaping_second_scop.ll
+++ b/polly/test/CodeGen/invariant_load_escaping_second_scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -polly-process-unprofitable -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-process-unprofitable -S < %s | FileCheck %s
;
; void fence(void);
;
diff --git a/polly/test/CodeGen/invariant_load_in_non_affine_subregion.ll b/polly/test/CodeGen/invariant_load_in_non_affine_subregion.ll
index 241252b..472c6c6 100644
--- a/polly/test/CodeGen/invariant_load_in_non_affine_subregion.ll
+++ b/polly/test/CodeGen/invariant_load_in_non_affine_subregion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
;
; This crashed at some point as the invariant load is in a non-affine
; subregion. Just check it does not anymore.
diff --git a/polly/test/CodeGen/invariant_load_loop_ub.ll b/polly/test/CodeGen/invariant_load_loop_ub.ll
index ab9aa0d..1db27ad 100644
--- a/polly/test/CodeGen/invariant_load_loop_ub.ll
+++ b/polly/test/CodeGen/invariant_load_loop_ub.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -polly-process-unprofitable -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-process-unprofitable -S < %s | FileCheck %s
;
; CHECK: polly.start
;
diff --git a/polly/test/CodeGen/invariant_load_not_executed_but_in_parameters.ll b/polly/test/CodeGen/invariant_load_not_executed_but_in_parameters.ll
index 08ff087..01b0176 100644
--- a/polly/test/CodeGen/invariant_load_not_executed_but_in_parameters.ll
+++ b/polly/test/CodeGen/invariant_load_not_executed_but_in_parameters.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -disable-output < %s
;
; Check that this does not crash as the invariant load is not executed (thus
; not preloaded) but still referenced by one of the parameters.
diff --git a/polly/test/CodeGen/invariant_load_outermost.ll b/polly/test/CodeGen/invariant_load_outermost.ll
index f42135c..7e0550f 100644
--- a/polly/test/CodeGen/invariant_load_outermost.ll
+++ b/polly/test/CodeGen/invariant_load_outermost.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
; CHECK: polly.start
diff --git a/polly/test/CodeGen/invariant_load_parameters_cyclic_dependence.ll b/polly/test/CodeGen/invariant_load_parameters_cyclic_dependence.ll
index d365c99..abf957b 100644
--- a/polly/test/CodeGen/invariant_load_parameters_cyclic_dependence.ll
+++ b/polly/test/CodeGen/invariant_load_parameters_cyclic_dependence.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s --check-prefix=SCOP
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
;
; SCOP: Assumed Context:
; SCOP-NEXT: [p_0, tmp4] -> { : }
diff --git a/polly/test/CodeGen/invariant_load_ptr_ptr_noalias.ll b/polly/test/CodeGen/invariant_load_ptr_ptr_noalias.ll
index b4d4c55..b565f1b 100644
--- a/polly/test/CodeGen/invariant_load_ptr_ptr_noalias.ll
+++ b/polly/test/CodeGen/invariant_load_ptr_ptr_noalias.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-process-unprofitable -polly-codegen -polly-invariant-load-hoisting=true -polly-ignore-aliasing -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-ignore-aliasing -S < %s | FileCheck %s
;
; CHECK-LABEL: polly.preload.begin:
; CHECK: %polly.access.A = getelementptr ptr, ptr %A, i64 42
diff --git a/polly/test/CodeGen/invariant_load_scalar_dep.ll b/polly/test/CodeGen/invariant_load_scalar_dep.ll
index 05a40a4..ba2999e 100644
--- a/polly/test/CodeGen/invariant_load_scalar_dep.ll
+++ b/polly/test/CodeGen/invariant_load_scalar_dep.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
;
; CHECK-LABEL: polly.preload.begin:
; CHECK: %polly.access.B = getelementptr i32, ptr %B, i64 0
diff --git a/polly/test/CodeGen/invariant_load_scalar_escape_alloca_sharing.ll b/polly/test/CodeGen/invariant_load_scalar_escape_alloca_sharing.ll
index 44c0358..26c964c 100644
--- a/polly/test/CodeGen/invariant_load_scalar_escape_alloca_sharing.ll
+++ b/polly/test/CodeGen/invariant_load_scalar_escape_alloca_sharing.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
;
; Verify the preloaded %tmp0 is stored and communicated in the same alloca.
; In this case, we do not reload %ncol.load from the scalar stack slot, but
diff --git a/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_1.ll b/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_1.ll
index 0b6929a..6bf11d5 100644
--- a/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_1.ll
+++ b/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true < %s
;
; Check we do not crash even though we pre-load values with different types
; from the same base pointer.
diff --git a/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_2.ll b/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_2.ll
index 2eb913f..07ce941 100644
--- a/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_2.ll
+++ b/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true < %s
;
; Check we do not crash even though we pre-load values with different types
; from the same base pointer.
diff --git a/polly/test/CodeGen/invariant_loads_ignore_parameter_bounds.ll b/polly/test/CodeGen/invariant_loads_ignore_parameter_bounds.ll
index a0c1f89..19b30af 100644
--- a/polly/test/CodeGen/invariant_loads_ignore_parameter_bounds.ll
+++ b/polly/test/CodeGen/invariant_loads_ignore_parameter_bounds.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting \
; RUN: -polly-ignore-parameter-bounds -S < %s | FileCheck %s
; CHECK: polly.preload.begin:
diff --git a/polly/test/CodeGen/invariant_verify_function_failed.ll b/polly/test/CodeGen/invariant_verify_function_failed.ll
index 6020cae..c9affac 100644
--- a/polly/test/CodeGen/invariant_verify_function_failed.ll
+++ b/polly/test/CodeGen/invariant_verify_function_failed.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -polly-codegen -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,scop(polly-codegen)' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
; This crashed at some point as the pointer returned by the call
; to @__errno_location is invariant and defined in the SCoP but not
diff --git a/polly/test/CodeGen/invariant_verify_function_failed_2.ll b/polly/test/CodeGen/invariant_verify_function_failed_2.ll
index 81a4bd1..7ef5608 100644
--- a/polly/test/CodeGen/invariant_verify_function_failed_2.ll
+++ b/polly/test/CodeGen/invariant_verify_function_failed_2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -S -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s -check-prefix=SCOPS
-; RUN: opt %loadPolly -S -polly-codegen -polly-invariant-load-hoisting=true %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCOPS
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-invariant-load-hoisting=true %s | FileCheck %s
;
; Check we generate valid code.
diff --git a/polly/test/CodeGen/issue56692.ll b/polly/test/CodeGen/issue56692.ll
index e935e43..34c4e39 100644
--- a/polly/test/CodeGen/issue56692.ll
+++ b/polly/test/CodeGen/issue56692.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-omp-backend=LLVM -polly-codegen-verify -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -polly-omp-backend=LLVM -polly-codegen-verify -passes=polly-codegen -S < %s | FileCheck %s
; https://github.com/llvm/llvm-project/issues/56692
;
; CHECK: call void (ptr, i32, ptr, ...) @__kmpc_fork_call({{.*}}), !dbg ![[OPTLOC:[0-9]+]]
diff --git a/polly/test/CodeGen/large-numbers-in-boundary-context.ll b/polly/test/CodeGen/large-numbers-in-boundary-context.ll
index a0328df..b228baf 100644
--- a/polly/test/CodeGen/large-numbers-in-boundary-context.ll
+++ b/polly/test/CodeGen/large-numbers-in-boundary-context.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
; XFAIL: *
;
; The boundary context contains a constant that does not fit in 64 bits. Hence,
diff --git a/polly/test/CodeGen/load_subset_with_context.ll b/polly/test/CodeGen/load_subset_with_context.ll
index ef0e051..ccd4198 100644
--- a/polly/test/CodeGen/load_subset_with_context.ll
+++ b/polly/test/CodeGen/load_subset_with_context.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
;
; A load must provide a value for every statement instance.
; Statement instances not in the SCoP's context are irrelevant.
diff --git a/polly/test/CodeGen/loop-invariant-load-type-mismatch.ll b/polly/test/CodeGen/loop-invariant-load-type-mismatch.ll
index 90c61c5..d906585 100644
--- a/polly/test/CodeGen/loop-invariant-load-type-mismatch.ll
+++ b/polly/test/CodeGen/loop-invariant-load-type-mismatch.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/polly/test/CodeGen/loop_with_condition.ll b/polly/test/CodeGen/loop_with_condition.ll
index 618a542..49e3124 100644
--- a/polly/test/CodeGen/loop_with_condition.ll
+++ b/polly/test/CodeGen/loop_with_condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
;#include <string.h>
;#define N 1024
diff --git a/polly/test/CodeGen/loop_with_condition_2.ll b/polly/test/CodeGen/loop_with_condition_2.ll
index b1a1167..8ae38ee 100644
--- a/polly/test/CodeGen/loop_with_condition_2.ll
+++ b/polly/test/CodeGen/loop_with_condition_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
; Verify that we actually detect this loop as the innermost loop even though
; there is a conditional inside.
diff --git a/polly/test/CodeGen/loop_with_condition_ineq.ll b/polly/test/CodeGen/loop_with_condition_ineq.ll
index c35208c..64019a6 100644
--- a/polly/test/CodeGen/loop_with_condition_ineq.ll
+++ b/polly/test/CodeGen/loop_with_condition_ineq.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
;#include <string.h>
;#define N 1024
diff --git a/polly/test/CodeGen/loop_with_condition_nested.ll b/polly/test/CodeGen/loop_with_condition_nested.ll
index 24a49b4..5dcb51d 100644
--- a/polly/test/CodeGen/loop_with_condition_nested.ll
+++ b/polly/test/CodeGen/loop_with_condition_nested.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -basic-aa -polly-codegen < %s | opt -passes='print<loops>' -disable-output 2>&1 | FileCheck %s -check-prefix=LOOPS
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen < %s | opt -passes='print<loops>' -disable-output 2>&1 | FileCheck %s -check-prefix=LOOPS
;#include <string.h>
diff --git a/polly/test/CodeGen/loop_with_conditional_entry_edge_split_hard_case.ll b/polly/test/CodeGen/loop_with_conditional_entry_edge_split_hard_case.ll
index 4444cf1..26fe4eb 100644
--- a/polly/test/CodeGen/loop_with_conditional_entry_edge_split_hard_case.ll
+++ b/polly/test/CodeGen/loop_with_conditional_entry_edge_split_hard_case.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; Test case to trigger the hard way of creating a unique entering
; edge for the SCoP. It is triggered because the entering edge
diff --git a/polly/test/CodeGen/memcpy_annotations.ll b/polly/test/CodeGen/memcpy_annotations.ll
index a0a09b7..501aa8f 100644
--- a/polly/test/CodeGen/memcpy_annotations.ll
+++ b/polly/test/CodeGen/memcpy_annotations.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; Verify that @llvm.memcpy does not get a !alias.scope annotation.
; @llvm.memcpy takes two pointers, it is ambiguous to which the
diff --git a/polly/test/CodeGen/multidim-non-matching-typesize-2.ll b/polly/test/CodeGen/multidim-non-matching-typesize-2.ll
index 63afad6..f63eb18 100644
--- a/polly/test/CodeGen/multidim-non-matching-typesize-2.ll
+++ b/polly/test/CodeGen/multidim-non-matching-typesize-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -disable-basic-aa -polly-codegen \
+; RUN: opt %loadNPMPolly -disable-basic-aa -passes=polly-codegen \
; RUN: -S < %s | FileCheck %s
; CHECK: polly
target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
diff --git a/polly/test/CodeGen/multidim-non-matching-typesize.ll b/polly/test/CodeGen/multidim-non-matching-typesize.ll
index d117cef..63e43c8 100644
--- a/polly/test/CodeGen/multidim-non-matching-typesize.ll
+++ b/polly/test/CodeGen/multidim-non-matching-typesize.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -disable-basic-aa -polly-codegen \
+; RUN: opt %loadNPMPolly -disable-basic-aa -passes=polly-codegen \
; RUN: -S < %s | FileCheck %s
target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
diff --git a/polly/test/CodeGen/multidim_2d_parametric_array_static_loop_bounds.ll b/polly/test/CodeGen/multidim_2d_parametric_array_static_loop_bounds.ll
index 464ddb3..86b1757 100644
--- a/polly/test/CodeGen/multidim_2d_parametric_array_static_loop_bounds.ll
+++ b/polly/test/CodeGen/multidim_2d_parametric_array_static_loop_bounds.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; Derived from the following code:
diff --git a/polly/test/CodeGen/multidim_alias_check.ll b/polly/test/CodeGen/multidim_alias_check.ll
index 585577d..93e34e2 100644
--- a/polly/test/CodeGen/multidim_alias_check.ll
+++ b/polly/test/CodeGen/multidim_alias_check.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; CHECK: %polly.access.sext.A = sext i32 %n to i64
diff --git a/polly/test/CodeGen/multiple-codegens.ll b/polly/test/CodeGen/multiple-codegens.ll
index f950fa4..2fa974e 100644
--- a/polly/test/CodeGen/multiple-codegens.ll
+++ b/polly/test/CodeGen/multiple-codegens.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-scops -polly-opt-isl -polly-codegen -polly-scops -polly-codegen -S < %s | FileCheck %s
; RUN: opt %loadNPMPolly "-passes=scop(polly-opt-isl,polly-codegen,polly-codegen)" -S < %s | FileCheck %s
; RUN: opt %loadNPMPolly "-passes=scop(polly-opt-isl,polly-codegen),scop(polly-codegen)" -S < %s | FileCheck %s
;
diff --git a/polly/test/CodeGen/multiple-scops-in-a-row.ll b/polly/test/CodeGen/multiple-scops-in-a-row.ll
index a24a2e7..b81ba04 100644
--- a/polly/test/CodeGen/multiple-scops-in-a-row.ll
+++ b/polly/test/CodeGen/multiple-scops-in-a-row.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
; This test case has two scops in a row. When code generating the first scop,
; the second scop is invalidated. This test case verifies that we do not crash
diff --git a/polly/test/CodeGen/multiple-types-invariant-load-2.ll b/polly/test/CodeGen/multiple-types-invariant-load-2.ll
index 0fd1df7..f6aca37 100644
--- a/polly/test/CodeGen/multiple-types-invariant-load-2.ll
+++ b/polly/test/CodeGen/multiple-types-invariant-load-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
; RUN: -polly-allow-differing-element-types < %s | FileCheck %s
; CHECK: polly
diff --git a/polly/test/CodeGen/multiple-types-invariant-load.ll b/polly/test/CodeGen/multiple-types-invariant-load.ll
index b143467..930041e 100644
--- a/polly/test/CodeGen/multiple-types-invariant-load.ll
+++ b/polly/test/CodeGen/multiple-types-invariant-load.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-allow-differing-element-types -polly-codegen -S \
+; RUN: opt %loadNPMPolly -polly-allow-differing-element-types -passes=polly-codegen -S \
; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
; CHECK: %polly.access.global.load = getelementptr i32, ptr %global.load, i64 0
diff --git a/polly/test/CodeGen/multiple_sai_fro_same_base_address.ll b/polly/test/CodeGen/multiple_sai_fro_same_base_address.ll
index 0163f24..1e06a7e 100644
--- a/polly/test/CodeGen/multiple_sai_fro_same_base_address.ll
+++ b/polly/test/CodeGen/multiple_sai_fro_same_base_address.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-position=before-vectorizer -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP
-; RUN: opt %loadPolly -polly-position=before-vectorizer -polly-codegen -S < %s | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-position=before-vectorizer '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadNPMPolly -polly-position=before-vectorizer -passes=polly-codegen -S < %s | FileCheck %s --check-prefix=IR
; The IR has two ScopArrayInfo for the value %next.0. This used to produce two
; phi nodes in polly.merge_new_and_old, one illegaly using the result of the
diff --git a/polly/test/CodeGen/no-overflow-tracking.ll b/polly/test/CodeGen/no-overflow-tracking.ll
index f11e892..d5ad9a7 100644
--- a/polly/test/CodeGen/no-overflow-tracking.ll
+++ b/polly/test/CodeGen/no-overflow-tracking.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-invariant-load-hoisting=true -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-invariant-load-hoisting=true -polly-overflow-tracking=never -polly-codegen -S < %s | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true -polly-overflow-tracking=never -passes=polly-codegen -S < %s | FileCheck %s --check-prefix=IR
;
; As (p + q) can overflow we have to check that we load from
; I[p + q] only if it does not.
diff --git a/polly/test/CodeGen/no_guard_bb.ll b/polly/test/CodeGen/no_guard_bb.ll
index 47c87ff..a022083 100644
--- a/polly/test/CodeGen/no_guard_bb.ll
+++ b/polly/test/CodeGen/no_guard_bb.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S -verify-dom-info < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s
;
; CHECK-NOT: br i1 true, label %polly.{{.*}}, label %polly.{{.*}}
;
diff --git a/polly/test/CodeGen/non-affine-dominance-generated-entering.ll b/polly/test/CodeGen/non-affine-dominance-generated-entering.ll
index ebf36ac..6015516 100644
--- a/polly/test/CodeGen/non-affine-dominance-generated-entering.ll
+++ b/polly/test/CodeGen/non-affine-dominance-generated-entering.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; llvm.org/PR25439
; Scalar reloads in the generated entering block were not recognized as
diff --git a/polly/test/CodeGen/non-affine-exit-node-dominance.ll b/polly/test/CodeGen/non-affine-exit-node-dominance.ll
index af19d24..0d0f634 100644
--- a/polly/test/CodeGen/non-affine-exit-node-dominance.ll
+++ b/polly/test/CodeGen/non-affine-exit-node-dominance.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; llvm.org/PR25439
; The dominance of the generated non-affine subregion block was based on the
diff --git a/polly/test/CodeGen/non-affine-phi-node-expansion-2.ll b/polly/test/CodeGen/non-affine-phi-node-expansion-2.ll
index 2aca316..b7394b24 100644
--- a/polly/test/CodeGen/non-affine-phi-node-expansion-2.ll
+++ b/polly/test/CodeGen/non-affine-phi-node-expansion-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen \
+; RUN: opt %loadNPMPolly -passes=polly-codegen \
; RUN: -S < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/non-affine-phi-node-expansion-3.ll b/polly/test/CodeGen/non-affine-phi-node-expansion-3.ll
index 18a4b6e..b938633 100644
--- a/polly/test/CodeGen/non-affine-phi-node-expansion-3.ll
+++ b/polly/test/CodeGen/non-affine-phi-node-expansion-3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen \
+; RUN: opt %loadNPMPolly -passes=polly-codegen \
; RUN: -S < %s | FileCheck %s
define void @foo(ptr %A, i1 %cond0, i1 %cond1) {
diff --git a/polly/test/CodeGen/non-affine-phi-node-expansion-4.ll b/polly/test/CodeGen/non-affine-phi-node-expansion-4.ll
index 8a07ee7c..6460c42 100644
--- a/polly/test/CodeGen/non-affine-phi-node-expansion-4.ll
+++ b/polly/test/CodeGen/non-affine-phi-node-expansion-4.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen \
+; RUN: opt %loadNPMPolly -passes=polly-codegen \
; RUN: -S < %s | FileCheck %s
define void @foo(ptr %A, i1 %cond0, i1 %cond1) {
diff --git a/polly/test/CodeGen/non-affine-phi-node-expansion.ll b/polly/test/CodeGen/non-affine-phi-node-expansion.ll
index 091fc3e3..8fd8cc1 100644
--- a/polly/test/CodeGen/non-affine-phi-node-expansion.ll
+++ b/polly/test/CodeGen/non-affine-phi-node-expansion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen \
+; RUN: opt %loadNPMPolly -passes=polly-codegen \
; RUN: -S < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize-2.ll b/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize-2.ll
index 6a1d1f1..007a4c5 100644
--- a/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize-2.ll
+++ b/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; This caused the code generation to generate invalid code as the same operand
; of the PHI node in the non-affine region was synthesized at the wrong place.
diff --git a/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize.ll b/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize.ll
index 036bf34..20edbf2 100644
--- a/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize.ll
+++ b/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; This caused the code generation to generate invalid code as the same BBMap was
; used for the whole non-affine region. When %add is synthesized for the
diff --git a/polly/test/CodeGen/non-affine-region-implicit-store.ll b/polly/test/CodeGen/non-affine-region-implicit-store.ll
index e89197e..0ff39d3 100644
--- a/polly/test/CodeGen/non-affine-region-implicit-store.ll
+++ b/polly/test/CodeGen/non-affine-region-implicit-store.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; llvm.org/PR25438
; After loop versioning, a dominance check of a non-affine subregion's exit node
diff --git a/polly/test/CodeGen/non-affine-region-phi-references-in-scop-value.ll b/polly/test/CodeGen/non-affine-region-phi-references-in-scop-value.ll
index f6e4eb5..7df3d89 100644
--- a/polly/test/CodeGen/non-affine-region-phi-references-in-scop-value.ll
+++ b/polly/test/CodeGen/non-affine-region-phi-references-in-scop-value.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-allow-nonaffine-loops \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-allow-nonaffine-loops \
; RUN: -S < %s | FileCheck %s
; This test verifies that values defined in another scop statement and used by
diff --git a/polly/test/CodeGen/non-affine-subregion-dominance-reuse.ll b/polly/test/CodeGen/non-affine-subregion-dominance-reuse.ll
index 6c749a4..179062d 100644
--- a/polly/test/CodeGen/non-affine-subregion-dominance-reuse.ll
+++ b/polly/test/CodeGen/non-affine-subregion-dominance-reuse.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S -verify-dom-info \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S -verify-dom-info \
; RUN: < %s | FileCheck %s
;
; Check that we do not reuse the B[i-1] GEP created in block S again in
diff --git a/polly/test/CodeGen/non-affine-switch.ll b/polly/test/CodeGen/non-affine-switch.ll
index 9c08b98..427e7e2 100644
--- a/polly/test/CodeGen/non-affine-switch.ll
+++ b/polly/test/CodeGen/non-affine-switch.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly \
-; RUN: -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly \
+; RUN: -S -passes=polly-codegen < %s | FileCheck %s
;
; void f(int *A, int N) {
; for (int i = 0; i < N; i++)
diff --git a/polly/test/CodeGen/non-affine-synthesized-in-branch.ll b/polly/test/CodeGen/non-affine-synthesized-in-branch.ll
index cc0e60a..292c0f2 100644
--- a/polly/test/CodeGen/non-affine-synthesized-in-branch.ll
+++ b/polly/test/CodeGen/non-affine-synthesized-in-branch.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-process-unprofitable -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable -passes=polly-codegen -S < %s | FileCheck %s
;
; llvm.org/PR25412
; %synthgep caused %gep to be synthesized in subregion_if which was reused for
diff --git a/polly/test/CodeGen/non-affine-update.ll b/polly/test/CodeGen/non-affine-update.ll
index d2b7fae..03f091a 100644
--- a/polly/test/CodeGen/non-affine-update.ll
+++ b/polly/test/CodeGen/non-affine-update.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-import-jscop \
-; RUN: -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
+; RUN: -S < %s | FileCheck %s
;
; void non-affine-update(double A[], double C[], double B[]) {
; for (int i = 0; i < 10; i++) {
diff --git a/polly/test/CodeGen/non-hoisted-load-needed-as-base-ptr.ll b/polly/test/CodeGen/non-hoisted-load-needed-as-base-ptr.ll
index 5f6642b..153cdb7 100644
--- a/polly/test/CodeGen/non-hoisted-load-needed-as-base-ptr.ll
+++ b/polly/test/CodeGen/non-hoisted-load-needed-as-base-ptr.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -tbaa -polly-codegen -disable-output %s
+; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -passes=polly-codegen -disable-output %s
;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/non_affine_float_compare.ll b/polly/test/CodeGen/non_affine_float_compare.ll
index be310b5..304a901 100644
--- a/polly/test/CodeGen/non_affine_float_compare.ll
+++ b/polly/test/CodeGen/non_affine_float_compare.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen \
+; RUN: opt %loadNPMPolly -passes=polly-codegen \
; RUN: -polly-allow-nonaffine-branches -S -verify-dom-info \
; RUN: < %s | FileCheck %s
;
diff --git a/polly/test/CodeGen/only_non_affine_error_region.ll b/polly/test/CodeGen/only_non_affine_error_region.ll
index b2ad1c1..445cef0 100644
--- a/polly/test/CodeGen/only_non_affine_error_region.ll
+++ b/polly/test/CodeGen/only_non_affine_error_region.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; CHECK-NOT: polly.start
;
diff --git a/polly/test/CodeGen/openmp_limit_threads.ll b/polly/test/CodeGen/openmp_limit_threads.ll
index e8eb819..4c33be3 100644
--- a/polly/test/CodeGen/openmp_limit_threads.ll
+++ b/polly/test/CodeGen/openmp_limit_threads.ll
@@ -1,10 +1,10 @@
-; RUN: opt %loadPolly -polly-codegen -polly-parallel -S < %s | FileCheck %s --check-prefix=AUTO
-; RUN: opt %loadPolly -polly-codegen -polly-parallel -polly-num-threads=1 -S < %s | FileCheck %s --check-prefix=ONE
-; RUN: opt %loadPolly -polly-codegen -polly-parallel -polly-num-threads=4 -S < %s | FileCheck %s --check-prefix=FOUR
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -S < %s | FileCheck %s --check-prefix=AUTO
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -polly-num-threads=1 -S < %s | FileCheck %s --check-prefix=ONE
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -polly-num-threads=4 -S < %s | FileCheck %s --check-prefix=FOUR
-; RUN: opt %loadPolly -polly-codegen -polly-parallel -polly-omp-backend=LLVM -S < %s | FileCheck %s --check-prefix=LIBOMP-AUTO
-; RUN: opt %loadPolly -polly-codegen -polly-parallel -polly-omp-backend=LLVM -polly-num-threads=1 -S < %s | FileCheck %s --check-prefix=LIBOMP-ONE
-; RUN: opt %loadPolly -polly-codegen -polly-parallel -polly-omp-backend=LLVM -polly-num-threads=4 -S < %s | FileCheck %s --check-prefix=LIBOMP-FOUR
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -polly-omp-backend=LLVM -S < %s | FileCheck %s --check-prefix=LIBOMP-AUTO
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -polly-omp-backend=LLVM -polly-num-threads=1 -S < %s | FileCheck %s --check-prefix=LIBOMP-ONE
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -polly-omp-backend=LLVM -polly-num-threads=4 -S < %s | FileCheck %s --check-prefix=LIBOMP-FOUR
; Ensure that the provided thread numbers are forwarded to the OpenMP calls.
;
diff --git a/polly/test/CodeGen/out-of-scop-phi-node-use.ll b/polly/test/CodeGen/out-of-scop-phi-node-use.ll
index 54e909e..a4f9423 100644
--- a/polly/test/CodeGen/out-of-scop-phi-node-use.ll
+++ b/polly/test/CodeGen/out-of-scop-phi-node-use.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/polly/test/CodeGen/param_div_div_div_2.ll b/polly/test/CodeGen/param_div_div_div_2.ll
index 764ca24..8eba644 100644
--- a/polly/test/CodeGen/param_div_div_div_2.ll
+++ b/polly/test/CodeGen/param_div_div_div_2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s --check-prefix=IR
;
; Check that we guard the divisions because we moved them and thereby increased
; their domain.
diff --git a/polly/test/CodeGen/partial_write_array.ll b/polly/test/CodeGen/partial_write_array.ll
index 6dc5550..8bb1bc2 100644
--- a/polly/test/CodeGen/partial_write_array.ll
+++ b/polly/test/CodeGen/partial_write_array.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
;
; Partial write of an array access.
;
diff --git a/polly/test/CodeGen/partial_write_emptyset.ll b/polly/test/CodeGen/partial_write_emptyset.ll
index a25195f..6782880 100644
--- a/polly/test/CodeGen/partial_write_emptyset.ll
+++ b/polly/test/CodeGen/partial_write_emptyset.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
;
; Partial write, where "partial" is the empty set.
; The store is never executed in this case and we do generate it in the
diff --git a/polly/test/CodeGen/partial_write_full_write_that_appears_partial.ll b/polly/test/CodeGen/partial_write_full_write_that_appears_partial.ll
index 18a809b..b26bd81 100644
--- a/polly/test/CodeGen/partial_write_full_write_that_appears_partial.ll
+++ b/polly/test/CodeGen/partial_write_full_write_that_appears_partial.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
; CHECK: polly.stmt.if.then81: ; preds = %polly.stmt.if.end75
; CHECK-NEXT: store float undef, ptr %fX64, align 4, !alias.scope !0, !noalias !3
diff --git a/polly/test/CodeGen/partial_write_impossible_restriction.ll b/polly/test/CodeGen/partial_write_impossible_restriction.ll
index 178227f..edee3b9 100644
--- a/polly/test/CodeGen/partial_write_impossible_restriction.ll
+++ b/polly/test/CodeGen/partial_write_impossible_restriction.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
;
; The isl scheduler isolates %cond.false into two instances.
; A partial write access in one of the instances was never executed,
diff --git a/polly/test/CodeGen/partial_write_in_region.ll b/polly/test/CodeGen/partial_write_in_region.ll
index d8f57b3..7c138c8 100644
--- a/polly/test/CodeGen/partial_write_in_region.ll
+++ b/polly/test/CodeGen/partial_write_in_region.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-import-jscop \
-; RUN: -polly-import-jscop-postfix=transformed -polly-codegen \
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
+; RUN: -polly-import-jscop-postfix=transformed \
; RUN: -verify-dom-info \
; RUN: -S < %s | FileCheck %s
;
diff --git a/polly/test/CodeGen/partial_write_in_region_with_loop.ll b/polly/test/CodeGen/partial_write_in_region_with_loop.ll
index 48a9dbe..ba15a78 100644
--- a/polly/test/CodeGen/partial_write_in_region_with_loop.ll
+++ b/polly/test/CodeGen/partial_write_in_region_with_loop.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-import-jscop \
-; RUN: -polly-import-jscop-postfix=transformed -polly-codegen \
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
+; RUN: -polly-import-jscop-postfix=transformed \
; RUN: -verify-dom-info -polly-allow-nonaffine-loops \
; RUN: -S < %s | FileCheck %s
diff --git a/polly/test/CodeGen/partial_write_mapped_scalar.ll b/polly/test/CodeGen/partial_write_mapped_scalar.ll
index 9137ef2..b8c4138 100644
--- a/polly/test/CodeGen/partial_write_mapped_scalar.ll
+++ b/polly/test/CodeGen/partial_write_mapped_scalar.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
;
; Partial write of a (mapped) scalar.
;
diff --git a/polly/test/CodeGen/partial_write_mapped_scalar_subregion.ll b/polly/test/CodeGen/partial_write_mapped_scalar_subregion.ll
index e054b65..8c1953a 100644
--- a/polly/test/CodeGen/partial_write_mapped_scalar_subregion.ll
+++ b/polly/test/CodeGen/partial_write_mapped_scalar_subregion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
;
; Partial write of a (mapped) scalar in a non-affine subregion.
;
diff --git a/polly/test/CodeGen/perf_monitoring.ll b/polly/test/CodeGen/perf_monitoring.ll
index 2abbf24..4b91e50 100644
--- a/polly/test/CodeGen/perf_monitoring.ll
+++ b/polly/test/CodeGen/perf_monitoring.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-codegen-perf-monitoring \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-codegen-perf-monitoring \
; RUN: -S < %s | FileCheck %s
; void f(long A[], long N) {
diff --git a/polly/test/CodeGen/perf_monitoring_cycles_per_scop.ll b/polly/test/CodeGen/perf_monitoring_cycles_per_scop.ll
index 11d63fc..d5c33d6 100644
--- a/polly/test/CodeGen/perf_monitoring_cycles_per_scop.ll
+++ b/polly/test/CodeGen/perf_monitoring_cycles_per_scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-codegen-perf-monitoring \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-codegen-perf-monitoring \
; RUN: -S < %s | FileCheck %s
; void f(long A[], long N) {
diff --git a/polly/test/CodeGen/perf_monitoring_trip_counts_per_scop.ll b/polly/test/CodeGen/perf_monitoring_trip_counts_per_scop.ll
index 9b7f324..ab99c4d 100644
--- a/polly/test/CodeGen/perf_monitoring_trip_counts_per_scop.ll
+++ b/polly/test/CodeGen/perf_monitoring_trip_counts_per_scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-codegen-perf-monitoring \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-codegen-perf-monitoring \
; RUN: -S < %s | FileCheck %s
; void f(long A[], long N) {
diff --git a/polly/test/CodeGen/phi-defined-before-scop.ll b/polly/test/CodeGen/phi-defined-before-scop.ll
index a3b1ba2..f083222 100644
--- a/polly/test/CodeGen/phi-defined-before-scop.ll
+++ b/polly/test/CodeGen/phi-defined-before-scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
; CHECK-LABEL: polly.merge_new_and_old:
; CHECK-NEXT: %tmp7.ph.merge = phi ptr [ %tmp7.ph.final_reload, %polly.exiting ], [ %tmp7.ph, %bb6.region_exiting ]
diff --git a/polly/test/CodeGen/phi_after_error_block_outside_of_scop.ll b/polly/test/CodeGen/phi_after_error_block_outside_of_scop.ll
index c34ebfc..e096aa2 100644
--- a/polly/test/CodeGen/phi_after_error_block_outside_of_scop.ll
+++ b/polly/test/CodeGen/phi_after_error_block_outside_of_scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
; Make sure code generation does not break in case an 'error block' is detected
; outside of the scope. In this situation, we should not affect code generation.
diff --git a/polly/test/CodeGen/phi_condition_modeling_1.ll b/polly/test/CodeGen/phi_condition_modeling_1.ll
index b14d329..9d73d8a7 100644
--- a/polly/test/CodeGen/phi_condition_modeling_1.ll
+++ b/polly/test/CodeGen/phi_condition_modeling_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
;
; void f(int *A, int c, int N) {
; int tmp;
diff --git a/polly/test/CodeGen/phi_condition_modeling_2.ll b/polly/test/CodeGen/phi_condition_modeling_2.ll
index dab2977..2d13648 100644
--- a/polly/test/CodeGen/phi_condition_modeling_2.ll
+++ b/polly/test/CodeGen/phi_condition_modeling_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
;
; void f(int *A, int c, int N) {
; int tmp;
diff --git a/polly/test/CodeGen/phi_conditional_simple_1.ll b/polly/test/CodeGen/phi_conditional_simple_1.ll
index f1b93b5..25bcf2a 100644
--- a/polly/test/CodeGen/phi_conditional_simple_1.ll
+++ b/polly/test/CodeGen/phi_conditional_simple_1.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
;
; void jd(int *A, int c) {
; for (int i = 0; i < 1024; i++) {
diff --git a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_1.ll b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_1.ll
index 1368848..43d29b9 100644
--- a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_1.ll
+++ b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; This caused an lnt crash at some point, just verify it will run through.
;
diff --git a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_2.ll b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_2.ll
index 01dd450..9f28024 100644
--- a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_2.ll
+++ b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; This caused an lnt crash at some point, just verify it will run through and
; produce the PHI node in the exit we are looking for.
diff --git a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_3.ll b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_3.ll
index 66b95b0..73e99ac 100644
--- a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_3.ll
+++ b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; This caused an lnt crash at some point, just verify it will run through and
; produce the PHI node in the exit we are looking for.
diff --git a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_5.ll b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_5.ll
index 9a04636..6c9bd56 100644
--- a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_5.ll
+++ b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_5.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; This caused an lnt crash at some point, just verify it will run through and
; produce the PHI node in the exit we are looking for.
diff --git a/polly/test/CodeGen/phi_loop_carried_float.ll b/polly/test/CodeGen/phi_loop_carried_float.ll
index ca1870f..d671db0 100644
--- a/polly/test/CodeGen/phi_loop_carried_float.ll
+++ b/polly/test/CodeGen/phi_loop_carried_float.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
;
; float f(float *A, int N) {
; float tmp = 0;
diff --git a/polly/test/CodeGen/phi_loop_carried_float_escape.ll b/polly/test/CodeGen/phi_loop_carried_float_escape.ll
index 3b2ed01..3e244c5 100644
--- a/polly/test/CodeGen/phi_loop_carried_float_escape.ll
+++ b/polly/test/CodeGen/phi_loop_carried_float_escape.ll
@@ -1,8 +1,8 @@
-; RUN: opt %loadPolly -S \
-; RUN: -polly-analyze-read-only-scalars=false -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S \
+; RUN: -polly-analyze-read-only-scalars=false -passes=polly-codegen < %s | FileCheck %s
-; RUN: opt %loadPolly -S \
-; RUN: -polly-analyze-read-only-scalars=true -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S \
+; RUN: -polly-analyze-read-only-scalars=true -passes=polly-codegen < %s | FileCheck %s
;
; float f(float *A, int N) {
; float tmp = 0;
diff --git a/polly/test/CodeGen/phi_scalar_simple_1.ll b/polly/test/CodeGen/phi_scalar_simple_1.ll
index d62975b..80a1c41 100644
--- a/polly/test/CodeGen/phi_scalar_simple_1.ll
+++ b/polly/test/CodeGen/phi_scalar_simple_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
;
; int jd(int *restrict A, int x, int N) {
; for (int i = 1; i < N; i++)
diff --git a/polly/test/CodeGen/phi_scalar_simple_2.ll b/polly/test/CodeGen/phi_scalar_simple_2.ll
index e58945d..614c8ac 100644
--- a/polly/test/CodeGen/phi_scalar_simple_2.ll
+++ b/polly/test/CodeGen/phi_scalar_simple_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
;
; int jd(int *restrict A, int x, int N, int c) {
; for (int i = 0; i < N; i++)
diff --git a/polly/test/CodeGen/phi_with_multi_exiting_edges_2.ll b/polly/test/CodeGen/phi_with_multi_exiting_edges_2.ll
index 17e4b7d..7e21666 100644
--- a/polly/test/CodeGen/phi_with_multi_exiting_edges_2.ll
+++ b/polly/test/CodeGen/phi_with_multi_exiting_edges_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; CHECK: polly.merge_new_and_old:
; CHECK: %result.ph.merge = phi float [ %result.ph.final_reload, %polly.exiting ], [ %result.ph, %next.region_exiting ]
diff --git a/polly/test/CodeGen/phi_with_one_exit_edge.ll b/polly/test/CodeGen/phi_with_one_exit_edge.ll
index 81fd73b..36a8684 100644
--- a/polly/test/CodeGen/phi_with_one_exit_edge.ll
+++ b/polly/test/CodeGen/phi_with_one_exit_edge.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
;
; CHECK: polly.merge_new_and_old:
diff --git a/polly/test/CodeGen/pointer-type-expressions-2.ll b/polly/test/CodeGen/pointer-type-expressions-2.ll
index b261cfe..918e4c6 100644
--- a/polly/test/CodeGen/pointer-type-expressions-2.ll
+++ b/polly/test/CodeGen/pointer-type-expressions-2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
define void @foo(ptr %start, ptr %end) {
diff --git a/polly/test/CodeGen/pointer-type-expressions.ll b/polly/test/CodeGen/pointer-type-expressions.ll
index 6bb3fa2..e7feebc 100644
--- a/polly/test/CodeGen/pointer-type-expressions.ll
+++ b/polly/test/CodeGen/pointer-type-expressions.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN
; void f(int a[], int N, float *P) {
; int i;
diff --git a/polly/test/CodeGen/pointer-type-pointer-type-comparison.ll b/polly/test/CodeGen/pointer-type-pointer-type-comparison.ll
index eaef640..9ee050a 100644
--- a/polly/test/CodeGen/pointer-type-pointer-type-comparison.ll
+++ b/polly/test/CodeGen/pointer-type-pointer-type-comparison.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN
;
; void f(int a[], int N, float *P, float *Q) {
diff --git a/polly/test/CodeGen/pointer_rem.ll b/polly/test/CodeGen/pointer_rem.ll
index 5c92ee5..b820231 100644
--- a/polly/test/CodeGen/pointer_rem.ll
+++ b/polly/test/CodeGen/pointer_rem.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-process-unprofitable -polly-scops -polly-print-ast -disable-output -S < %s | FileCheck %s --check-prefix=AST
-; RUN: opt %loadPolly -polly-process-unprofitable -polly-scops -polly-codegen -S < %s | FileCheck %s --check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>,scop(print<polly-ast>)' -disable-output -S < %s | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>,scop(polly-codegen)' -S < %s | FileCheck %s --check-prefix=CODEGEN
target datalayout = "e-m:e-i64:64-i128:128-n8:16:32:64-S128"
target triple = "aarch64--linux-gnu"
diff --git a/polly/test/CodeGen/pr25241.ll b/polly/test/CodeGen/pr25241.ll
index 9fa67e0..4a4add8 100644
--- a/polly/test/CodeGen/pr25241.ll
+++ b/polly/test/CodeGen/pr25241.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
; PR25241 (https://llvm.org/bugs/show_bug.cgi?id=25241)
; Ensure that synthesized values of a PHI node argument are generated in the
diff --git a/polly/test/CodeGen/ptrtoint_as_parameter.ll b/polly/test/CodeGen/ptrtoint_as_parameter.ll
index 4f6c807..a551d81 100644
--- a/polly/test/CodeGen/ptrtoint_as_parameter.ll
+++ b/polly/test/CodeGen/ptrtoint_as_parameter.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; CHECK: if.then260:
; CHECK-NEXT: %p.4 = getelementptr inbounds i8, ptr null, i64 1
diff --git a/polly/test/CodeGen/read-only-scalars.ll b/polly/test/CodeGen/read-only-scalars.ll
index a5e1d27..365cbbc 100644
--- a/polly/test/CodeGen/read-only-scalars.ll
+++ b/polly/test/CodeGen/read-only-scalars.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadPolly -polly-analyze-read-only-scalars=false -polly-codegen \
+; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=false -passes=polly-codegen \
; RUN: \
; RUN: -S < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-analyze-read-only-scalars=true -polly-codegen \
+; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=true -passes=polly-codegen \
; RUN: \
; RUN: -S < %s | FileCheck %s -check-prefix=SCALAR
diff --git a/polly/test/CodeGen/reduction.ll b/polly/test/CodeGen/reduction.ll
index 6e5a230..8c5f707 100644
--- a/polly/test/CodeGen/reduction.ll
+++ b/polly/test/CodeGen/reduction.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s 2>&1 | not FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s 2>&1 | not FileCheck %s
;#include <string.h>
;#include <stdio.h>
diff --git a/polly/test/CodeGen/reduction_2.ll b/polly/test/CodeGen/reduction_2.ll
index 7a50cea..4aa3067 100644
--- a/polly/test/CodeGen/reduction_2.ll
+++ b/polly/test/CodeGen/reduction_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-invariant-load-hoisting=true -polly-print-ast -disable-output < %s | FileCheck %s --allow-empty
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-invariant-load-hoisting=true '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s --allow-empty
;#include <string.h>
;#include <stdio.h>
diff --git a/polly/test/CodeGen/reduction_simple_binary.ll b/polly/test/CodeGen/reduction_simple_binary.ll
index c7c5501..0fe1085 100644
--- a/polly/test/CodeGen/reduction_simple_binary.ll
+++ b/polly/test/CodeGen/reduction_simple_binary.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
;
; CHECK: pragma simd reduction
;
diff --git a/polly/test/CodeGen/region-with-instructions.ll b/polly/test/CodeGen/region-with-instructions.ll
index 28cabef..e5f7d0f 100644
--- a/polly/test/CodeGen/region-with-instructions.ll
+++ b/polly/test/CodeGen/region-with-instructions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
; CHECK-LABEL: polly.stmt.bb48:
; CHECK-NEXT: %[[offset:.*]] = shl i64 %polly.indvar, 3
diff --git a/polly/test/CodeGen/region_exiting-domtree.ll b/polly/test/CodeGen/region_exiting-domtree.ll
index 05983da..06e0d9d 100644
--- a/polly/test/CodeGen/region_exiting-domtree.ll
+++ b/polly/test/CodeGen/region_exiting-domtree.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -verify-dom-info -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-dom-info -disable-output < %s
; Verify that the DominatorTree is preserved correctly for the inserted
; %polly.stmt.exit.exit block, which serves as new exit block for the generated
diff --git a/polly/test/CodeGen/region_multiexit_partialwrite.ll b/polly/test/CodeGen/region_multiexit_partialwrite.ll
index b98d7f5..39e04db 100644
--- a/polly/test/CodeGen/region_multiexit_partialwrite.ll
+++ b/polly/test/CodeGen/region_multiexit_partialwrite.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
;
; This test case has a partial write of a PHI in a region-statement. It
; requires that the new PHINode from the region's exiting block is
diff --git a/polly/test/CodeGen/run-time-condition-with-scev-parameters.ll b/polly/test/CodeGen/run-time-condition-with-scev-parameters.ll
index 0f62a8c..4afaab5 100644
--- a/polly/test/CodeGen/run-time-condition-with-scev-parameters.ll
+++ b/polly/test/CodeGen/run-time-condition-with-scev-parameters.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
; TODO: FIXME: Simplify the context.
; AST: if (n >= 1 && 0 == n <= -1)
diff --git a/polly/test/CodeGen/run-time-condition.ll b/polly/test/CodeGen/run-time-condition.ll
index 0faefad..914b76f 100644
--- a/polly/test/CodeGen/run-time-condition.ll
+++ b/polly/test/CodeGen/run-time-condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -S < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/scalar-references-used-in-scop-compute.ll b/polly/test/CodeGen/scalar-references-used-in-scop-compute.ll
index 3f88942..0b49da0 100644
--- a/polly/test/CodeGen/scalar-references-used-in-scop-compute.ll
+++ b/polly/test/CodeGen/scalar-references-used-in-scop-compute.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
; Test the code generation in the presence of a scalar out-of-scop value being
; used from within the SCoP.
diff --git a/polly/test/CodeGen/scalar-store-from-same-bb.ll b/polly/test/CodeGen/scalar-store-from-same-bb.ll
index ac8fab4..3f232da 100644
--- a/polly/test/CodeGen/scalar-store-from-same-bb.ll
+++ b/polly/test/CodeGen/scalar-store-from-same-bb.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly \
-; RUN: -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly \
+; RUN: -passes=polly-codegen -S < %s | FileCheck %s
; This test ensures that the expression N + 1 that is stored in the phi-node
; alloca, is directly computed and not incorrectly transferred through memory.
diff --git a/polly/test/CodeGen/scalar_codegen_crash.ll b/polly/test/CodeGen/scalar_codegen_crash.ll
index c41a00f..375f097 100644
--- a/polly/test/CodeGen/scalar_codegen_crash.ll
+++ b/polly/test/CodeGen/scalar_codegen_crash.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly \
-; RUN: -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly \
+; RUN: -passes=polly-codegen -S < %s | FileCheck %s
; This test case used to crash the scalar code generation. Check that we
; can generate code for it.
diff --git a/polly/test/CodeGen/scev-backedgetaken.ll b/polly/test/CodeGen/scev-backedgetaken.ll
index 15e12ee..f5e68ec 100644
--- a/polly/test/CodeGen/scev-backedgetaken.ll
+++ b/polly/test/CodeGen/scev-backedgetaken.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; llvm.org/PR48422
; Use of ScalarEvolution in Codegen not possible because DominatorTree is not updated.
diff --git a/polly/test/CodeGen/scev-division-invariant-load.ll b/polly/test/CodeGen/scev-division-invariant-load.ll
index 3156bdc..70f090e 100644
--- a/polly/test/CodeGen/scev-division-invariant-load.ll
+++ b/polly/test/CodeGen/scev-division-invariant-load.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s
;
; Check that we generate valid code as we did not use the preloaded
; value of %tmp1 for the access function of the preloaded %tmp4.
diff --git a/polly/test/CodeGen/scev.ll b/polly/test/CodeGen/scev.ll
index 07d726d..e2b5afd 100644
--- a/polly/test/CodeGen/scev.ll
+++ b/polly/test/CodeGen/scev.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-detect < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define fastcc void @f () inlinehint align 2 {
diff --git a/polly/test/CodeGen/scev_expansion_in_nonaffine.ll b/polly/test/CodeGen/scev_expansion_in_nonaffine.ll
index f61f21d..6c6c257 100644
--- a/polly/test/CodeGen/scev_expansion_in_nonaffine.ll
+++ b/polly/test/CodeGen/scev_expansion_in_nonaffine.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
; bugpoint-reduced testcase of MiBench/consumer-lame/quantize-pvt.c from the
diff --git a/polly/test/CodeGen/scev_looking_through_bitcasts.ll b/polly/test/CodeGen/scev_looking_through_bitcasts.ll
index c87d932..142e83f 100644
--- a/polly/test/CodeGen/scev_looking_through_bitcasts.ll
+++ b/polly/test/CodeGen/scev_looking_through_bitcasts.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; Scalar write of bitcasted value. Instead of writing %b of type
; %structty, the SCEV expression looks through the bitcast such that
diff --git a/polly/test/CodeGen/scop_expander_insert_point.ll b/polly/test/CodeGen/scop_expander_insert_point.ll
index 8492873..92f2772 100644
--- a/polly/test/CodeGen/scop_expander_insert_point.ll
+++ b/polly/test/CodeGen/scop_expander_insert_point.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
;
; CHECK: entry:
diff --git a/polly/test/CodeGen/scop_expander_segfault.ll b/polly/test/CodeGen/scop_expander_segfault.ll
index 293c1e5..d94a1fd 100644
--- a/polly/test/CodeGen/scop_expander_segfault.ll
+++ b/polly/test/CodeGen/scop_expander_segfault.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S %s | FileCheck %s
;
; This test was extracted from gcc in SPEC2006 and it crashed our code
; generation, or to be more precise, the ScopExpander due to an endless
diff --git a/polly/test/CodeGen/scop_never_executed_runtime_check_location.ll b/polly/test/CodeGen/scop_never_executed_runtime_check_location.ll
index 91a5815..9f968e5 100644
--- a/polly/test/CodeGen/scop_never_executed_runtime_check_location.ll
+++ b/polly/test/CodeGen/scop_never_executed_runtime_check_location.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
; Verify that we generate the runtime check code after the conditional branch
; in the SCoP region entering block (here %entry).
diff --git a/polly/test/CodeGen/select-base-pointer.ll b/polly/test/CodeGen/select-base-pointer.ll
index 29bc400..85be377 100644
--- a/polly/test/CodeGen/select-base-pointer.ll
+++ b/polly/test/CodeGen/select-base-pointer.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -tbaa -polly-codegen -disable-output %s
+; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -passes=polly-codegen -disable-output %s
;
; Check that we do not crash here.
;
diff --git a/polly/test/CodeGen/sequential_loops.ll b/polly/test/CodeGen/sequential_loops.ll
index 97d280d..33a3ee9 100644
--- a/polly/test/CodeGen/sequential_loops.ll
+++ b/polly/test/CodeGen/sequential_loops.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
;#include <string.h>
;#define N 1024
diff --git a/polly/test/CodeGen/simple_loop_non_single_exit.ll b/polly/test/CodeGen/simple_loop_non_single_exit.ll
index dc1b09b..a7e36bc 100644
--- a/polly/test/CodeGen/simple_loop_non_single_exit.ll
+++ b/polly/test/CodeGen/simple_loop_non_single_exit.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s -check-prefix=CHECK-CODE
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CHECK-CODE
; void f(long A[], long N) {
; long i;
diff --git a/polly/test/CodeGen/simple_loop_non_single_exit_2.ll b/polly/test/CodeGen/simple_loop_non_single_exit_2.ll
index 178601c..22e9da0 100644
--- a/polly/test/CodeGen/simple_loop_non_single_exit_2.ll
+++ b/polly/test/CodeGen/simple_loop_non_single_exit_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s -check-prefix=CHECK-CODE
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CHECK-CODE
; void f(long A[], long N) {
; long i;
diff --git a/polly/test/CodeGen/simple_non_single_entry.ll b/polly/test/CodeGen/simple_non_single_entry.ll
index 3b4bf59..c33a77a 100644
--- a/polly/test/CodeGen/simple_non_single_entry.ll
+++ b/polly/test/CodeGen/simple_non_single_entry.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s -check-prefix=CHECK-CODE
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CHECK-CODE
; void f(long A[], long N) {
; long i;
diff --git a/polly/test/CodeGen/simple_nonaffine_loop.ll b/polly/test/CodeGen/simple_nonaffine_loop.ll
index d4e9c60..bc62047 100644
--- a/polly/test/CodeGen/simple_nonaffine_loop.ll
+++ b/polly/test/CodeGen/simple_nonaffine_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -polly-allow-nonaffine -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-allow-nonaffine -disable-output < %s | FileCheck %s
;#include <stdio.h>
;#include <stdlib.h>
diff --git a/polly/test/CodeGen/single_do_loop_int_max_iterations.ll b/polly/test/CodeGen/single_do_loop_int_max_iterations.ll
index 9648fbe..a65e3a2 100644
--- a/polly/test/CodeGen/single_do_loop_int_max_iterations.ll
+++ b/polly/test/CodeGen/single_do_loop_int_max_iterations.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
;#define N 20
;#include "limits.h"
diff --git a/polly/test/CodeGen/single_do_loop_int_param_iterations.ll b/polly/test/CodeGen/single_do_loop_int_param_iterations.ll
index f28d828a..acccb48 100644
--- a/polly/test/CodeGen/single_do_loop_int_param_iterations.ll
+++ b/polly/test/CodeGen/single_do_loop_int_param_iterations.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
; XFAIL: *
;define N 20
diff --git a/polly/test/CodeGen/single_do_loop_ll_max_iterations.ll b/polly/test/CodeGen/single_do_loop_ll_max_iterations.ll
index 68aaab9..7a67f6b 100644
--- a/polly/test/CodeGen/single_do_loop_ll_max_iterations.ll
+++ b/polly/test/CodeGen/single_do_loop_ll_max_iterations.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
;#define N 20
;#include "limits.h"
diff --git a/polly/test/CodeGen/single_do_loop_one_iteration.ll b/polly/test/CodeGen/single_do_loop_one_iteration.ll
index 9d97cb8..2d93916 100644
--- a/polly/test/CodeGen/single_do_loop_one_iteration.ll
+++ b/polly/test/CodeGen/single_do_loop_one_iteration.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
; XFAIL: *
;#define N 20
diff --git a/polly/test/CodeGen/single_do_loop_scev_replace.ll b/polly/test/CodeGen/single_do_loop_scev_replace.ll
index 7963d9d..83c9e9d 100644
--- a/polly/test/CodeGen/single_do_loop_scev_replace.ll
+++ b/polly/test/CodeGen/single_do_loop_scev_replace.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
;#define N 20
;#include "limits.h"
diff --git a/polly/test/CodeGen/single_loop.ll b/polly/test/CodeGen/single_loop.ll
index 68cc498..2db3466 100644
--- a/polly/test/CodeGen/single_loop.ll
+++ b/polly/test/CodeGen/single_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
;#include <string.h>
;#define N 1024
diff --git a/polly/test/CodeGen/single_loop_int_max_iterations.ll b/polly/test/CodeGen/single_loop_int_max_iterations.ll
index bfb5e4a..f83e882 100644
--- a/polly/test/CodeGen/single_loop_int_max_iterations.ll
+++ b/polly/test/CodeGen/single_loop_int_max_iterations.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
;#define N 20
;#include "limits.h"
diff --git a/polly/test/CodeGen/single_loop_ll_max_iterations.ll b/polly/test/CodeGen/single_loop_ll_max_iterations.ll
index bdfd7fc..1427189 100644
--- a/polly/test/CodeGen/single_loop_ll_max_iterations.ll
+++ b/polly/test/CodeGen/single_loop_ll_max_iterations.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
;#include "limits.h"
;#define N 20
diff --git a/polly/test/CodeGen/single_loop_one_iteration.ll b/polly/test/CodeGen/single_loop_one_iteration.ll
index 7d4dd59..1a70d4a 100644
--- a/polly/test/CodeGen/single_loop_one_iteration.ll
+++ b/polly/test/CodeGen/single_loop_one_iteration.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
;#define N 20
;
diff --git a/polly/test/CodeGen/single_loop_param.ll b/polly/test/CodeGen/single_loop_param.ll
index 5d72da3..44ce123 100644
--- a/polly/test/CodeGen/single_loop_param.ll
+++ b/polly/test/CodeGen/single_loop_param.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
@A = common global [1024 x i32] zeroinitializer, align 16 ; <ptr> [#uses=3]
diff --git a/polly/test/CodeGen/single_loop_param_less_equal.ll b/polly/test/CodeGen/single_loop_param_less_equal.ll
index e63ee29..fda9bfa 100644
--- a/polly/test/CodeGen/single_loop_param_less_equal.ll
+++ b/polly/test/CodeGen/single_loop_param_less_equal.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN
-; RUN: opt %loadPolly -polly-codegen < %s | opt -passes='print<loops>' -disable-output 2>&1 | FileCheck %s -check-prefix=LOOPS
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s | opt -passes='print<loops>' -disable-output 2>&1 | FileCheck %s -check-prefix=LOOPS
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
@A = common global [1024 x i32] zeroinitializer
diff --git a/polly/test/CodeGen/single_loop_param_less_than.ll b/polly/test/CodeGen/single_loop_param_less_than.ll
index 95130f9..b888c86 100644
--- a/polly/test/CodeGen/single_loop_param_less_than.ll
+++ b/polly/test/CodeGen/single_loop_param_less_than.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
@A = common global [1024 x i32] zeroinitializer
diff --git a/polly/test/CodeGen/single_loop_zero_iterations.ll b/polly/test/CodeGen/single_loop_zero_iterations.ll
index 4f18968..b1ce491 100644
--- a/polly/test/CodeGen/single_loop_zero_iterations.ll
+++ b/polly/test/CodeGen/single_loop_zero_iterations.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=SCALAR --allow-empty
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=SCALAR --allow-empty
;#define N 20
;
diff --git a/polly/test/CodeGen/split_edge_of_exit.ll b/polly/test/CodeGen/split_edge_of_exit.ll
index 56ce215..f4b17e6 100644
--- a/polly/test/CodeGen/split_edge_of_exit.ll
+++ b/polly/test/CodeGen/split_edge_of_exit.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -verify-region-info -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-region-info -disable-output < %s
;
; This is a scop directly preceded by a region, i.e. the scop's entry is the
; region's exit block. This test is to ensure that the RegionInfo is correctly
diff --git a/polly/test/CodeGen/split_edges.ll b/polly/test/CodeGen/split_edges.ll
index e01d901..b921202 100644
--- a/polly/test/CodeGen/split_edges.ll
+++ b/polly/test/CodeGen/split_edges.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -verify-region-info -verify-dom-info -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-region-info -verify-dom-info -S < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
@A = common global [1536 x float] zeroinitializer
diff --git a/polly/test/CodeGen/split_edges_2.ll b/polly/test/CodeGen/split_edges_2.ll
index 4135d6f..8f4d48f 100644
--- a/polly/test/CodeGen/split_edges_2.ll
+++ b/polly/test/CodeGen/split_edges_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -verify-region-info -verify-dom-info -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-region-info -verify-dom-info -S < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/CodeGen/srem-in-other-bb.ll b/polly/test/CodeGen/srem-in-other-bb.ll
index 8bde1a3..a13a1b6 100644
--- a/polly/test/CodeGen/srem-in-other-bb.ll
+++ b/polly/test/CodeGen/srem-in-other-bb.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
; RUN: < %s | FileCheck %s
;
; void pos(float *A, long n) {
diff --git a/polly/test/CodeGen/stack-overflow-in-load-hoisting.ll b/polly/test/CodeGen/stack-overflow-in-load-hoisting.ll
index 02dfe96..cb9d9a2 100644
--- a/polly/test/CodeGen/stack-overflow-in-load-hoisting.ll
+++ b/polly/test/CodeGen/stack-overflow-in-load-hoisting.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -verify-dom-info -polly-codegen -S < %s \
+; RUN: opt %loadNPMPolly -verify-dom-info -passes=polly-codegen -S < %s \
; RUN: -polly-invariant-load-hoisting=true | FileCheck %s
;
; This caused an infinite recursion during invariant load hoisting at some
diff --git a/polly/test/CodeGen/stmt_split_no_dependence.ll b/polly/test/CodeGen/stmt_split_no_dependence.ll
index a395aa1..381cd30 100644
--- a/polly/test/CodeGen/stmt_split_no_dependence.ll
+++ b/polly/test/CodeGen/stmt_split_no_dependence.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; CHECK: store i32 %9, ptr %scevgep, align 4, !alias.scope !1, !noalias !4
; CHECK: store i32 %11, ptr %scevgep4, align 4, !alias.scope !4, !noalias !1
diff --git a/polly/test/CodeGen/switch-in-non-affine-region.ll b/polly/test/CodeGen/switch-in-non-affine-region.ll
index 930755e..1a9e7081b 100644
--- a/polly/test/CodeGen/switch-in-non-affine-region.ll
+++ b/polly/test/CodeGen/switch-in-non-affine-region.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly \
-; RUN: -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly \
+; RUN: -S -passes=polly-codegen < %s | FileCheck %s
;
; void f(int *A, int N) {
; for (int i = 0; i < N; i++)
diff --git a/polly/test/CodeGen/synthesizable_phi_write_after_loop.ll b/polly/test/CodeGen/synthesizable_phi_write_after_loop.ll
index 6a8d3b9..b2a06236 100644
--- a/polly/test/CodeGen/synthesizable_phi_write_after_loop.ll
+++ b/polly/test/CodeGen/synthesizable_phi_write_after_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; Check for the correct written value of a scalar phi write whose value is
; defined within the loop, but its effective value is its last definition when
diff --git a/polly/test/CodeGen/test-invalid-operands-for-select-2.ll b/polly/test/CodeGen/test-invalid-operands-for-select-2.ll
index 5fa4773..5668063 100644
--- a/polly/test/CodeGen/test-invalid-operands-for-select-2.ll
+++ b/polly/test/CodeGen/test-invalid-operands-for-select-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen -verify-loop-info < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen -verify-loop-info < %s | FileCheck %s
;
; Check that we do not crash as described here: http://llvm.org/bugs/show_bug.cgi?id=21167
;
diff --git a/polly/test/CodeGen/test-invalid-operands-for-select.ll b/polly/test/CodeGen/test-invalid-operands-for-select.ll
index 40695af..9f5013c 100644
--- a/polly/test/CodeGen/test-invalid-operands-for-select.ll
+++ b/polly/test/CodeGen/test-invalid-operands-for-select.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
;
; Check that we do not crash as described here: http://llvm.org/PR21167
;
diff --git a/polly/test/CodeGen/test.ll b/polly/test/CodeGen/test.ll
index ac99688e..aad998b 100644
--- a/polly/test/CodeGen/test.ll
+++ b/polly/test/CodeGen/test.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
; XFAIL: *
;int bar1();
diff --git a/polly/test/CodeGen/two-loops-right-after-each-other-2.ll b/polly/test/CodeGen/two-loops-right-after-each-other-2.ll
index a7cae0a..1c68389 100644
--- a/polly/test/CodeGen/two-loops-right-after-each-other-2.ll
+++ b/polly/test/CodeGen/two-loops-right-after-each-other-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
; CHECK: polly.merge_new_and_old:
; CHECK-NEXT: merge = phi
diff --git a/polly/test/CodeGen/two-scops-in-row-invalidate-scevs.ll b/polly/test/CodeGen/two-scops-in-row-invalidate-scevs.ll
index 4470f97..4396c38 100644
--- a/polly/test/CodeGen/two-scops-in-row-invalidate-scevs.ll
+++ b/polly/test/CodeGen/two-scops-in-row-invalidate-scevs.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; CHECK-LABEL: for.cond:
; CHECK: %num.0 = phi i32 [ %add, %for.body15 ], [ 0, %for.cond.pre_entry_bb ]
diff --git a/polly/test/CodeGen/two-scops-in-row.ll b/polly/test/CodeGen/two-scops-in-row.ll
index 3e922cb..dd3f310 100644
--- a/polly/test/CodeGen/two-scops-in-row.ll
+++ b/polly/test/CodeGen/two-scops-in-row.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-print-ast -polly-ignore-aliasing -disable-output < %s | FileCheck %s -check-prefix=SCALAR
-; RUN: opt %loadPolly -polly-codegen -polly-ignore-aliasing -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ignore-aliasing -disable-output < %s | FileCheck %s -check-prefix=SCALAR
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-ignore-aliasing -disable-output < %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; SCALAR: if (
diff --git a/polly/test/CodeGen/udiv_expansion_position.ll b/polly/test/CodeGen/udiv_expansion_position.ll
index bb37fed..354e3cd 100644
--- a/polly/test/CodeGen/udiv_expansion_position.ll
+++ b/polly/test/CodeGen/udiv_expansion_position.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; Verify we do not crash when we synthesize code for the udiv in the SCoP.
;
diff --git a/polly/test/CodeGen/uninitialized_scalar_memory.ll b/polly/test/CodeGen/uninitialized_scalar_memory.ll
index 935ccc3..e08af07 100644
--- a/polly/test/CodeGen/uninitialized_scalar_memory.ll
+++ b/polly/test/CodeGen/uninitialized_scalar_memory.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
;
; Verify we initialize the scalar locations reserved for the incoming phi
; values.
diff --git a/polly/test/CodeGen/unpredictable-loop-unsynthesizable.ll b/polly/test/CodeGen/unpredictable-loop-unsynthesizable.ll
index 9164bb4..4670680 100644
--- a/polly/test/CodeGen/unpredictable-loop-unsynthesizable.ll
+++ b/polly/test/CodeGen/unpredictable-loop-unsynthesizable.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops \
-; RUN: -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-codegen \
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' \
+; RUN: -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -passes=polly-codegen \
; RUN: -polly-invariant-load-hoisting=true -disable-output < %s
; The loop for.body is a scop with invariant load hoisting, but does not
diff --git a/polly/test/CodeGen/variant_load_empty_domain.ll b/polly/test/CodeGen/variant_load_empty_domain.ll
index f5ad0b1..6f2d3dc 100644
--- a/polly/test/CodeGen/variant_load_empty_domain.ll
+++ b/polly/test/CodeGen/variant_load_empty_domain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s
;
;
; void f(int *A) {
diff --git a/polly/test/CodeGen/whole-scop-non-affine-subregion.ll b/polly/test/CodeGen/whole-scop-non-affine-subregion.ll
index 931e644..b342b1c 100644
--- a/polly/test/CodeGen/whole-scop-non-affine-subregion.ll
+++ b/polly/test/CodeGen/whole-scop-non-affine-subregion.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly \
-; RUN: -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly \
+; RUN: -passes=polly-codegen -S < %s | FileCheck %s
; CHECK: polly.start
; int /* pure */ g()
diff --git a/polly/test/DeLICM/confused_order.ll b/polly/test/DeLICM/confused_order.ll
index 2015ebc..0c19eb6 100644
--- a/polly/test/DeLICM/confused_order.ll
+++ b/polly/test/DeLICM/confused_order.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-delicm -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-delicm -disable-output -pass-remarks-missed=polly-delicm < %s 2>&1 | FileCheck %s -check-prefix=REMARKS
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-delicm>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-delicm' -polly-import-jscop-postfix=transformed -disable-output -pass-remarks-missed=polly-delicm < %s 2>&1 | FileCheck %s -check-prefix=REMARKS
;
; ForwardOptree changes the SCoP and may already map some accesses.
; DeLICM must be prepared to encounter implicit reads
diff --git a/polly/test/DeLICM/contradicting_assumed_context_and_domain.ll b/polly/test/DeLICM/contradicting_assumed_context_and_domain.ll
index 4e039b2..66d9ae8 100644
--- a/polly/test/DeLICM/contradicting_assumed_context_and_domain.ll
+++ b/polly/test/DeLICM/contradicting_assumed_context_and_domain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
;
; The domain of bb14 contradicts the SCoP's assumptions. This leads to
; 'anything goes' inside the statement since it is never executed,
diff --git a/polly/test/DeLICM/load-in-cond-inf-loop.ll b/polly/test/DeLICM/load-in-cond-inf-loop.ll
index f0aecfd..f6e2311 100644
--- a/polly/test/DeLICM/load-in-cond-inf-loop.ll
+++ b/polly/test/DeLICM/load-in-cond-inf-loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
; When %b is 0, %for.body13 is an infinite loop. In this case the loaded
; value %1 is not used anywhere.
diff --git a/polly/test/DeLICM/map_memset_zero.ll b/polly/test/DeLICM/map_memset_zero.ll
index 1a08eee..9a8e598 100644
--- a/polly/test/DeLICM/map_memset_zero.ll
+++ b/polly/test/DeLICM/map_memset_zero.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-delicm -disable-output < %s | FileCheck -match-full-lines %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-delicm>)" -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck -match-full-lines %s
;
; Check that PHI mapping works even in the presence of a memset whose
; zero value is used.
diff --git a/polly/test/DeLICM/nomap_alreadymapped.ll b/polly/test/DeLICM/nomap_alreadymapped.ll
index 7adf4ba..da5f4ec 100644
--- a/polly/test/DeLICM/nomap_alreadymapped.ll
+++ b/polly/test/DeLICM/nomap_alreadymapped.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
;
; void func(double *A) {
; for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/nomap_escaping.ll b/polly/test/DeLICM/nomap_escaping.ll
index 034c0a9..6095536 100644
--- a/polly/test/DeLICM/nomap_escaping.ll
+++ b/polly/test/DeLICM/nomap_escaping.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
;
; void func(double *A) {
; for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/nomap_occupied.ll b/polly/test/DeLICM/nomap_occupied.ll
index db33532..9ba8ce2 100644
--- a/polly/test/DeLICM/nomap_occupied.ll
+++ b/polly/test/DeLICM/nomap_occupied.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
;
; void func(double *A) {
; for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/nomap_readonly.ll b/polly/test/DeLICM/nomap_readonly.ll
index 1f3b574..7a185d3 100644
--- a/polly/test/DeLICM/nomap_readonly.ll
+++ b/polly/test/DeLICM/nomap_readonly.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
;
; void func(double *A) {
; fsomeval = 21.0 + 21.0;
diff --git a/polly/test/DeLICM/nomap_spuriouswrite.ll b/polly/test/DeLICM/nomap_spuriouswrite.ll
index ef470f7..0ed7f6e 100644
--- a/polly/test/DeLICM/nomap_spuriouswrite.ll
+++ b/polly/test/DeLICM/nomap_spuriouswrite.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
;
; void func(double *A) {
; for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/nomap_storagesize.ll b/polly/test/DeLICM/nomap_storagesize.ll
index fab8d54..bf851ac 100644
--- a/polly/test/DeLICM/nomap_storagesize.ll
+++ b/polly/test/DeLICM/nomap_storagesize.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
;
; void func(float *A) {
; for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/nomap_writewrite.ll b/polly/test/DeLICM/nomap_writewrite.ll
index 06192d9..9fcd52a 100644
--- a/polly/test/DeLICM/nomap_writewrite.ll
+++ b/polly/test/DeLICM/nomap_writewrite.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
;
; void func(double *A) {
; for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/outofquota-reverseDomain.ll b/polly/test/DeLICM/outofquota-reverseDomain.ll
index d40ee03..1f7527c8 100644
--- a/polly/test/DeLICM/outofquota-reverseDomain.ll
+++ b/polly/test/DeLICM/outofquota-reverseDomain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-delicm-max-ops=1000000 -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-delicm-max-ops=1000000 '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
;
; This causes an assertion to fail on out-of-quota after 1000000 operations.
; (The error was specific to -polly-delicm-max-ops=1000000 and changes
diff --git a/polly/test/DeLICM/pass_existence.ll b/polly/test/DeLICM/pass_existence.ll
index 7ed2da9..64302d9 100644
--- a/polly/test/DeLICM/pass_existence.ll
+++ b/polly/test/DeLICM/pass_existence.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-delicm -disable-output < %s
-; RUN: opt %loadPolly -polly-print-delicm -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly "-passes=scop(print<polly-delicm>)" -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-delicm -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=scop(print<polly-delicm>)' -disable-output < %s | FileCheck %s
;
; Simple test for the existence of the DeLICM pass.
;
diff --git a/polly/test/DeLICM/pr41656.ll b/polly/test/DeLICM/pr41656.ll
index 965ad9f..d7cfde3 100644
--- a/polly/test/DeLICM/pr41656.ll
+++ b/polly/test/DeLICM/pr41656.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>,scop(print<polly-delicm>)' -disable-output < %s 2>&1 | FileCheck %s
;
; llvm.org/PR41656
;
diff --git a/polly/test/DeLICM/pr48783.ll b/polly/test/DeLICM/pr48783.ll
index 3cbd54b..e3c3eb6 100644
--- a/polly/test/DeLICM/pr48783.ll
+++ b/polly/test/DeLICM/pr48783.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>,scop(print<polly-delicm>)' -disable-output < %s 2>&1 | FileCheck %s
;
; llvm.org/PR48783
;
diff --git a/polly/test/DeLICM/reduction.ll b/polly/test/DeLICM/reduction.ll
index 78c1a4c..29b7a36 100644
--- a/polly/test/DeLICM/reduction.ll
+++ b/polly/test/DeLICM/reduction.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-delicm-partial-writes=true -polly-print-delicm -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-delicm-partial-writes=true '-passes=print<polly-delicm>' -disable-output < %s | FileCheck -match-full-lines %s
;
; void func(double *A) {
; for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reduction_looprotate_gvnpre_cond1.ll b/polly/test/DeLICM/reduction_looprotate_gvnpre_cond1.ll
index b5bc0d5..d9c5268 100644
--- a/polly/test/DeLICM/reduction_looprotate_gvnpre_cond1.ll
+++ b/polly/test/DeLICM/reduction_looprotate_gvnpre_cond1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Load (but not store) of A[j] hoisted, reduction only over some iterations.
;
diff --git a/polly/test/DeLICM/reduction_looprotate_gvnpre_cond2.ll b/polly/test/DeLICM/reduction_looprotate_gvnpre_cond2.ll
index e995be1..6a4223f 100644
--- a/polly/test/DeLICM/reduction_looprotate_gvnpre_cond2.ll
+++ b/polly/test/DeLICM/reduction_looprotate_gvnpre_cond2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Load (but not store) of A[j] hoisted, reduction not written in all iterations.
; FIXME: %join is not mapped because the MemoryKind::Value mapping does not
diff --git a/polly/test/DeLICM/reduction_looprotate_gvnpre_nopreheader.ll b/polly/test/DeLICM/reduction_looprotate_gvnpre_nopreheader.ll
index ca3a121..bf4b801 100644
--- a/polly/test/DeLICM/reduction_looprotate_gvnpre_nopreheader.ll
+++ b/polly/test/DeLICM/reduction_looprotate_gvnpre_nopreheader.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Hoisted reduction load (but not the store) without preheader.
;
diff --git a/polly/test/DeLICM/reduction_looprotate_licm_nopreheader.ll b/polly/test/DeLICM/reduction_looprotate_licm_nopreheader.ll
index 4153823..027df44 100644
--- a/polly/test/DeLICM/reduction_looprotate_licm_nopreheader.ll
+++ b/polly/test/DeLICM/reduction_looprotate_licm_nopreheader.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
;
; Register-promoted reduction but without preheader.
;
diff --git a/polly/test/DeLICM/reduction_looprotate_loopguard_gvnpre.ll b/polly/test/DeLICM/reduction_looprotate_loopguard_gvnpre.ll
index 35c723e8..4ea3fa5 100644
--- a/polly/test/DeLICM/reduction_looprotate_loopguard_gvnpre.ll
+++ b/polly/test/DeLICM/reduction_looprotate_loopguard_gvnpre.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Reduction over parametric number of elements and a loopguard if the
; reduction loop is not executed at all. Load hoisted before loop.
diff --git a/polly/test/DeLICM/reduction_looprotate_loopguard_licm1.ll b/polly/test/DeLICM/reduction_looprotate_loopguard_licm1.ll
index 2b5f4d81..2e7abe4 100644
--- a/polly/test/DeLICM/reduction_looprotate_loopguard_licm1.ll
+++ b/polly/test/DeLICM/reduction_looprotate_loopguard_licm1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Reduction over parametric number of elements and a loopguard if the
; reduction loop is not executed at all.
diff --git a/polly/test/DeLICM/reduction_looprotate_loopguard_licm2.ll b/polly/test/DeLICM/reduction_looprotate_loopguard_licm2.ll
index 2e92813..60afdeb5f 100644
--- a/polly/test/DeLICM/reduction_looprotate_loopguard_licm2.ll
+++ b/polly/test/DeLICM/reduction_looprotate_loopguard_licm2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Reduction over parametric number of elements and a loopguard if the
; reduction loop is not executed at all, such that A[j] is also not written to.
diff --git a/polly/test/DeLICM/reduction_looprotate_loopguard_licm3.ll b/polly/test/DeLICM/reduction_looprotate_loopguard_licm3.ll
index 784c8ef..e63b457 100644
--- a/polly/test/DeLICM/reduction_looprotate_loopguard_licm3.ll
+++ b/polly/test/DeLICM/reduction_looprotate_loopguard_licm3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Reduction over parametric number of elements and a loopguard if the
; reduction loop is not executed at all, such that A[j] is also not accessed.
diff --git a/polly/test/DeLICM/reduction_unrelatedunusual.ll b/polly/test/DeLICM/reduction_unrelatedunusual.ll
index 04c4377..97826f6 100644
--- a/polly/test/DeLICM/reduction_unrelatedunusual.ll
+++ b/polly/test/DeLICM/reduction_unrelatedunusual.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-delicm-partial-writes=true -polly-print-delicm -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-delicm-partial-writes=true '-passes=print<polly-delicm>' -disable-output < %s | FileCheck -match-full-lines %s
;
; Map %add and %phi to A[j].
; The non-analyzable store to C[0] is unrelated and can be ignored.
diff --git a/polly/test/DeLICM/reject_loadafterstore.ll b/polly/test/DeLICM/reject_loadafterstore.ll
index 8af6e5e..4460620 100644
--- a/polly/test/DeLICM/reject_loadafterstore.ll
+++ b/polly/test/DeLICM/reject_loadafterstore.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-delicm -disable-output -pass-remarks-missed=polly-delicm < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output -pass-remarks-missed=polly-delicm < %s 2>&1 | FileCheck %s
;
; void func(double *A) {
; for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reject_outofquota.ll b/polly/test/DeLICM/reject_outofquota.ll
index 551431f..820679a 100644
--- a/polly/test/DeLICM/reject_outofquota.ll
+++ b/polly/test/DeLICM/reject_outofquota.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-delicm -pass-remarks-analysis=polly-delicm -polly-delicm-max-ops=1 -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadPolly -polly-delicm -polly-print-dependences -polly-delicm-max-ops=1 -polly-dependences-computeout=0 -disable-output < %s | FileCheck %s -check-prefix=DEP
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -pass-remarks-analysis=polly-delicm -polly-delicm-max-ops=1 -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-delicm,print<polly-dependences>' -polly-delicm-max-ops=1 -polly-dependences-computeout=0 -disable-output < %s | FileCheck %s -check-prefix=DEP
;
; void func(double *A) {
; for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reject_storeafterstore.ll b/polly/test/DeLICM/reject_storeafterstore.ll
index 1ec5ef6..ddd13da 100644
--- a/polly/test/DeLICM/reject_storeafterstore.ll
+++ b/polly/test/DeLICM/reject_storeafterstore.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
;
; void func(double *A) {
; for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reject_storeinsubregion.ll b/polly/test/DeLICM/reject_storeinsubregion.ll
index 1d38e80..c987156 100644
--- a/polly/test/DeLICM/reject_storeinsubregion.ll
+++ b/polly/test/DeLICM/reject_storeinsubregion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
;
; void func(double *A) {
; for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reject_unusualstore.ll b/polly/test/DeLICM/reject_unusualstore.ll
index a18a0c3..342888c 100644
--- a/polly/test/DeLICM/reject_unusualstore.ll
+++ b/polly/test/DeLICM/reject_unusualstore.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-delicm -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-delicm -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=STATS
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -passes=polly-delicm -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=STATS
; REQUIRES: asserts
;
; void func(double *A) {
diff --git a/polly/test/DeLICM/skip_maywrite.ll b/polly/test/DeLICM/skip_maywrite.ll
index 1e5f6b1..0d30791 100644
--- a/polly/test/DeLICM/skip_maywrite.ll
+++ b/polly/test/DeLICM/skip_maywrite.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
;
; void func(double *A) {
; for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/skip_multiaccess.ll b/polly/test/DeLICM/skip_multiaccess.ll
index 6a8c8e5..a7c79f7 100644
--- a/polly/test/DeLICM/skip_multiaccess.ll
+++ b/polly/test/DeLICM/skip_multiaccess.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
;
; llvm.org/PR34485
; llvm.org/PR34989
diff --git a/polly/test/DeLICM/skip_notinloop.ll b/polly/test/DeLICM/skip_notinloop.ll
index 0730a3a..8e265e1 100644
--- a/polly/test/DeLICM/skip_notinloop.ll
+++ b/polly/test/DeLICM/skip_notinloop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
;
; void func(double *A) {
; double phi = 0.0;
diff --git a/polly/test/DeLICM/skip_scalaraccess.ll b/polly/test/DeLICM/skip_scalaraccess.ll
index fa95d38..2cf13af 100644
--- a/polly/test/DeLICM/skip_scalaraccess.ll
+++ b/polly/test/DeLICM/skip_scalaraccess.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
;
; void func(double *A) {
; for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeadCodeElimination/chained_iterations.ll b/polly/test/DeadCodeElimination/chained_iterations.ll
index b79fdd6..f3bf07b 100644
--- a/polly/test/DeadCodeElimination/chained_iterations.ll
+++ b/polly/test/DeadCodeElimination/chained_iterations.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S %loadPolly -basic-aa -polly-dependences-analysis-type=value-based -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt -S %loadPolly -basic-aa -polly-dependences-analysis-type=value-based -polly-dce -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=CHECK-DCE
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-dce,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=CHECK-DCE
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
;
; for(i = 0; i < 200; i++ )
diff --git a/polly/test/DeadCodeElimination/chained_iterations_2.ll b/polly/test/DeadCodeElimination/chained_iterations_2.ll
index 1d1af92..52f034f 100644
--- a/polly/test/DeadCodeElimination/chained_iterations_2.ll
+++ b/polly/test/DeadCodeElimination/chained_iterations_2.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S %loadPolly -basic-aa -polly-dependences-analysis-type=value-based -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt -S %loadPolly -basic-aa -polly-dependences-analysis-type=value-based -polly-dce -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=CHECK-DCE
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-dce,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=CHECK-DCE
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
;
; for(i = 0; i < 200; i++ )
diff --git a/polly/test/DeadCodeElimination/computeout.ll b/polly/test/DeadCodeElimination/computeout.ll
index 51850d7..e54df42 100644
--- a/polly/test/DeadCodeElimination/computeout.ll
+++ b/polly/test/DeadCodeElimination/computeout.ll
@@ -1,6 +1,5 @@
-; RUN: opt -S %loadPolly -basic-aa -polly-dce -polly-print-ast -disable-output < %s | FileCheck %s
; RUN: opt -S %loadNPMPolly "-passes=scop(polly-dce,print<polly-ast>)" < %s | FileCheck %s
-; RUN: opt -S %loadPolly -basic-aa -polly-dce -polly-print-ast -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa "-passes=scop(polly-dce,print<polly-ast>)" -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
; for(i = 0; i < 100; i++ )
diff --git a/polly/test/DeadCodeElimination/dead_iteration_elimination.ll b/polly/test/DeadCodeElimination/dead_iteration_elimination.ll
index f496f78..c102f60 100644
--- a/polly/test/DeadCodeElimination/dead_iteration_elimination.ll
+++ b/polly/test/DeadCodeElimination/dead_iteration_elimination.ll
@@ -1,4 +1,3 @@
-; RUN: opt -S %loadPolly -basic-aa -polly-dependences-analysis-type=value-based -polly-dce -polly-dce-precise-steps=2 -polly-print-ast -disable-output < %s | FileCheck %s
; RUN: opt -S %loadNPMPolly "-passes=scop(polly-dce,print<polly-ast>)" -polly-dependences-analysis-type=value-based -polly-dce-precise-steps=2 < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
;
diff --git a/polly/test/DeadCodeElimination/non-affine-affine-mix.ll b/polly/test/DeadCodeElimination/non-affine-affine-mix.ll
index e6a5dd2..36f5547 100644
--- a/polly/test/DeadCodeElimination/non-affine-affine-mix.ll
+++ b/polly/test/DeadCodeElimination/non-affine-affine-mix.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine -polly-dce -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-dce,print<polly-ast>' -disable-output < %s | FileCheck %s
;
; void f(int *A) {
; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/DeadCodeElimination/non-affine.ll b/polly/test/DeadCodeElimination/non-affine.ll
index 38a7fcb..ef528b4 100644
--- a/polly/test/DeadCodeElimination/non-affine.ll
+++ b/polly/test/DeadCodeElimination/non-affine.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine -polly-dce -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-dce,print<polly-ast>' -disable-output < %s | FileCheck %s
;
; CHECK: for (int c0 = 0; c0 <= 1023; c0 += 1)
;
diff --git a/polly/test/DeadCodeElimination/null_schedule.ll b/polly/test/DeadCodeElimination/null_schedule.ll
index 633a84b..01d34e9 100644
--- a/polly/test/DeadCodeElimination/null_schedule.ll
+++ b/polly/test/DeadCodeElimination/null_schedule.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S %loadPolly -basic-aa -polly-dependences-analysis-type=value-based -polly-dce -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=CHECK-DCE
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-dce,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=CHECK-DCE
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
; A[0] = 1;
;
diff --git a/polly/test/DependenceInfo/computeout.ll b/polly/test/DependenceInfo/computeout.ll
index 048de29..c2a3456 100644
--- a/polly/test/DependenceInfo/computeout.ll
+++ b/polly/test/DependenceInfo/computeout.ll
@@ -1,7 +1,5 @@
-; RUN: opt -S %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s -check-prefix=VALUE
-; RUN: opt -S %loadPolly -polly-print-function-dependences -disable-output < %s | FileCheck %s -check-prefix=FUNC-VALUE
-; RUN: opt -S %loadPolly -polly-print-dependences -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT
-; RUN: opt -S %loadPolly -polly-print-function-dependences -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT
+; RUN: opt -S %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s -check-prefix=VALUE
+; RUN: opt -S %loadNPMPolly '-passes=print<polly-dependences>' -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
; for(i = 0; i < 100; i++ )
diff --git a/polly/test/DependenceInfo/different_schedule_dimensions.ll b/polly/test/DependenceInfo/different_schedule_dimensions.ll
index 3f96616..f89791f 100644
--- a/polly/test/DependenceInfo/different_schedule_dimensions.ll
+++ b/polly/test/DependenceInfo/different_schedule_dimensions.ll
@@ -1,7 +1,5 @@
-; RUN: opt -S %loadPolly -polly-print-dependences \
+; RUN: opt -S %loadNPMPolly '-passes=print<polly-dependences>' \
; RUN: -disable-output < %s | FileCheck %s
-; RUN: opt -S %loadPolly -polly-print-function-dependences \
-; RUN: -disable-output < %s | FileCheck %s -check-prefix=FUNC
; CHECK: RAW dependences:
; CHECK: { Stmt_bb9[0] -> Stmt_bb10[0] }
diff --git a/polly/test/DependenceInfo/do_pluto_matmult.ll b/polly/test/DependenceInfo/do_pluto_matmult.ll
index d71608e..b88cf9b 100644
--- a/polly/test/DependenceInfo/do_pluto_matmult.ll
+++ b/polly/test/DependenceInfo/do_pluto_matmult.ll
@@ -1,7 +1,5 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-dependences -polly-dependences-analysis-type=value-based -disable-output < %s | FileCheck %s -check-prefix=VALUE
-; RUN: opt %loadPolly -basic-aa -polly-print-dependences -polly-dependences-analysis-type=memory-based -disable-output < %s | FileCheck %s -check-prefix=MEMORY
-; RUN: opt %loadPolly -basic-aa -polly-print-function-dependences -polly-dependences-analysis-type=value-based -disable-output < %s | FileCheck %s -check-prefix=FUNC-VALUE
-; RUN: opt %loadPolly -basic-aa -polly-print-function-dependences -polly-dependences-analysis-type=memory-based -disable-output < %s | FileCheck %s -check-prefix=FUNC-MEMORY
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-dependences-analysis-type=value-based -disable-output < %s | FileCheck %s -check-prefix=VALUE
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-dependences-analysis-type=memory-based -disable-output < %s | FileCheck %s -check-prefix=MEMORY
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/DependenceInfo/fine_grain_dep_0.ll b/polly/test/DependenceInfo/fine_grain_dep_0.ll
index 9c79e36..f93814c 100644
--- a/polly/test/DependenceInfo/fine_grain_dep_0.ll
+++ b/polly/test/DependenceInfo/fine_grain_dep_0.ll
@@ -1,7 +1,6 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-dependences -polly-dependences-analysis-type=value-based -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s --check-prefix=REF
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-dependences -polly-dependences-analysis-type=value-based -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s --check-prefix=ACC
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-function-dependences -polly-dependences-analysis-type=value-based -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s --check-prefix=ACC
-;
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-dependences>' -polly-dependences-analysis-type=value-based -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s --check-prefix=REF
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-dependences>' -polly-dependences-analysis-type=value-based -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s --check-prefix=ACC
+
; REF: RAW dependences:
; REF-NEXT: [N] -> { [Stmt_for_body[i0] -> MemRef_b[]] -> [Stmt_for_body[6 + i0] -> MemRef_b[]] : 0 <= i0 <= -13 + N; Stmt_for_body[i0] -> Stmt_for_body[6 + i0] : 0 <= i0 <= -13 + N; Stmt_for_body[i0] -> Stmt_for_body[4 + i0] : 0 <= i0 <= -11 + N; [Stmt_for_body[i0] -> MemRef_a[]] -> [Stmt_for_body[4 + i0] -> MemRef_a[]] : 0 <= i0 <= -11 + N }
; REF-NEXT: WAR dependences:
diff --git a/polly/test/DependenceInfo/generate_may_write_dependence_info.ll b/polly/test/DependenceInfo/generate_may_write_dependence_info.ll
index 0b7f2d4..6773234 100644
--- a/polly/test/DependenceInfo/generate_may_write_dependence_info.ll
+++ b/polly/test/DependenceInfo/generate_may_write_dependence_info.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s -check-prefix=VALUE
+; RUN: opt -S %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s -check-prefix=VALUE
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
; for (int i = 0; i < N; i++) {
diff --git a/polly/test/DependenceInfo/infeasible_context.ll b/polly/test/DependenceInfo/infeasible_context.ll
index d701b82..cde3102d 100644
--- a/polly/test/DependenceInfo/infeasible_context.ll
+++ b/polly/test/DependenceInfo/infeasible_context.ll
@@ -1,10 +1,9 @@
-; RUN: opt %loadPolly -polly-print-function-scops -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 \
; RUN: | FileCheck %s -check-prefix=FUNC-SCOP
-; RUN: opt %loadPolly -polly-print-function-dependences -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,scop(print<polly-dependences>)' -disable-output < %s 2>&1 \
; RUN: | FileCheck %s -check-prefix=FUNC-DEPS
;
; FUNC-SCOP-NOT: Statement
-; FUNC-DEPS-LABEL: Printing analysis 'Polly - Calculate dependences for all the SCoPs of a function' for function 'readgeo'
; FUNC-DEPS-NOT: RAW dependences
;
; Due to an infeasible run-time check, scop object is empty and we do not compute dependences.
diff --git a/polly/test/DependenceInfo/may_writes_do_not_block_must_writes_for_war.ll b/polly/test/DependenceInfo/may_writes_do_not_block_must_writes_for_war.ll
index 09c5162..392a347 100644
--- a/polly/test/DependenceInfo/may_writes_do_not_block_must_writes_for_war.ll
+++ b/polly/test/DependenceInfo/may_writes_do_not_block_must_writes_for_war.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
;
; Verify that the presence of a may-write (S1) between a read (S0) and a
; must-write (S2) does not block the generation of RAW dependences. This makes
diff --git a/polly/test/DependenceInfo/nonaffine-condition-buildMemoryAccess.ll b/polly/test/DependenceInfo/nonaffine-condition-buildMemoryAccess.ll
index 25c7e3d..ae5fd3b 100644
--- a/polly/test/DependenceInfo/nonaffine-condition-buildMemoryAccess.ll
+++ b/polly/test/DependenceInfo/nonaffine-condition-buildMemoryAccess.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-allow-nonaffine-loops -polly-allow-nonaffine -debug-only=polly-dependence < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-allow-nonaffine-loops -polly-allow-nonaffine -debug-only=polly-dependence < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
; CHECK: MayWriteAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/DependenceInfo/reduction_complex_location.ll b/polly/test/DependenceInfo/reduction_complex_location.ll
index 7ca8399..7722ee9 100644
--- a/polly/test/DependenceInfo/reduction_complex_location.ll
+++ b/polly/test/DependenceInfo/reduction_complex_location.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-dependences -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-dependences -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s
;
; CHECK: RAW dependences:
; CHECK-NEXT: { }
diff --git a/polly/test/DependenceInfo/reduction_dependences_equal_non_reduction_dependences.ll b/polly/test/DependenceInfo/reduction_dependences_equal_non_reduction_dependences.ll
index 3632bd2..840d1f3 100644
--- a/polly/test/DependenceInfo/reduction_dependences_equal_non_reduction_dependences.ll
+++ b/polly/test/DependenceInfo/reduction_dependences_equal_non_reduction_dependences.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-stmt-granularity=bb -polly-print-dependences -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
;
; This loopnest contains a reduction which imposes the same dependences as the
; accesses to the array A. We need to ensure we keep the dependences of A.
diff --git a/polly/test/DependenceInfo/reduction_dependences_not_null.ll b/polly/test/DependenceInfo/reduction_dependences_not_null.ll
index 69fd744..56d84a9 100644
--- a/polly/test/DependenceInfo/reduction_dependences_not_null.ll
+++ b/polly/test/DependenceInfo/reduction_dependences_not_null.ll
@@ -1,7 +1,7 @@
; Test that the reduction dependences are always initialised, even in a case
; where we have no reduction. If this object is NULL, then isl operations on
; it will fail.
-; RUN: opt -S %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s -check-prefix=VALUE
+; RUN: opt -S %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s -check-prefix=VALUE
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
; for(i = 0; i < 100; i++ )
diff --git a/polly/test/DependenceInfo/reduction_mixed_reduction_and_non_reduction_dependences.ll b/polly/test/DependenceInfo/reduction_mixed_reduction_and_non_reduction_dependences.ll
index 71903d9..76c7fc6 100644
--- a/polly/test/DependenceInfo/reduction_mixed_reduction_and_non_reduction_dependences.ll
+++ b/polly/test/DependenceInfo/reduction_mixed_reduction_and_non_reduction_dependences.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
;
; CHECK: RAW dependences:
; CHECK-NEXT: { Stmt_for_body3[i0, i1] -> Stmt_for_body3[i0 + i1, o1] : i0 >= 0 and 0 <= i1 <= 1023 - i0 and i1 <= 1 and 0 < o1 <= 511 }
diff --git a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum.ll b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum.ll
index 234de5c..02b814a 100644
--- a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum.ll
+++ b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum.ll
@@ -1,6 +1,6 @@
-; RUN: opt -basic-aa %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s
-; RUN: opt -basic-aa %loadPolly -polly-print-dependences -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s
-; RUN: opt -basic-aa %loadPolly -polly-print-dependences -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=print<polly-dependences>' -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=print<polly-dependences>' -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s
;
; Verify that only the inner reduction like accesses cause reduction dependences
;
diff --git a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_2.ll b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_2.ll
index acd674d..91bd35d 100644
--- a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_2.ll
+++ b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-dependences -basic-aa -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s
;
; CHECK: RAW dependences:
; CHECK-NEXT: { }
diff --git a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_3.ll b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_3.ll
index bdfcfc9..040d513 100644
--- a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_3.ll
+++ b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-dependences -basic-aa -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s
;
; CHECK: Reduction dependences:
; CHECK-NEXT: { Stmt_for_inc[i0, i1] -> Stmt_for_inc[i0, 1 + i1] : 0 <= i0 <= 99 and 0 <= i1 <= 98 }
diff --git a/polly/test/DependenceInfo/reduction_multiple_reductions.ll b/polly/test/DependenceInfo/reduction_multiple_reductions.ll
index cf70508..527a8cf 100644
--- a/polly/test/DependenceInfo/reduction_multiple_reductions.ll
+++ b/polly/test/DependenceInfo/reduction_multiple_reductions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-dependences -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
;
; Verify we do not have dependences between the if and the else clause
;
diff --git a/polly/test/DependenceInfo/reduction_multiple_reductions_2.ll b/polly/test/DependenceInfo/reduction_multiple_reductions_2.ll
index 8d8557a..fb5fd96 100644
--- a/polly/test/DependenceInfo/reduction_multiple_reductions_2.ll
+++ b/polly/test/DependenceInfo/reduction_multiple_reductions_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-dependences -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
;
;
; These are the important RAW dependences, as they need to originate/end in only one iteration:
diff --git a/polly/test/DependenceInfo/reduction_only_reduction_like_access.ll b/polly/test/DependenceInfo/reduction_only_reduction_like_access.ll
index 7b4a68a..3ec3920 100644
--- a/polly/test/DependenceInfo/reduction_only_reduction_like_access.ll
+++ b/polly/test/DependenceInfo/reduction_only_reduction_like_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
;
; FIXME: Change the comment once we allow different pointers
; The statement is "almost" reduction like but should not yield any reduction dependences
diff --git a/polly/test/DependenceInfo/reduction_partially_escaping_intermediate_in_other_stmt.ll b/polly/test/DependenceInfo/reduction_partially_escaping_intermediate_in_other_stmt.ll
index 0d09e5a..23bd8ef 100644
--- a/polly/test/DependenceInfo/reduction_partially_escaping_intermediate_in_other_stmt.ll
+++ b/polly/test/DependenceInfo/reduction_partially_escaping_intermediate_in_other_stmt.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-dependences -basic-aa -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s
;
; CHECK: Reduction dependences:
; CHECK-NEXT: [N] -> { Stmt_for_body3[i0, i1] -> Stmt_for_body3[i0, 1 + i1] : 0 <= i0 <= 1023 and i1 >= 0 and 1024 - N + i0 <= i1 <= 1022 }
diff --git a/polly/test/DependenceInfo/reduction_privatization_deps.ll b/polly/test/DependenceInfo/reduction_privatization_deps.ll
index ce90e21..0e0f717 100644
--- a/polly/test/DependenceInfo/reduction_privatization_deps.ll
+++ b/polly/test/DependenceInfo/reduction_privatization_deps.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
;
; CHECK: RAW dependences:
; CHECK-NEXT: { Stmt_S1[i0, i1] -> Stmt_S2[-1 + i0 + i1] : 0 <= i0 <= 1023 and i1 >= 0 and -i0 < i1 <= 1024 - i0 and i1 <= 1023; Stmt_S0[i0] -> Stmt_S1[o0, i0 - o0] : i0 <= 1023 and 0 <= o0 <= i0 }
diff --git a/polly/test/DependenceInfo/reduction_privatization_deps_2.ll b/polly/test/DependenceInfo/reduction_privatization_deps_2.ll
index 4904004..cafa319 100644
--- a/polly/test/DependenceInfo/reduction_privatization_deps_2.ll
+++ b/polly/test/DependenceInfo/reduction_privatization_deps_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
;
; We have privatization dependences from a textually later statement to a
; textually earlier one, but the dependences still go forward in time.
diff --git a/polly/test/DependenceInfo/reduction_privatization_deps_3.ll b/polly/test/DependenceInfo/reduction_privatization_deps_3.ll
index a3935eb..d86da92 100644
--- a/polly/test/DependenceInfo/reduction_privatization_deps_3.ll
+++ b/polly/test/DependenceInfo/reduction_privatization_deps_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
;
; CHECK: RAW dependences:
; CHECK-NEXT: { Stmt_S1[i0] -> Stmt_S3[2 + i0] : 0 <= i0 <= 96; Stmt_S2[i0, i1] -> Stmt_S3[o0] : i1 <= 1 - i0 and -i1 < o0 <= 1 and o0 <= 1 + i0 - i1; Stmt_S3[i0] -> Stmt_S2[o0, 1 - i0] : 0 <= i0 <= 1 and i0 < o0 <= 98 }
diff --git a/polly/test/DependenceInfo/reduction_privatization_deps_4.ll b/polly/test/DependenceInfo/reduction_privatization_deps_4.ll
index 10d726a..d84c04f 100644
--- a/polly/test/DependenceInfo/reduction_privatization_deps_4.ll
+++ b/polly/test/DependenceInfo/reduction_privatization_deps_4.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
;
; CHECK: RAW dependences:
; CHECK-NEXT: { Stmt_S1[i0] -> Stmt_S2[i0, i0] : 0 <= i0 <= 98; Stmt_S2[i0, i0] -> Stmt_S3[i0] : 0 <= i0 <= 98; Stmt_S3[i0] -> Stmt_S2[o0, i0] : i0 >= 0 and i0 < o0 <= 98; Stmt_S2[i0, i1] -> Stmt_S1[i1] : i0 >= 0 and i0 < i1 <= 98 }
diff --git a/polly/test/DependenceInfo/reduction_privatization_deps_5.ll b/polly/test/DependenceInfo/reduction_privatization_deps_5.ll
index e8d5118..592c723 100644
--- a/polly/test/DependenceInfo/reduction_privatization_deps_5.ll
+++ b/polly/test/DependenceInfo/reduction_privatization_deps_5.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
;
; CHECK: RAW dependences:
; CHECK-NEXT: { Stmt_S1[i0, 0] -> Stmt_S2[i0, 0] : 0 <= i0 <= 98; Stmt_S2[i0, 0] -> Stmt_S1[1 + i0, 0] : 0 <= i0 <= 97 }
diff --git a/polly/test/DependenceInfo/reduction_sequence.ll b/polly/test/DependenceInfo/reduction_sequence.ll
index 4a46889..7ce9d37 100644
--- a/polly/test/DependenceInfo/reduction_sequence.ll
+++ b/polly/test/DependenceInfo/reduction_sequence.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
; void manyreductions(long *A) {
; for (long i = 0; i < 1024; i++)
diff --git a/polly/test/DependenceInfo/reduction_simple_iv.ll b/polly/test/DependenceInfo/reduction_simple_iv.ll
index e3307af..d13d14e 100644
--- a/polly/test/DependenceInfo/reduction_simple_iv.ll
+++ b/polly/test/DependenceInfo/reduction_simple_iv.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
;
; CHECK: RAW dependences:
; CHECK-NEXT: { }
diff --git a/polly/test/DependenceInfo/reduction_simple_iv_debug_wrapped_dependences.ll b/polly/test/DependenceInfo/reduction_simple_iv_debug_wrapped_dependences.ll
index c7651c3..4c97fbb 100644
--- a/polly/test/DependenceInfo/reduction_simple_iv_debug_wrapped_dependences.ll
+++ b/polly/test/DependenceInfo/reduction_simple_iv_debug_wrapped_dependences.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-dependences -debug-only=polly-dependence -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -debug-only=polly-dependence -disable-output < %s 2>&1 | FileCheck %s
;
; REQUIRES: asserts
;
diff --git a/polly/test/DependenceInfo/reduction_simple_privatization_deps_2.ll b/polly/test/DependenceInfo/reduction_simple_privatization_deps_2.ll
index b61fd84..804005c 100644
--- a/polly/test/DependenceInfo/reduction_simple_privatization_deps_2.ll
+++ b/polly/test/DependenceInfo/reduction_simple_privatization_deps_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
;
; CHECK: RAW dependences:
; CHECK-NEXT: { Stmt_S1[i0, i1] -> Stmt_S2[i0] : 0 <= i0 <= 99 and 0 <= i1 <= 99; Stmt_S0[i0] -> Stmt_S1[i0, o1] : 0 <= i0 <= 99 and 0 <= o1 <= 99; Stmt_S2[i0] -> Stmt_S0[1 + i0] : 0 <= i0 <= 98 }
diff --git a/polly/test/DependenceInfo/reduction_simple_privatization_deps_w_parameter.ll b/polly/test/DependenceInfo/reduction_simple_privatization_deps_w_parameter.ll
index a3a87c7..9596827 100644
--- a/polly/test/DependenceInfo/reduction_simple_privatization_deps_w_parameter.ll
+++ b/polly/test/DependenceInfo/reduction_simple_privatization_deps_w_parameter.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
;
; CHECK: RAW dependences:
; CHECK-NEXT: [N] -> { Stmt_S1[i0] -> Stmt_S2[] : N >= 11 and 0 <= i0 <= 1023; Stmt_S0[] -> Stmt_S1[o0] : N >= 11 and 0 <= o0 <= 1023 }
diff --git a/polly/test/DependenceInfo/reduction_two_reductions_different_rloops.ll b/polly/test/DependenceInfo/reduction_two_reductions_different_rloops.ll
index c904629..d67683d 100644
--- a/polly/test/DependenceInfo/reduction_two_reductions_different_rloops.ll
+++ b/polly/test/DependenceInfo/reduction_two_reductions_different_rloops.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-stmt-granularity=bb -polly-print-dependences -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
;
; CHECK: RAW dependences:
; CHECK-NEXT: { }
diff --git a/polly/test/DependenceInfo/sequential_loops.ll b/polly/test/DependenceInfo/sequential_loops.ll
index 8dfa13c..6ae7200 100644
--- a/polly/test/DependenceInfo/sequential_loops.ll
+++ b/polly/test/DependenceInfo/sequential_loops.ll
@@ -1,34 +1,43 @@
-; RUN: opt -S %loadPolly -basic-aa -polly-print-dependences -polly-dependences-analysis-type=value-based -disable-output < %s | FileCheck %s -check-prefix=VALUE
-; RUN: opt -S %loadPolly -basic-aa -polly-print-dependences -polly-dependences-analysis-type=memory-based -disable-output < %s | FileCheck %s -check-prefix=MEMORY
-; RUN: opt -S %loadPolly -basic-aa -polly-print-dependences -polly-dependences-analysis-type=value-based -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s -check-prefix=VALUE_ACCESS
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-dependences-analysis-type=value-based -disable-output < %s | FileCheck %s -check-prefix=VALUE
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-dependences-analysis-type=memory-based -disable-output < %s | FileCheck %s -check-prefix=MEMORY
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-dependences-analysis-type=value-based -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s -check-prefix=VALUE_ACCESS
-; VALUE-LABEL: Printing analysis 'Polly - Calculate dependences' for region: 'S1 => exit.3' in function 'sequential_writes':
-; VALUE-NEXT: RAW dependences:
+; VALUE: RAW dependences:
; VALUE-NEXT: { }
; VALUE-NEXT: WAR dependences:
; VALUE-NEXT: { }
; VALUE-NEXT: WAW dependences:
; VALUE-NEXT: { Stmt_S1[i0] -> Stmt_S3[i0] : 10 <= i0 <= 99; Stmt_S1[i0] -> Stmt_S2[i0] : 0 <= i0 <= 9; Stmt_S2[i0] -> Stmt_S3[i0] : 0 <= i0 <= 9 }
;
-;VALUE_ACCESS-LABEL: Printing analysis 'Polly - Calculate dependences' for region: 'S1 => exit.3' in function 'sequential_writes':
-;VALUE_ACCESS-NEXT: RAW dependences:
-;VALUE_ACCESS-NEXT: { }
-;VALUE_ACCESS-NEXT: WAR dependences:
-;VALUE_ACCESS-NEXT: { }
-;VALUE_ACCESS-NEXT: WAW dependences:
-;VALUE_ACCESS-NEXT: { Stmt_S1[i0] -> Stmt_S3[i0] : 10 <= i0 <= 99; Stmt_S1[i0] -> Stmt_S2[i0] : 0 <= i0 <= 9; [Stmt_S2[i0] -> Stmt_S2_Write0[]] -> [Stmt_S3[i0] -> Stmt_S3_Write0[]] : 0 <= i0 <= 9; Stmt_S2[i0] -> Stmt_S3[i0] : 0 <= i0 <= 9; [Stmt_S1[i0] -> Stmt_S1_Write0[]] -> [Stmt_S2[i0] -> Stmt_S2_Write0[]] : 0 <= i0 <= 9; [Stmt_S1[i0] -> Stmt_S1_Write0[]] -> [Stmt_S3[i0] -> Stmt_S3_Write0[]] : 10 <= i0 <= 99 }
-
-;
-; VALUE-LABEL: Printing analysis 'Polly - Calculate dependences' for region: 'S1 => exit.3' in function 'read_after_writes':
-; VALUE-NEXT: RAW dependences:
+; VALUE: RAW dependences:
; VALUE-NEXT: { Stmt_S2[i0] -> Stmt_S3[i0] : 0 <= i0 <= 9; Stmt_S1[i0] -> Stmt_S3[i0] : 10 <= i0 <= 99 }
; VALUE-NEXT: WAR dependences:
; VALUE-NEXT: { }
; VALUE-NEXT: WAW dependences:
; VALUE-NEXT: { Stmt_S1[i0] -> Stmt_S2[i0] : 0 <= i0 <= 9 }
;
-;VALUE_ACCESS-LABEL: Printing analysis 'Polly - Calculate dependences' for region: 'S1 => exit.3' in function 'read_after_writes':
-;VALUE_ACCESS-NEXT: RAW dependences:
+; VALUE: RAW dependences:
+; VALUE-NEXT: { }
+; VALUE-NEXT: WAR dependences:
+; VALUE-NEXT: { Stmt_S1[i0] -> Stmt_S2[i0] : 0 <= i0 <= 9; Stmt_S1[i0] -> Stmt_S3[i0] : 10 <= i0 <= 99 }
+; VALUE-NEXT: WAW dependences:
+; VALUE-NEXT: { Stmt_S2[i0] -> Stmt_S3[i0] : 0 <= i0 <= 9 }
+;
+; VALUE: RAW dependences:
+; VALUE-NEXT: [p] -> { Stmt_S1[i0] -> Stmt_S2[-p + i0] : i0 >= p and 0 <= i0 <= 99 and i0 <= 9 + p }
+; VALUE-NEXT: WAR dependences:
+; VALUE-NEXT: [p] -> { }
+; VALUE-NEXT: WAW dependences:
+; VALUE-NEXT: [p] -> { }
+;
+;VALUE_ACCESS: RAW dependences:
+;VALUE_ACCESS-NEXT: { }
+;VALUE_ACCESS-NEXT: WAR dependences:
+;VALUE_ACCESS-NEXT: { }
+;VALUE_ACCESS-NEXT: WAW dependences:
+;VALUE_ACCESS-NEXT: { Stmt_S1[i0] -> Stmt_S3[i0] : 10 <= i0 <= 99; Stmt_S1[i0] -> Stmt_S2[i0] : 0 <= i0 <= 9; [Stmt_S2[i0] -> Stmt_S2_Write0[]] -> [Stmt_S3[i0] -> Stmt_S3_Write0[]] : 0 <= i0 <= 9; Stmt_S2[i0] -> Stmt_S3[i0] : 0 <= i0 <= 9; [Stmt_S1[i0] -> Stmt_S1_Write0[]] -> [Stmt_S2[i0] -> Stmt_S2_Write0[]] : 0 <= i0 <= 9; [Stmt_S1[i0] -> Stmt_S1_Write0[]] -> [Stmt_S3[i0] -> Stmt_S3_Write0[]] : 10 <= i0 <= 99 }
+;
+;VALUE_ACCESS: RAW dependences:
;VALUE_ACCESS-NEXT: { Stmt_S1[i0] -> Stmt_S3[i0] : 10 <= i0 <= 99; Stmt_S2[i0] -> Stmt_S3[i0] : 0 <= i0 <= 9; [Stmt_S2[i0] -> Stmt_S2_Write0[]] -> [Stmt_S3[i0] -> Stmt_S3_Read0[]] : 0 <= i0 <= 9; [Stmt_S1[i0] -> Stmt_S1_Write0[]] -> [Stmt_S3[i0] -> Stmt_S3_Read0[]] : 10 <= i0 <= 99 }
;VALUE_ACCESS-NEXT: WAR dependences:
@@ -36,64 +45,42 @@
;VALUE_ACCESS-NEXT: WAW dependences:
;VALUE_ACCESS-NEXT: { [Stmt_S1[i0] -> Stmt_S1_Write0[]] -> [Stmt_S2[i0] -> Stmt_S2_Write0[]] : 0 <= i0 <= 9; Stmt_S1[i0] -> Stmt_S2[i0] : 0 <= i0 <= 9 }
;
-; VALUE-LABEL: Printing analysis 'Polly - Calculate dependences' for region: 'S1 => exit.3' in function 'write_after_read':
-; VALUE-NEXT: RAW dependences:
-; VALUE-NEXT: { }
-; VALUE-NEXT: WAR dependences:
-; VALUE-NEXT: { Stmt_S1[i0] -> Stmt_S2[i0] : 0 <= i0 <= 9; Stmt_S1[i0] -> Stmt_S3[i0] : 10 <= i0 <= 99 }
-; VALUE-NEXT: WAW dependences:
-; VALUE-NEXT: { Stmt_S2[i0] -> Stmt_S3[i0] : 0 <= i0 <= 9 }
-;
-;VALUE_ACCESS-LABEL: Printing analysis 'Polly - Calculate dependences' for region: 'S1 => exit.3' in function 'write_after_read':
-;VALUE_ACCESS-NEXT: RAW dependences:
+;VALUE_ACCESS: RAW dependences:
;VALUE_ACCESS-NEXT: { }
;VALUE_ACCESS-NEXT: WAR dependences:
;VALUE_ACCESS-NEXT: { Stmt_S1[i0] -> Stmt_S2[i0] : 0 <= i0 <= 9; Stmt_S1[i0] -> Stmt_S3[i0] : 10 <= i0 <= 99; [Stmt_S1[i0] -> Stmt_S1_Read0[]] -> [Stmt_S2[i0] -> Stmt_S2_Write0[]] : 0 <= i0 <= 9; [Stmt_S1[i0] -> Stmt_S1_Read0[]] -> [Stmt_S3[i0] -> Stmt_S3_Write0[]] : 10 <= i0 <= 99 }
;VALUE_ACCESS-NEXT: WAW dependences:
;VALUE_ACCESS-NEXT: { Stmt_S2[i0] -> Stmt_S3[i0] : 0 <= i0 <= 9; [Stmt_S2[i0] -> Stmt_S2_Write0[]] -> [Stmt_S3[i0] -> Stmt_S3_Write0[]] : 0 <= i0 <= 9 }
;
-; VALUE-LABEL: Printing analysis 'Polly - Calculate dependences' for region: 'S1 => exit.2' in function 'parametric_offset':
-; VALUE-NEXT: RAW dependences:
-; VALUE-NEXT: [p] -> { Stmt_S1[i0] -> Stmt_S2[-p + i0] : i0 >= p and 0 <= i0 <= 99 and i0 <= 9 + p }
-; VALUE-NEXT: WAR dependences:
-; VALUE-NEXT: [p] -> { }
-; VALUE-NEXT: WAW dependences:
-; VALUE-NEXT: [p] -> { }
-;
-;VALUE_ACCESS-LABEL: Printing analysis 'Polly - Calculate dependences' for region: 'S1 => exit.2' in function 'parametric_offset':
-;VALUE_ACCESS-NEXT: RAW dependences:
+;VALUE_ACCESS: RAW dependences:
;VALUE_ACCESS-NEXT: [p] -> { Stmt_S1[i0] -> Stmt_S2[-p + i0] : i0 >= p and 0 <= i0 <= 99 and i0 <= 9 + p; [Stmt_S1[i0] -> Stmt_S1_Write0[]] -> [Stmt_S2[-p + i0] -> Stmt_S2_Read0[]] : i0 >= p and 0 <= i0 <= 99 and i0 <= 9 + p }
;VALUE_ACCESS-NEXT: WAR dependences:
;VALUE_ACCESS-NEXT: [p] -> { }
;VALUE_ACCESS-NEXT: WAW dependences:
;VALUE_ACCESS-NEXT: [p] -> { }
-; MEMORY-LABEL: Printing analysis 'Polly - Calculate dependences' for region: 'S1 => exit.3' in function 'sequential_writes':
-; MEMORY-NEXT: RAW dependences:
+; MEMORY: RAW dependences:
; MEMORY-NEXT: { }
; MEMORY-NEXT: WAR dependences:
; MEMORY-NEXT: { }
; MEMORY-NEXT: WAW dependences:
; MEMORY-NEXT: { Stmt_S1[i0] -> Stmt_S3[i0] : 0 <= i0 <= 99; Stmt_S1[i0] -> Stmt_S2[i0] : 0 <= i0 <= 9; Stmt_S2[i0] -> Stmt_S3[i0] : 0 <= i0 <= 9 }
;
-; MEMORY-LABEL: Printing analysis 'Polly - Calculate dependences' for region: 'S1 => exit.3' in function 'read_after_writes':
-; MEMORY-NEXT: RAW dependences:
+; MEMORY: RAW dependences:
; MEMORY-NEXT: { Stmt_S2[i0] -> Stmt_S3[i0] : 0 <= i0 <= 9; Stmt_S1[i0] -> Stmt_S3[i0] : 0 <= i0 <= 99 }
; MEMORY-NEXT: WAR dependences:
; MEMORY-NEXT: { }
; MEMORY-NEXT: WAW dependences:
; MEMORY-NEXT: { Stmt_S1[i0] -> Stmt_S2[i0] : 0 <= i0 <= 9 }
;
-; MEMORY-LABEL: Printing analysis 'Polly - Calculate dependences' for region: 'S1 => exit.3' in function 'write_after_read':
-; MEMORY-NEXT: RAW dependences:
+; MEMORY: RAW dependences:
; MEMORY-NEXT: { }
; MEMORY-NEXT: WAR dependences:
; MEMORY-NEXT: { Stmt_S1[i0] -> Stmt_S2[i0] : 0 <= i0 <= 9; Stmt_S1[i0] -> Stmt_S3[i0] : 0 <= i0 <= 99 }
; MEMORY-NEXT: WAW dependences:
; MEMORY-NEXT: { Stmt_S2[i0] -> Stmt_S3[i0] : 0 <= i0 <= 9 }
;
-; MEMORY-LABEL: Printing analysis 'Polly - Calculate dependences' for region: 'S1 => exit.2' in function 'parametric_offset':
-; MEMORY-NEXT: RAW dependences:
+; MEMORY: RAW dependences:
; MEMORY-NEXT: [p] -> { Stmt_S1[i0] -> Stmt_S2[-p + i0] : i0 >= p and 0 <= i0 <= 99 and i0 <= 9 + p }
; MEMORY-NEXT: WAR dependences:
; MEMORY-NEXT: [p] -> { }
diff --git a/polly/test/ForwardOpTree/atax.ll b/polly/test/ForwardOpTree/atax.ll
index 0690c1b..496e8315 100644
--- a/polly/test/ForwardOpTree/atax.ll
+++ b/polly/test/ForwardOpTree/atax.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-optree-normalize-phi=true -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-optree-normalize-phi=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/ForwardOpTree/changed-kind.ll b/polly/test/ForwardOpTree/changed-kind.ll
index a1d5982..b9081f37 100644
--- a/polly/test/ForwardOpTree/changed-kind.ll
+++ b/polly/test/ForwardOpTree/changed-kind.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
; In the code below, %0 is known to be equal to the content of @c (constant 0).
; Thus, in order to save a scalar dependency, forward-optree replaces
diff --git a/polly/test/ForwardOpTree/forward_from_region.ll b/polly/test/ForwardOpTree/forward_from_region.ll
index 53d2280..767a580 100644
--- a/polly/test/ForwardOpTree/forward_from_region.ll
+++ b/polly/test/ForwardOpTree/forward_from_region.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Move instructions from region statements.
;
diff --git a/polly/test/ForwardOpTree/forward_hoisted.ll b/polly/test/ForwardOpTree/forward_hoisted.ll
index 32fca00..5d0b0a8 100644
--- a/polly/test/ForwardOpTree/forward_hoisted.ll
+++ b/polly/test/ForwardOpTree/forward_hoisted.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-invariant-load-hoisting=true -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Move %val to %bodyB, so %bodyA can be removed (by -polly-simplify).
; This involves making the load-hoisted %val1 available in %bodyB.
diff --git a/polly/test/ForwardOpTree/forward_instruction.ll b/polly/test/ForwardOpTree/forward_instruction.ll
index 1dcd643..50a9b07 100644
--- a/polly/test/ForwardOpTree/forward_instruction.ll
+++ b/polly/test/ForwardOpTree/forward_instruction.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Move %val to %bodyB, so %bodyA can be removed (by -polly-simplify)
;
diff --git a/polly/test/ForwardOpTree/forward_into_region.ll b/polly/test/ForwardOpTree/forward_into_region.ll
index dd18cfe..ef71b11 100644
--- a/polly/test/ForwardOpTree/forward_into_region.ll
+++ b/polly/test/ForwardOpTree/forward_into_region.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Move instructions to region statements.
;
diff --git a/polly/test/ForwardOpTree/forward_into_region_redundant_use.ll b/polly/test/ForwardOpTree/forward_into_region_redundant_use.ll
index e5458c0..1c58544 100644
--- a/polly/test/ForwardOpTree/forward_into_region_redundant_use.ll
+++ b/polly/test/ForwardOpTree/forward_into_region_redundant_use.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-invariant-load-hoisting=true -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
define void @foo(ptr %A, i32 %p, ptr %B) {
diff --git a/polly/test/ForwardOpTree/forward_load.ll b/polly/test/ForwardOpTree/forward_load.ll
index 86e3cb0..0bba4183 100644
--- a/polly/test/ForwardOpTree/forward_load.ll
+++ b/polly/test/ForwardOpTree/forward_load.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-optree>)" -disable-output < %s | FileCheck %s -match-full-lines
;
; Rematerialize a load.
diff --git a/polly/test/ForwardOpTree/forward_load_differentarray.ll b/polly/test/ForwardOpTree/forward_load_differentarray.ll
index 786277b..364bf3e 100644
--- a/polly/test/ForwardOpTree/forward_load_differentarray.ll
+++ b/polly/test/ForwardOpTree/forward_load_differentarray.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; To forward %val, B[j] cannot be reused in bodyC because it is overwritten
; between. Verify that instead the alternative C[j] is used.
diff --git a/polly/test/ForwardOpTree/forward_load_double_write.ll b/polly/test/ForwardOpTree/forward_load_double_write.ll
index 1618722..4c30c7f 100644
--- a/polly/test/ForwardOpTree/forward_load_double_write.ll
+++ b/polly/test/ForwardOpTree/forward_load_double_write.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Rematerialize a load even in case two writes of identical values are in
; one scop statement.
diff --git a/polly/test/ForwardOpTree/forward_load_fromloop.ll b/polly/test/ForwardOpTree/forward_load_fromloop.ll
index 8f08a13..1494e87 100644
--- a/polly/test/ForwardOpTree/forward_load_fromloop.ll
+++ b/polly/test/ForwardOpTree/forward_load_fromloop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Forward the LoadInst %val into %bodyB. %val is executed multiple times,
; we must get the last loaded values.
diff --git a/polly/test/ForwardOpTree/forward_load_indirect.ll b/polly/test/ForwardOpTree/forward_load_indirect.ll
index f83af61..51ce94d 100644
--- a/polly/test/ForwardOpTree/forward_load_indirect.ll
+++ b/polly/test/ForwardOpTree/forward_load_indirect.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Forward an operand tree consisting of a speculatable instruction (%add)
; and a load (%val).
diff --git a/polly/test/ForwardOpTree/forward_load_memset_after.ll b/polly/test/ForwardOpTree/forward_load_memset_after.ll
index 13797a4..bd2cad4 100644
--- a/polly/test/ForwardOpTree/forward_load_memset_after.ll
+++ b/polly/test/ForwardOpTree/forward_load_memset_after.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Rematerialize a load in the presence of a non-store WRITE access.
;
diff --git a/polly/test/ForwardOpTree/forward_load_memset_before.ll b/polly/test/ForwardOpTree/forward_load_memset_before.ll
index 60b1e07..3e89dea 100644
--- a/polly/test/ForwardOpTree/forward_load_memset_before.ll
+++ b/polly/test/ForwardOpTree/forward_load_memset_before.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Rematerialize a load in the presence of a non-store WRITE access.
;
diff --git a/polly/test/ForwardOpTree/forward_load_tripleuse.ll b/polly/test/ForwardOpTree/forward_load_tripleuse.ll
index 1d0df2a..7526a83 100644
--- a/polly/test/ForwardOpTree/forward_load_tripleuse.ll
+++ b/polly/test/ForwardOpTree/forward_load_tripleuse.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-optree -polly-codegen -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-optree>,polly-codegen' -disable-output < %s | FileCheck %s -match-full-lines
;
; %val1 is used three times: Twice by its own operand tree of %val2 and once
; more by the store in %bodyB.
diff --git a/polly/test/ForwardOpTree/forward_load_unrelatedunusual.ll b/polly/test/ForwardOpTree/forward_load_unrelatedunusual.ll
index b7bae56..daf289d 100644
--- a/polly/test/ForwardOpTree/forward_load_unrelatedunusual.ll
+++ b/polly/test/ForwardOpTree/forward_load_unrelatedunusual.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Rematerialize a load.
; The non-analyzable store to C[0] is unrelated and can be ignored.
diff --git a/polly/test/ForwardOpTree/forward_phi_load.ll b/polly/test/ForwardOpTree/forward_phi_load.ll
index 0b0bb20..1457aa9 100644
--- a/polly/test/ForwardOpTree/forward_phi_load.ll
+++ b/polly/test/ForwardOpTree/forward_phi_load.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-optree-normalize-phi=true -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-optree-normalize-phi=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Rematerialize a load.
;
diff --git a/polly/test/ForwardOpTree/forward_readonly.ll b/polly/test/ForwardOpTree/forward_readonly.ll
index a29c5bf..646121c 100644
--- a/polly/test/ForwardOpTree/forward_readonly.ll
+++ b/polly/test/ForwardOpTree/forward_readonly.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-analyze-read-only-scalars=true -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines -check-prefixes=STATS,MODEL
-; RUN: opt %loadPolly -polly-analyze-read-only-scalars=false -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines -check-prefixes=STATS,NOMODEL
+; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines -check-prefixes=STATS,MODEL
+; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=false '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines -check-prefixes=STATS,NOMODEL
;
; Move %val to %bodyB, so %bodyA can be removed (by -polly-simplify)
;
diff --git a/polly/test/ForwardOpTree/forward_reusue.ll b/polly/test/ForwardOpTree/forward_reusue.ll
index ead8c73..d8ad317 100644
--- a/polly/test/ForwardOpTree/forward_reusue.ll
+++ b/polly/test/ForwardOpTree/forward_reusue.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Move operand tree without duplicating values used multiple times.
;
diff --git a/polly/test/ForwardOpTree/forward_store.ll b/polly/test/ForwardOpTree/forward_store.ll
index a6369eb..17cb8b3 100644
--- a/polly/test/ForwardOpTree/forward_store.ll
+++ b/polly/test/ForwardOpTree/forward_store.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Rematerialize a load.
;
diff --git a/polly/test/ForwardOpTree/forward_synthesizable_definloop.ll b/polly/test/ForwardOpTree/forward_synthesizable_definloop.ll
index f0da932..57b6818 100644
--- a/polly/test/ForwardOpTree/forward_synthesizable_definloop.ll
+++ b/polly/test/ForwardOpTree/forward_synthesizable_definloop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Copy %val to bodyB, assuming the exit value of %i.
;
diff --git a/polly/test/ForwardOpTree/forward_synthesizable_indvar.ll b/polly/test/ForwardOpTree/forward_synthesizable_indvar.ll
index a38ab54..b4828e4 100644
--- a/polly/test/ForwardOpTree/forward_synthesizable_indvar.ll
+++ b/polly/test/ForwardOpTree/forward_synthesizable_indvar.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Test support for (synthesizable) induction variables.
;
diff --git a/polly/test/ForwardOpTree/forward_synthesizable_useinloop.ll b/polly/test/ForwardOpTree/forward_synthesizable_useinloop.ll
index bb1760a..3228bb6 100644
--- a/polly/test/ForwardOpTree/forward_synthesizable_useinloop.ll
+++ b/polly/test/ForwardOpTree/forward_synthesizable_useinloop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Synthesizable values defined outside of a loop can be used
; inside the loop.
diff --git a/polly/test/ForwardOpTree/forward_transitive.ll b/polly/test/ForwardOpTree/forward_transitive.ll
index 2438894..aacf135 100644
--- a/polly/test/ForwardOpTree/forward_transitive.ll
+++ b/polly/test/ForwardOpTree/forward_transitive.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Move %v and %val to %bodyB, so %bodyA can be removed (by -polly-simplify)
;
diff --git a/polly/test/ForwardOpTree/jacobi-1d.ll b/polly/test/ForwardOpTree/jacobi-1d.ll
index 05ccd99..c9c71a1 100644
--- a/polly/test/ForwardOpTree/jacobi-1d.ll
+++ b/polly/test/ForwardOpTree/jacobi-1d.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-optree-normalize-phi=true -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-optree-normalize-phi=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/ForwardOpTree/noforward_from_region.ll b/polly/test/ForwardOpTree/noforward_from_region.ll
index 3015091..bd5864c 100644
--- a/polly/test/ForwardOpTree/noforward_from_region.ll
+++ b/polly/test/ForwardOpTree/noforward_from_region.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Ensure we do not move instructions from region statements in case the
; instruction to move loads from an array which is also written to from
diff --git a/polly/test/ForwardOpTree/noforward_load_conditional.ll b/polly/test/ForwardOpTree/noforward_load_conditional.ll
index eaa0fc5..5474e74 100644
--- a/polly/test/ForwardOpTree/noforward_load_conditional.ll
+++ b/polly/test/ForwardOpTree/noforward_load_conditional.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; B[j] is overwritten by at least one statement between the
; definition of %val and its use. Hence, it cannot be forwarded.
diff --git a/polly/test/ForwardOpTree/noforward_load_writebetween.ll b/polly/test/ForwardOpTree/noforward_load_writebetween.ll
index e2272c1..697c940 100644
--- a/polly/test/ForwardOpTree/noforward_load_writebetween.ll
+++ b/polly/test/ForwardOpTree/noforward_load_writebetween.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Cannot rematerialize %val from B[0] at bodyC because B[0] has been
; overwritten in bodyB.
diff --git a/polly/test/ForwardOpTree/noforward_outofquota.ll b/polly/test/ForwardOpTree/noforward_outofquota.ll
index 2ec965d..306bb8d 100644
--- a/polly/test/ForwardOpTree/noforward_outofquota.ll
+++ b/polly/test/ForwardOpTree/noforward_outofquota.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-optree-max-ops=1 -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
-; RUN: opt %loadPolly -polly-optree-max-ops=1 -polly-optree -disable-output -stats < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=STATS
+; RUN: opt %loadNPMPolly -polly-optree-max-ops=1 '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-optree-max-ops=1 -passes=polly-optree -disable-output -stats < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=STATS
; REQUIRES: asserts
;
; for (int j = 0; j < n; j += 1) {
diff --git a/polly/test/ForwardOpTree/noforward_partial.ll b/polly/test/ForwardOpTree/noforward_partial.ll
index 127ac9f..edb5d34 100644
--- a/polly/test/ForwardOpTree/noforward_partial.ll
+++ b/polly/test/ForwardOpTree/noforward_partial.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Not the entire operand tree can be forwarded,
; some scalar dependencies would remain.
diff --git a/polly/test/ForwardOpTree/noforward_phi.ll b/polly/test/ForwardOpTree/noforward_phi.ll
index 58d41a4..755abad 100644
--- a/polly/test/ForwardOpTree/noforward_phi.ll
+++ b/polly/test/ForwardOpTree/noforward_phi.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Do not move PHI nodes.
;
diff --git a/polly/test/ForwardOpTree/noforward_selfrefphi.ll b/polly/test/ForwardOpTree/noforward_selfrefphi.ll
index b2d4dc5..be7e82f 100644
--- a/polly/test/ForwardOpTree/noforward_selfrefphi.ll
+++ b/polly/test/ForwardOpTree/noforward_selfrefphi.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-optree-normalize-phi=true -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-optree-normalize-phi=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Contains a self-referencing PHINode that would require a
; transitive closure to handle.
diff --git a/polly/test/ForwardOpTree/noforward_sideffects.ll b/polly/test/ForwardOpTree/noforward_sideffects.ll
index a563376..c01b72a 100644
--- a/polly/test/ForwardOpTree/noforward_sideffects.ll
+++ b/polly/test/ForwardOpTree/noforward_sideffects.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Do not forward instructions with side-effects (here: function call).
;
diff --git a/polly/test/ForwardOpTree/noforward_synthesizable_unknownit.ll b/polly/test/ForwardOpTree/noforward_synthesizable_unknownit.ll
index f589fde..776d848 100644
--- a/polly/test/ForwardOpTree/noforward_synthesizable_unknownit.ll
+++ b/polly/test/ForwardOpTree/noforward_synthesizable_unknownit.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Do not try to forward %i.trunc, it is not synthesizable in %body.
;
diff --git a/polly/test/ForwardOpTree/out-of-quota1.ll b/polly/test/ForwardOpTree/out-of-quota1.ll
index 7afdb8e..ee3e326 100644
--- a/polly/test/ForwardOpTree/out-of-quota1.ll
+++ b/polly/test/ForwardOpTree/out-of-quota1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output %s | FileCheck %s
; This used to loop infinitely because of UINT_MAX returned by ISL on out-of-quota.
diff --git a/polly/test/IstAstInfo/alias_checks_with_empty_context.ll b/polly/test/IstAstInfo/alias_checks_with_empty_context.ll
index 9b95cd5..81c2953 100644
--- a/polly/test/IstAstInfo/alias_checks_with_empty_context.ll
+++ b/polly/test/IstAstInfo/alias_checks_with_empty_context.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s \
; RUN: | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/IstAstInfo/alias_simple_1.ll b/polly/test/IstAstInfo/alias_simple_1.ll
index 83d470c..904f55d 100644
--- a/polly/test/IstAstInfo/alias_simple_1.ll
+++ b/polly/test/IstAstInfo/alias_simple_1.ll
@@ -1,8 +1,8 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-basic-aa -disable-output < %s | FileCheck %s --check-prefix=NOAA
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=BASI
-; RUN: opt %loadPolly -polly-print-ast -disable-basic-aa -tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA
-; RUN: opt %loadPolly -polly-print-ast -disable-basic-aa -scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV
-; RUN: opt %loadPolly -polly-print-ast -disable-basic-aa -globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s --check-prefix=BASI
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB
;
; int A[1024];
;
diff --git a/polly/test/IstAstInfo/alias_simple_2.ll b/polly/test/IstAstInfo/alias_simple_2.ll
index bbf528f..5fae579 100644
--- a/polly/test/IstAstInfo/alias_simple_2.ll
+++ b/polly/test/IstAstInfo/alias_simple_2.ll
@@ -1,9 +1,9 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-basic-aa -disable-output < %s | FileCheck %s --check-prefix=NOAA
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=BASI
-; RUN: opt %loadPolly -polly-print-ast -disable-basic-aa -tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA
-; RUN: opt %loadPolly -polly-print-ast -disable-basic-aa -scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV
-; RUN: opt %loadPolly -polly-print-ast -disable-basic-aa -globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB
-; RUN: opt %loadPolly -polly-print-ast -disable-basic-aa -globals-aa -polly-allow-nonaffine -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s --check-prefix=BASI
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=globals-aa -polly-allow-nonaffine -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE
;
; int A[1024], B[1024];
;
diff --git a/polly/test/IstAstInfo/alias_simple_3.ll b/polly/test/IstAstInfo/alias_simple_3.ll
index 9067521..8599c29 100644
--- a/polly/test/IstAstInfo/alias_simple_3.ll
+++ b/polly/test/IstAstInfo/alias_simple_3.ll
@@ -1,8 +1,8 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-basic-aa -disable-output < %s | FileCheck %s --check-prefix=NOAA
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=BASI
-; RUN: opt %loadPolly -polly-print-ast -disable-basic-aa -tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA
-; RUN: opt %loadPolly -polly-print-ast -disable-basic-aa -scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV
-; RUN: opt %loadPolly -polly-print-ast -disable-basic-aa -globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s --check-prefix=BASI
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB
;
; int A[1024];
; float B[1024];
diff --git a/polly/test/IstAstInfo/aliasing_arrays_with_identical_base.ll b/polly/test/IstAstInfo/aliasing_arrays_with_identical_base.ll
index 0cabd20..dc21dc1 100644
--- a/polly/test/IstAstInfo/aliasing_arrays_with_identical_base.ll
+++ b/polly/test/IstAstInfo/aliasing_arrays_with_identical_base.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s \
; RUN: -polly-invariant-load-hoisting \
; RUN: | FileCheck %s
diff --git a/polly/test/IstAstInfo/aliasing_multiple_alias_groups.ll b/polly/test/IstAstInfo/aliasing_multiple_alias_groups.ll
index b824c21..8d4adfa 100644
--- a/polly/test/IstAstInfo/aliasing_multiple_alias_groups.ll
+++ b/polly/test/IstAstInfo/aliasing_multiple_alias_groups.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=NOAA
-; RUN: opt %loadPolly -polly-print-ast -tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA
;
; void jd(int *Int0, int *Int1, float *Float0, float *Float1) {
; for (int i = 0; i < 1024; i++) {
diff --git a/polly/test/IstAstInfo/aliasing_parametric_simple_1.ll b/polly/test/IstAstInfo/aliasing_parametric_simple_1.ll
index e0c3255..be37b27 100644
--- a/polly/test/IstAstInfo/aliasing_parametric_simple_1.ll
+++ b/polly/test/IstAstInfo/aliasing_parametric_simple_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output %s | FileCheck %s
;
; void jd(int *A, int *B, int c) {
; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/IstAstInfo/aliasing_parametric_simple_2.ll b/polly/test/IstAstInfo/aliasing_parametric_simple_2.ll
index 74bad6c..1555058 100644
--- a/polly/test/IstAstInfo/aliasing_parametric_simple_2.ll
+++ b/polly/test/IstAstInfo/aliasing_parametric_simple_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
;
; void jd(int *A, int *B, int c) {
; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/IstAstInfo/dependence_distance_minimal.ll b/polly/test/IstAstInfo/dependence_distance_minimal.ll
index c6b1d15..d69cc3f 100644
--- a/polly/test/IstAstInfo/dependence_distance_minimal.ll
+++ b/polly/test/IstAstInfo/dependence_distance_minimal.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
;
; The minimal dependence distance of the innermost loop should be 1 instead of 250.
; CHECK: #pragma minimal dependence distance: 1
diff --git a/polly/test/IstAstInfo/domain_bounded_only_with_context.ll b/polly/test/IstAstInfo/domain_bounded_only_with_context.ll
index 32cebd7..e2cf0bd 100644
--- a/polly/test/IstAstInfo/domain_bounded_only_with_context.ll
+++ b/polly/test/IstAstInfo/domain_bounded_only_with_context.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
; CHECK: {
; CHECK-NEXT: if (p <= -1 || p >= 1)
diff --git a/polly/test/IstAstInfo/non_affine_access.ll b/polly/test/IstAstInfo/non_affine_access.ll
index d8757b2..98e8d2d 100644
--- a/polly/test/IstAstInfo/non_affine_access.ll
+++ b/polly/test/IstAstInfo/non_affine_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -polly-ast-print-accesses -polly-allow-nonaffine -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-print-accesses -polly-allow-nonaffine -disable-output < %s | FileCheck %s
;
; void non_affine_access(float A[]) {
; for (long i = 0; i < 1024; i++)
diff --git a/polly/test/IstAstInfo/reduction_clauses_onedimensional_access.ll b/polly/test/IstAstInfo/reduction_clauses_onedimensional_access.ll
index 8d52e34..c20a7d6 100644
--- a/polly/test/IstAstInfo/reduction_clauses_onedimensional_access.ll
+++ b/polly/test/IstAstInfo/reduction_clauses_onedimensional_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
;
; CHECK: #pragma known-parallel reduction (^ : MemRef_sum)
; void f(int N, int M, int *sum) {
diff --git a/polly/test/IstAstInfo/reduction_dependences_equal_non_reduction_dependences.ll b/polly/test/IstAstInfo/reduction_dependences_equal_non_reduction_dependences.ll
index 9c6eea6..e6092f0 100644
--- a/polly/test/IstAstInfo/reduction_dependences_equal_non_reduction_dependences.ll
+++ b/polly/test/IstAstInfo/reduction_dependences_equal_non_reduction_dependences.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
;
; This loopnest contains a reduction which imposes the same dependences as the
; accesses to the array A. We need to ensure we do __not__ parallelize anything
diff --git a/polly/test/IstAstInfo/reduction_different_reduction_clauses.ll b/polly/test/IstAstInfo/reduction_different_reduction_clauses.ll
index 5104f71..14de70f 100644
--- a/polly/test/IstAstInfo/reduction_different_reduction_clauses.ll
+++ b/polly/test/IstAstInfo/reduction_different_reduction_clauses.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
;
; CHECK: #pragma simd reduction (+ : MemRef_sum{{[1,2]}}, MemRef_sum{{[1,2]}}) reduction (* : MemRef_prod) reduction (| : MemRef_or) reduction (& : MemRef_and)
; CHECK: #pragma known-parallel reduction (+ : MemRef_sum{{[1,2]}}, MemRef_sum{{[1,2]}}) reduction (* : MemRef_prod) reduction (| : MemRef_or) reduction (& : MemRef_and)
diff --git a/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule.ll b/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule.ll
index 8a42cf8..15fca88 100644
--- a/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule.ll
+++ b/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
;
; CHECK: #pragma known-parallel reduction (+ : MemRef_A)
; CHECK-NEXT: for (int c0 = 0; c0 <= 2; c0 += 1) {
diff --git a/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule_2.ll b/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule_2.ll
index 8f5efd1..44e9aa4 100644
--- a/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule_2.ll
+++ b/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
;
; CHECK: #pragma known-parallel reduction
; CHECK: for (int c0 = 0; c0 <= 2; c0 += 1) {
diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions.ll
index a711a36..2667535 100644
--- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions.ll
+++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
;
; CHECK: #pragma known-parallel
; CHECK: for (int c0 = 0; c0 <= 1; c0 += 1)
diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_2.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_2.ll
index 485d696..46b2559 100644
--- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_2.ll
+++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
;
; Verify that the outer dimension doesn't carry reduction dependences
;
diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_3.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_3.ll
index 375fabb..6f40ee9 100644
--- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_3.ll
+++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
;
; Verify that the outer dimension doesn't carry reduction dependences
;
diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_4.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_4.ll
index 584c076..f82b956 100644
--- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_4.ll
+++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_4.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
;
; Verify that the outer dimension doesn't carry reduction dependences
;
diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_5.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_5.ll
index eaa3444..b889db4 100644
--- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_5.ll
+++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_5.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
;
; Verify that only the outer dimension needs privatization
;
diff --git a/polly/test/IstAstInfo/reduction_multiple_dimensions.ll b/polly/test/IstAstInfo/reduction_multiple_dimensions.ll
index 9618ec8..2a8fd7a 100644
--- a/polly/test/IstAstInfo/reduction_multiple_dimensions.ll
+++ b/polly/test/IstAstInfo/reduction_multiple_dimensions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
;
; CHECK-NOT:#pragma known-parallel reduction
; CHECK: #pragma known-parallel
diff --git a/polly/test/IstAstInfo/reduction_multiple_dimensions_2.ll b/polly/test/IstAstInfo/reduction_multiple_dimensions_2.ll
index af31757..25f2fa5 100644
--- a/polly/test/IstAstInfo/reduction_multiple_dimensions_2.ll
+++ b/polly/test/IstAstInfo/reduction_multiple_dimensions_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
;
; CHECK-NOT:#pragma known-parallel reduction
; CHECK: #pragma known-parallel
diff --git a/polly/test/IstAstInfo/reduction_multiple_dimensions_3.ll b/polly/test/IstAstInfo/reduction_multiple_dimensions_3.ll
index 1f71914..0d6be9a 100644
--- a/polly/test/IstAstInfo/reduction_multiple_dimensions_3.ll
+++ b/polly/test/IstAstInfo/reduction_multiple_dimensions_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
;
; CHECK-NOT:#pragma known-parallel reduction
; CHECK: #pragma known-parallel
diff --git a/polly/test/IstAstInfo/reduction_multiple_dimensions_4.ll b/polly/test/IstAstInfo/reduction_multiple_dimensions_4.ll
index 40bae5e..8b53751 100644
--- a/polly/test/IstAstInfo/reduction_multiple_dimensions_4.ll
+++ b/polly/test/IstAstInfo/reduction_multiple_dimensions_4.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
;
; CHECK-NOT:#pragma known-parallel reduction
; CHECK: #pragma known-parallel
diff --git a/polly/test/IstAstInfo/run-time-condition.ll b/polly/test/IstAstInfo/run-time-condition.ll
index ccc9c7c..44d3534 100644
--- a/polly/test/IstAstInfo/run-time-condition.ll
+++ b/polly/test/IstAstInfo/run-time-condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
; for (i = 0; i < 1024; i++)
; A[i] = B[i];
diff --git a/polly/test/IstAstInfo/runtime_context_with_error_blocks.ll b/polly/test/IstAstInfo/runtime_context_with_error_blocks.ll
index 2853e0a..8c3f230 100644
--- a/polly/test/IstAstInfo/runtime_context_with_error_blocks.ll
+++ b/polly/test/IstAstInfo/runtime_context_with_error_blocks.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
;
; Verify we do not simplify the runtime check to "true" due to the domain
; constraints as the test contains an error block that influenced the domains
diff --git a/polly/test/IstAstInfo/simple-run-time-condition.ll b/polly/test/IstAstInfo/simple-run-time-condition.ll
index 5fb99f0..488cd18 100644
--- a/polly/test/IstAstInfo/simple-run-time-condition.ll
+++ b/polly/test/IstAstInfo/simple-run-time-condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -polly-precise-inbounds -polly-precise-fold-accesses -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-precise-inbounds -polly-precise-fold-accesses -disable-output < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
diff --git a/polly/test/IstAstInfo/single_loop_strip_mine.ll b/polly/test/IstAstInfo/single_loop_strip_mine.ll
index 1c627f8..afe6179 100644
--- a/polly/test/IstAstInfo/single_loop_strip_mine.ll
+++ b/polly/test/IstAstInfo/single_loop_strip_mine.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -basic-aa -polly-import-jscop -polly-ast-print-accesses -polly-ast-detect-parallel -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=CHECK-VECTOR
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-ast-print-accesses -polly-ast-detect-parallel '-passes=polly-import-jscop,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=CHECK-VECTOR
; for (i = 0; i < 1024; i++)
; A[i] = B[i];
diff --git a/polly/test/IstAstInfo/single_loop_uint_max_iterations.ll b/polly/test/IstAstInfo/single_loop_uint_max_iterations.ll
index f1cd5da..f614f90 100644
--- a/polly/test/IstAstInfo/single_loop_uint_max_iterations.ll
+++ b/polly/test/IstAstInfo/single_loop_uint_max_iterations.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
; XFAIL: *
;#include "limits.h"
diff --git a/polly/test/IstAstInfo/single_loop_ull_max_iterations.ll b/polly/test/IstAstInfo/single_loop_ull_max_iterations.ll
index d421e22..e91ea13 100644
--- a/polly/test/IstAstInfo/single_loop_ull_max_iterations.ll
+++ b/polly/test/IstAstInfo/single_loop_ull_max_iterations.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
; XFAIL: *
;#include "limits.h"
diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Bad-relation.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Bad-relation.ll
index d4a1a62..49a9625 100644
--- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Bad-relation.ll
+++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Bad-relation.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
;
; CHECK: expecting other token
;
diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-No-accesses-key.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-No-accesses-key.ll
index 43f9d3e..749b962 100644
--- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-No-accesses-key.ll
+++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-No-accesses-key.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
;
; CHECK: Statement from JScop file has no key name 'accesses' for index 1.
;
diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-MemAcc.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-MemAcc.ll
index 24ad037..1d97e3e 100644
--- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-MemAcc.ll
+++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-MemAcc.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
;
; CHECK: The number of memory accesses in the JSop file and the number of memory accesses differ for index 0.
;
diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-statements.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-statements.ll
index 1060926..f4b7393 100644
--- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-statements.ll
+++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-statements.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
;
; CHECK: The number of indices and the number of statements differ.
;
diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Relation-mispelled.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Relation-mispelled.ll
index 0797597..1f5cda35 100644
--- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Relation-mispelled.ll
+++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Relation-mispelled.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
;
; CHECK: Memory access number 0 has no key name 'relation' for statement number 1.
;
diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Statements-mispelled.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Statements-mispelled.ll
index 9f72596..0c75084 100644
--- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Statements-mispelled.ll
+++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Statements-mispelled.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
;
; CHECK: JScop file has no key name 'statements'.
;
diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Undeclared-ScopArrayInfo.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Undeclared-ScopArrayInfo.ll
index df7eb42..d8c9c3f 100644
--- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Undeclared-ScopArrayInfo.ll
+++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Undeclared-ScopArrayInfo.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
;
; CHECK: JScop file contains access function with undeclared ScopArrayInfo
;
diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Wrong-number-dimensions.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Wrong-number-dimensions.ll
index 61c1173..f8d7cb8 100644
--- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Wrong-number-dimensions.ll
+++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Wrong-number-dimensions.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
;
; CHECK: JScop file changes the number of parameter dimensions.
;
diff --git a/polly/test/JSONExporter/ImportArrays/ImportArrays-Mispelled-type.ll b/polly/test/JSONExporter/ImportArrays/ImportArrays-Mispelled-type.ll
index a14ae5c..6e13a5e 100644
--- a/polly/test/JSONExporter/ImportArrays/ImportArrays-Mispelled-type.ll
+++ b/polly/test/JSONExporter/ImportArrays/ImportArrays-Mispelled-type.ll
@@ -1,4 +1,4 @@
- ; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
+ ; RUN: not --crash opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Array has not a valid type.
;
diff --git a/polly/test/JSONExporter/ImportArrays/ImportArrays-Negative-size.ll b/polly/test/JSONExporter/ImportArrays/ImportArrays-Negative-size.ll
index 2a03197..7f65787 100644
--- a/polly/test/JSONExporter/ImportArrays/ImportArrays-Negative-size.ll
+++ b/polly/test/JSONExporter/ImportArrays/ImportArrays-Negative-size.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly -polly-stmt-granularity=bb -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
;
; #define Ni 1056
; #define Nj 1056
diff --git a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-name.ll b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-name.ll
index 45bb349..e698bdc 100644
--- a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-name.ll
+++ b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-name.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Array has no key 'name'.
;
diff --git a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-sizes-key.ll b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-sizes-key.ll
index 5bbb974..f130b65 100644
--- a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-sizes-key.ll
+++ b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-sizes-key.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Array has no key 'sizes'.
;
diff --git a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-type-key.ll b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-type-key.ll
index af01399..68d2e50 100644
--- a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-type-key.ll
+++ b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-type-key.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Array has no key 'type'.
;
diff --git a/polly/test/JSONExporter/ImportContext/ImportContext-Context-mispelled.ll b/polly/test/JSONExporter/ImportContext/ImportContext-Context-mispelled.ll
index 2490e44..94c77dc 100644
--- a/polly/test/JSONExporter/ImportContext/ImportContext-Context-mispelled.ll
+++ b/polly/test/JSONExporter/ImportContext/ImportContext-Context-mispelled.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
;
; CHECK: JScop file has no key named 'context'.
;
diff --git a/polly/test/JSONExporter/ImportContext/ImportContext-Not-parameter-set.ll b/polly/test/JSONExporter/ImportContext/ImportContext-Not-parameter-set.ll
index 66ce6a6..c20d5c0 100644
--- a/polly/test/JSONExporter/ImportContext/ImportContext-Not-parameter-set.ll
+++ b/polly/test/JSONExporter/ImportContext/ImportContext-Not-parameter-set.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
;
; CHECK: The isl_set is not a parameter set.
;
diff --git a/polly/test/JSONExporter/ImportContext/ImportContext-Unvalid-Context.ll b/polly/test/JSONExporter/ImportContext/ImportContext-Unvalid-Context.ll
index 7bcc54d..92f4d61 100644
--- a/polly/test/JSONExporter/ImportContext/ImportContext-Unvalid-Context.ll
+++ b/polly/test/JSONExporter/ImportContext/ImportContext-Unvalid-Context.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
;
; CHECK: unexpected isl_token
;
diff --git a/polly/test/JSONExporter/ImportContext/ImportContext-Wrong-dimension.ll b/polly/test/JSONExporter/ImportContext/ImportContext-Wrong-dimension.ll
index 65cdcbd..89668d8 100644
--- a/polly/test/JSONExporter/ImportContext/ImportContext-Wrong-dimension.ll
+++ b/polly/test/JSONExporter/ImportContext/ImportContext-Wrong-dimension.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
;
; CHECK: Imported context has the wrong number of parameters : Found 2 Expected 1
;
diff --git a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-No-schedule-key.ll b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-No-schedule-key.ll
index b52db08..efe15c1 100644
--- a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-No-schedule-key.ll
+++ b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-No-schedule-key.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
;
; CHECK: Statement 0 has no 'schedule' key.
;
diff --git a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Schedule-not-valid.ll b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Schedule-not-valid.ll
index 5ce3ad2..db516f6 100644
--- a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Schedule-not-valid.ll
+++ b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Schedule-not-valid.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
;
; CHECK: expecting other token
;
diff --git a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Statements-mispelled.ll b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Statements-mispelled.ll
index 4329653..b93c984 100644
--- a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Statements-mispelled.ll
+++ b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Statements-mispelled.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
;
; CHECK: JScop file has no key name 'statements'.
;
diff --git a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Wrong-number-statements.ll b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Wrong-number-statements.ll
index f66fc6c..3fa14c6 100644
--- a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Wrong-number-statements.ll
+++ b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Wrong-number-statements.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
;
; CHECK: The number of indices and the number of statements differ.
;
diff --git a/polly/test/MaximalStaticExpansion/load_after_store_same_statement.ll b/polly/test/MaximalStaticExpansion/load_after_store_same_statement.ll
index 791210f..1d81ff7 100644
--- a/polly/test/MaximalStaticExpansion/load_after_store_same_statement.ll
+++ b/polly/test/MaximalStaticExpansion/load_after_store_same_statement.ll
@@ -1,6 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-mse -polly-print-scops -disable-output < %s | FileCheck %s
; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-mse -polly-print-scops -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1| FileCheck %s --check-prefix=MSE
; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
;
; Verify that the expansion of an array with load after store in a same statement is not done.
diff --git a/polly/test/MaximalStaticExpansion/read_from_original.ll b/polly/test/MaximalStaticExpansion/read_from_original.ll
index 59f9379..57017381 100644
--- a/polly/test/MaximalStaticExpansion/read_from_original.ll
+++ b/polly/test/MaximalStaticExpansion/read_from_original.ll
@@ -1,6 +1,4 @@
-; RUN: opt %loadPolly -polly-mse -polly-print-scops -disable-output < %s | FileCheck %s
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-mse -polly-print-scops -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1| FileCheck %s --check-prefix=MSE
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
;
; Verify that Polly detects problems and does not expand the array
diff --git a/polly/test/MaximalStaticExpansion/too_many_writes.ll b/polly/test/MaximalStaticExpansion/too_many_writes.ll
index 50a66cd..7e33de1 100644
--- a/polly/test/MaximalStaticExpansion/too_many_writes.ll
+++ b/polly/test/MaximalStaticExpansion/too_many_writes.ll
@@ -1,6 +1,4 @@
-; RUN: opt %loadPolly -polly-mse -polly-print-scops -disable-output < %s | FileCheck %s
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-mse -polly-print-scops -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
;
; Verify that Polly detects problems and does not expand the array
diff --git a/polly/test/MaximalStaticExpansion/working_deps_between_inners.ll b/polly/test/MaximalStaticExpansion/working_deps_between_inners.ll
index 8e2707c..355fc02 100644
--- a/polly/test/MaximalStaticExpansion/working_deps_between_inners.ll
+++ b/polly/test/MaximalStaticExpansion/working_deps_between_inners.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-mse -polly-print-scops -disable-output < %s | FileCheck %s
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
;
; Verify that the accesses are correctly expanded for MemoryKind::Array
diff --git a/polly/test/MaximalStaticExpansion/working_deps_between_inners_phi.ll b/polly/test/MaximalStaticExpansion/working_deps_between_inners_phi.ll
index 2bf49b8..9305395 100644
--- a/polly/test/MaximalStaticExpansion/working_deps_between_inners_phi.ll
+++ b/polly/test/MaximalStaticExpansion/working_deps_between_inners_phi.ll
@@ -1,6 +1,4 @@
-; RUN: opt %loadPolly -polly-mse -polly-print-scops -disable-output < %s | FileCheck %s
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-mse -polly-print-scops -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
;
; Verify that the accesses are correctly expanded for MemoryKind::Array and MemoryKind::PHI.
diff --git a/polly/test/MaximalStaticExpansion/working_expansion.ll b/polly/test/MaximalStaticExpansion/working_expansion.ll
index bb5b236..a055e50 100644
--- a/polly/test/MaximalStaticExpansion/working_expansion.ll
+++ b/polly/test/MaximalStaticExpansion/working_expansion.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-mse -polly-print-scops -disable-output < %s | FileCheck %s
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
;
; Verify that the accesses are correctly expanded for MemoryKind::Array
diff --git a/polly/test/MaximalStaticExpansion/working_expansion_multiple_dependences_per_statement.ll b/polly/test/MaximalStaticExpansion/working_expansion_multiple_dependences_per_statement.ll
index 89ff789..77338c9 100644
--- a/polly/test/MaximalStaticExpansion/working_expansion_multiple_dependences_per_statement.ll
+++ b/polly/test/MaximalStaticExpansion/working_expansion_multiple_dependences_per_statement.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-mse -polly-print-scops -disable-output < %s | FileCheck %s
; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
;
; Verify that the accesses are correctly expanded
diff --git a/polly/test/MaximalStaticExpansion/working_expansion_multiple_instruction_per_statement.ll b/polly/test/MaximalStaticExpansion/working_expansion_multiple_instruction_per_statement.ll
index 7ffd39f..9cfa553 100644
--- a/polly/test/MaximalStaticExpansion/working_expansion_multiple_instruction_per_statement.ll
+++ b/polly/test/MaximalStaticExpansion/working_expansion_multiple_instruction_per_statement.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-mse -polly-print-scops -disable-output < %s | FileCheck %s
; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
;
; Verify that the accesses are correctly expanded
diff --git a/polly/test/MaximalStaticExpansion/working_phi_expansion.ll b/polly/test/MaximalStaticExpansion/working_phi_expansion.ll
index 43919c6..63e4d48 100644
--- a/polly/test/MaximalStaticExpansion/working_phi_expansion.ll
+++ b/polly/test/MaximalStaticExpansion/working_phi_expansion.ll
@@ -1,6 +1,4 @@
-; RUN: opt %loadPolly -polly-mse -polly-print-scops -disable-output < %s | FileCheck %s
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-mse -polly-print-scops -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
;
; Verify that the accesses are correctly expanded for MemoryKind::PHI
diff --git a/polly/test/MaximalStaticExpansion/working_phi_two_scalars.ll b/polly/test/MaximalStaticExpansion/working_phi_two_scalars.ll
index a581a38..87bd57a 100644
--- a/polly/test/MaximalStaticExpansion/working_phi_two_scalars.ll
+++ b/polly/test/MaximalStaticExpansion/working_phi_two_scalars.ll
@@ -1,6 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-mse -polly-print-scops -disable-output < %s | FileCheck %s
; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-mse -polly-print-scops -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
;
; Verify that the accesses are correctly expanded for MemoryKind::PHI
diff --git a/polly/test/MaximalStaticExpansion/working_value_expansion.ll b/polly/test/MaximalStaticExpansion/working_value_expansion.ll
index d54eff9..cc28a78 100644
--- a/polly/test/MaximalStaticExpansion/working_value_expansion.ll
+++ b/polly/test/MaximalStaticExpansion/working_value_expansion.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-mse -polly-print-scops -disable-output < %s | FileCheck %s
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
;
; Verify that the accesses are correctly expanded for MemoryKind::Value
diff --git a/polly/test/PruneUnprofitable/prune_only_scalardeps.ll b/polly/test/PruneUnprofitable/prune_only_scalardeps.ll
index 31db556..9cc2aec 100644
--- a/polly/test/PruneUnprofitable/prune_only_scalardeps.ll
+++ b/polly/test/PruneUnprofitable/prune_only_scalardeps.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=false -polly-prune-unprofitable -disable-output -stats < %s 2>&1 | FileCheck -match-full-lines %s
; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=false "-passes=scop(polly-prune-unprofitable)" -disable-output -stats < %s 2>&1 | FileCheck -match-full-lines %s
; REQUIRES: asserts
;
diff --git a/polly/test/ScheduleOptimizer/2012-03-16-Empty-Domain.ll b/polly/test/ScheduleOptimizer/2012-03-16-Empty-Domain.ll
index 5acc353..38facb1 100644
--- a/polly/test/ScheduleOptimizer/2012-03-16-Empty-Domain.ll
+++ b/polly/test/ScheduleOptimizer/2012-03-16-Empty-Domain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -S < %s
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -S < %s
target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-a0:0-n32"
define void @sdbout_label() nounwind {
diff --git a/polly/test/ScheduleOptimizer/2013-04-11-Empty-Domain-two.ll b/polly/test/ScheduleOptimizer/2013-04-11-Empty-Domain-two.ll
index 3f4237b..8359860 100644
--- a/polly/test/ScheduleOptimizer/2013-04-11-Empty-Domain-two.ll
+++ b/polly/test/ScheduleOptimizer/2013-04-11-Empty-Domain-two.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -S < %s
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -S < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; Check that we handle statements with an empty iteration domain correctly.
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll
index a61af2d..5e4ce82 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B) {
entry:
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll
index 185d5c5..de4c387 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW
-; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,OPT
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,OPT
define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B, i32 %k) {
entry:
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll
index f1eca0e..91bd549 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW
-; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK
define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B, i32 %k) {
entry:
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll
index 35903ce..8b69d9e 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW
-; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,OPT
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,OPT
define void @func(i32 %n, ptr noalias nonnull %A) {
entry:
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll
index 1fb8c00..49d1124 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW
-; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK
define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B, i32 %k) {
entry:
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll
index 2db6833..a449a2f 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
define void @func(i32 %n, ptr noalias nonnull %A) {
entry:
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll
index 49d008b..798e9b9 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
define void @func(i32 %n, ptr noalias nonnull %A) {
entry:
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll b/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll
index 175b859..4d0ccc9 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
; This could theoretically be fused by adjusting the offset of the second loop by %k (instead of relying on schedule dimensions).
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll b/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll
index 48ba203..bf470b9 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B, i32 %k) {
entry:
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/disable_nonforced.ll b/polly/test/ScheduleOptimizer/ManualOptimization/disable_nonforced.ll
index 537721f..b0f75dd 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/disable_nonforced.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/disable_nonforced.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-opt-isl -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Check that the disable_nonforced metadata is honored; optimization
; heuristics/rescheduling must not be applied.
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_heuristic.ll b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_heuristic.ll
index aaf4d27..900360d 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_heuristic.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_heuristic.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-reschedule=0 -polly-pragma-based-opts=1 -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines --check-prefix=ON
-; RUN: opt %loadPolly -polly-reschedule=0 -polly-pragma-based-opts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines --check-prefix=OFF
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-pragma-based-opts=1 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines --check-prefix=ON
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-pragma-based-opts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines --check-prefix=OFF
;
define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B) {
entry:
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_looploc.ll b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_looploc.ll
index b1e9422..d45b624 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_looploc.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_looploc.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-reschedule=0 -polly-pragma-based-opts=1 -disable-output < %s 2>&1 | FileCheck %s --match-full-lines
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-reschedule=0 -polly-pragma-based-opts=1 -disable-output < %s 2>&1 | FileCheck %s --match-full-lines
;
; CHECK: warning: distribute_illegal.c:2:3: not applying loop fission/distribution: cannot ensure semantic equivalence due to possible dependency violations
;
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_pragmaloc.ll b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_pragmaloc.ll
index fc0df85..d835e66 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_pragmaloc.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_pragmaloc.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-reschedule=0 -polly-pragma-based-opts=1 -disable-output < %s 2>&1 | FileCheck %s --match-full-lines
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-reschedule=0 -polly-pragma-based-opts=1 -disable-output < %s 2>&1 | FileCheck %s --match-full-lines
;
; CHECK: warning: distribute_illegal.c:1:42: not applying loop fission/distribution: cannot ensure semantic equivalence due to possible dependency violations
;
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_disable.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_disable.ll
index 9537f3a..a5781a7f 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_disable.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_disable.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-pragma-based-opts=1 -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines
+; RUN: opt %loadNPMPolly -polly-pragma-based-opts=1 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines
;
; Override unroll metadata with llvm.loop.unroll.disable.
;
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_double.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_double.ll
index b031097..cccf136 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_double.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_double.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines
;
; Apply two loop transformations. First partial, then full unrolling.
;
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_full.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_full.ll
index b9a4c84..4d49907 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_full.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_full.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines
;
; Full unroll of a loop with 5 iterations.
;
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_heuristic.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_heuristic.ll
index 0387aec..d67472a 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_heuristic.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_heuristic.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-pragma-based-opts=1 -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines
-; RUN: opt %loadPolly -polly-pragma-based-opts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines
+; RUN: opt %loadNPMPolly -polly-pragma-based-opts=1 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines
+; RUN: opt %loadNPMPolly -polly-pragma-based-opts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines
;
; Unrolling with heuristic factor.
; Currently not supported and expected to be handled by LLVM's unroll pass.
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial.ll
index 81e40f0..90101b4 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-pragma-based-opts=1 -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines
-; RUN: opt %loadPolly -polly-pragma-based-opts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefix=OFF --match-full-lines
+; RUN: opt %loadNPMPolly -polly-pragma-based-opts=1 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines
+; RUN: opt %loadNPMPolly -polly-pragma-based-opts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefix=OFF --match-full-lines
;
; Partial unroll by a factor of 4.
;
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial_followup.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial_followup.ll
index 8665f68..4cfa3fb 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial_followup.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial_followup.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefix=OPT --match-full-lines
-; RUN: opt %loadPolly -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST --match-full-lines
-; RUN: opt %loadPolly -polly-opt-isl -polly-codegen -simplifycfg -S < %s | FileCheck %s --check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefix=OPT --match-full-lines
+; RUN: opt %loadNPMPolly '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=AST --match-full-lines
+; RUN: opt %loadNPMPolly '-passes=scop(polly-opt-isl,polly-codegen),simplifycfg' -S < %s | FileCheck %s --check-prefix=CODEGEN
;
; Partial unroll by a factor of 4.
;
@@ -49,7 +49,7 @@ return:
; OPT-NEXT: - filter: "[n] -> { Stmt_body[i0] : (1 + i0) mod 4 = 0 }"
-; AST-LABEL: Printing analysis 'Polly - Generate an AST of the SCoP (isl)'for => return' in function 'func':
+; AST-LABEL: :: isl ast :: func :: %for---%return
; AST: // Loop with Metadata
; AST-NEXT: for (int c0 = 0; c0 < n; c0 += 4) {
diff --git a/polly/test/ScheduleOptimizer/SIMDInParallelFor.ll b/polly/test/ScheduleOptimizer/SIMDInParallelFor.ll
index 8585634..3f6f50e 100644
--- a/polly/test/ScheduleOptimizer/SIMDInParallelFor.ll
+++ b/polly/test/ScheduleOptimizer/SIMDInParallelFor.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-parallel -polly-vectorizer=stripmine -polly-codegen-verify -polly-opt-isl -polly-print-ast -polly-codegen -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-parallel -polly-vectorizer=stripmine -passes=polly-codegen-verify '-passes=polly-opt-isl,print<polly-ast>,polly-codegen' -disable-output < %s | FileCheck %s
;
; Check that there are no nested #pragma omp parallel for inside a
; #pragma omp parallel for loop.
diff --git a/polly/test/ScheduleOptimizer/computeout.ll b/polly/test/ScheduleOptimizer/computeout.ll
index 35e3416..a3286b4 100644
--- a/polly/test/ScheduleOptimizer/computeout.ll
+++ b/polly/test/ScheduleOptimizer/computeout.ll
@@ -1,6 +1,4 @@
-; RUN: opt -S %loadPolly -basic-aa -polly-opt-isl -polly-isl-arg=--no-schedule-serialize-sccs -polly-print-ast -disable-output < %s | FileCheck %s
; RUN: opt -S %loadNPMPolly "-passes=scop(polly-opt-isl,print<polly-ast>)" -polly-isl-arg=--no-schedule-serialize-sccs -disable-output < %s | FileCheck %s
-; RUN: opt -S %loadPolly -basic-aa -polly-opt-isl -polly-isl-arg=--schedule-serialize-sccs -polly-dependences-computeout=1 -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT
; RUN: opt -S %loadNPMPolly "-passes=scop(polly-opt-isl,print<polly-ast>)" -polly-isl-arg=--no-schedule-serialize-sccs -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/ScheduleOptimizer/ensure-correct-tile-sizes.ll b/polly/test/ScheduleOptimizer/ensure-correct-tile-sizes.ll
index 43caca5..928ee85 100644
--- a/polly/test/ScheduleOptimizer/ensure-correct-tile-sizes.ll
+++ b/polly/test/ScheduleOptimizer/ensure-correct-tile-sizes.ll
@@ -1,9 +1,9 @@
-; RUN: opt %loadPolly -polly-process-unprofitable -polly-remarks-minimal \
-; RUN: -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -polly-process-unprofitable -polly-remarks-minimal \
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -polly-pattern-matching-based-opts=true \
; RUN: -polly-target-throughput-vector-fma=1 \
; RUN: -polly-target-latency-vector-fma=1 \
; RUN: -polly-target-vector-register-bitwidth=4096 \
-; RUN: -polly-target-1st-cache-level-associativity=3 -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: -polly-target-1st-cache-level-associativity=3 -disable-output < %s | FileCheck %s
;
; /* Test that Polly does not crash due to configurations that can lead to
; incorrect tile size computations.
diff --git a/polly/test/ScheduleOptimizer/focaltech_test_detail_threshold-7bc17e.ll b/polly/test/ScheduleOptimizer/focaltech_test_detail_threshold-7bc17e.ll
index daa1afd..b533cb8 100644
--- a/polly/test/ScheduleOptimizer/focaltech_test_detail_threshold-7bc17e.ll
+++ b/polly/test/ScheduleOptimizer/focaltech_test_detail_threshold-7bc17e.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-vectorizer=stripmine -polly-invariant-load-hoisting -polly-optimized-scops -polly-print-opt-isl -disable-output < %s | FileCheck %s
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-opt-isl>)" -polly-vectorizer=stripmine -polly-invariant-load-hoisting -disable-output < %s | FileCheck %s
;
; llvm.org/PR46578
diff --git a/polly/test/ScheduleOptimizer/full_partial_tile_separation.ll b/polly/test/ScheduleOptimizer/full_partial_tile_separation.ll
index 06e86d7..3dd579e 100644
--- a/polly/test/ScheduleOptimizer/full_partial_tile_separation.ll
+++ b/polly/test/ScheduleOptimizer/full_partial_tile_separation.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S %loadPolly -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
; CHECK: // 1st level tiling - Tiles
; CHECK-NEXT: #pragma known-parallel
; CHECK-NEXT: for (int c0 = 0; c0 <= floord(ni - 1, 32); c0 += 1)
diff --git a/polly/test/ScheduleOptimizer/line-tiling-2.ll b/polly/test/ScheduleOptimizer/line-tiling-2.ll
index eb374cb..3a2c566 100644
--- a/polly/test/ScheduleOptimizer/line-tiling-2.ll
+++ b/polly/test/ScheduleOptimizer/line-tiling-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-tile-sizes=1,64 -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-tile-sizes=1,64 '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
; CHECK: for (int c0 = 0; c0 <= 1023; c0 += 1)
; CHECK: for (int c1 = 0; c1 <= 7; c1 += 1)
diff --git a/polly/test/ScheduleOptimizer/line-tiling.ll b/polly/test/ScheduleOptimizer/line-tiling.ll
index 2f14ac1..0dbdeff 100644
--- a/polly/test/ScheduleOptimizer/line-tiling.ll
+++ b/polly/test/ScheduleOptimizer/line-tiling.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-tile-sizes=64,1 -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-tile-sizes=64,1 '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
; CHECK: for (int c0 = 0; c0 <= 15; c0 += 1)
; CHECK: for (int c1 = 0; c1 <= 511; c1 += 1)
diff --git a/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll b/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll
index faf51e0..8f270b9 100644
--- a/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll
+++ b/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
; RUN: -polly-target-throughput-vector-fma=1 \
; RUN: -polly-target-latency-vector-fma=8 \
; RUN: -polly-target-1st-cache-level-associativity=8 \
diff --git a/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll b/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll
index 30b693a..de1c815 100644
--- a/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll
+++ b/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
; RUN: -polly-target-throughput-vector-fma=1 \
; RUN: -polly-target-latency-vector-fma=8 \
; RUN: -polly-target-1st-cache-level-associativity=8 \
@@ -6,7 +6,7 @@
; RUN: -polly-target-1st-cache-level-size=32768 \
; RUN: -polly-target-2nd-cache-level-size=262144 \
; RUN: -polly-target-vector-register-bitwidth=256 \
-; RUN: -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
;
; /* C := alpha*A*B + beta*C */
; /* _PB_NK % Kc != 0 */
@@ -18,7 +18,7 @@
; C[i][j] += alpha * A[i][k] * B[k][j];
; }
;
-; CHECK-LABEL: Printing analysis 'Polly - Generate an AST from the SCoP (isl)' for region: 'bb8 => bb32' in function 'kernel_gemm':
+; CHECK-LABEL: :: isl ast :: kernel_gemm :: %bb8---%bb32
; CHECK: {
; CHECK-NEXT: // 1st level tiling - Tiles
; CHECK-NEXT: for (int c0 = 0; c0 <= 32; c0 += 1)
diff --git a/polly/test/ScheduleOptimizer/one-dimensional-band.ll b/polly/test/ScheduleOptimizer/one-dimensional-band.ll
index 4592907..a097d4a 100644
--- a/polly/test/ScheduleOptimizer/one-dimensional-band.ll
+++ b/polly/test/ScheduleOptimizer/one-dimensional-band.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
;
; void jacobi1d(long T, long N, float *A, float *B) {
; long t, i, j;
diff --git a/polly/test/ScheduleOptimizer/outer_coincidence.ll b/polly/test/ScheduleOptimizer/outer_coincidence.ll
index 2ab33ed..7c1af80 100644
--- a/polly/test/ScheduleOptimizer/outer_coincidence.ll
+++ b/polly/test/ScheduleOptimizer/outer_coincidence.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-tiling=0 -polly-parallel -polly-opt-outer-coincidence=no -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-tiling=0 -polly-parallel -polly-opt-outer-coincidence=yes -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=OUTER
+; RUN: opt %loadNPMPolly -polly-tiling=0 -polly-parallel -polly-opt-outer-coincidence=no '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-tiling=0 -polly-parallel -polly-opt-outer-coincidence=yes '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=OUTER
; By skewing, the diagonal can be made parallel. ISL does this when
; the 'outer_coincidence' option is enabled.
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
index 6601116..8228a5c 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadPolly \
+; RUN: opt %loadNPMPolly \
; RUN: -polly-pattern-matching-based-opts=true \
-; RUN: -polly-optree -polly-delicm -polly-simplify \
-; RUN: -polly-opt-isl -polly-tc-opt=true -debug -disable-output < %s 2>&1 \
+; RUN: '-passes=polly-optree,polly-delicm,polly-simplify,polly-opt-isl' \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 \
; RUN: | FileCheck %s
; REQUIRES: asserts
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll
index 95da89f..4bda758 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-delicm -polly-simplify -polly-opt-isl \
+; RUN: opt %loadNPMPolly '-passes=polly-delicm,polly-simplify,polly-opt-isl' \
; RUN: -polly-pattern-matching-based-opts=true \
; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
index 7604257..09118e2 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
@@ -1,8 +1,8 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=false \
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=false \
; RUN: -debug -polly-tc-opt -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -debug -polly-tc-opt -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -polly-ast-detect-parallel -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=PARALLEL-AST
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -stats -disable-output < %s 2>&1 | FileCheck %s --check-prefix=STATS -match-full-lines
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true -debug -polly-tc-opt -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
+; RUN: opt %loadNPMPolly '-passes=polly-opt-isl,print<polly-ast>' -polly-pattern-matching-based-opts=true -polly-ast-detect-parallel -disable-output < %s | FileCheck %s --check-prefix=PARALLEL-AST
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true -stats -disable-output < %s 2>&1 | FileCheck %s --check-prefix=STATS -match-full-lines
; REQUIRES: asserts
;
; /* C := alpha*A*B + beta*C */
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll
index ccdb39b..b771d1f 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop \
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-opt-isl' \
; RUN: -polly-import-jscop-postfix=transformed \
; RUN: -polly-pattern-matching-based-opts=true \
; RUN: -polly-target-throughput-vector-fma=1 \
@@ -8,7 +8,7 @@
; RUN: -polly-target-1st-cache-level-size=32768 \
; RUN: -polly-target-vector-register-bitwidth=256 \
; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -polly-opt-isl -debug \
+; RUN: -debug \
; RUN: -polly-tc-opt=true -disable-output < %s 2>&1 \
; RUN: | FileCheck %s
; REQUIRES: asserts
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_12.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_12.ll
index dd39fec..238f6dd 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_12.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_12.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
; RUN: -polly-target-throughput-vector-fma=1 \
; RUN: -polly-target-latency-vector-fma=8 \
; RUN: -polly-target-1st-cache-level-associativity=8 \
@@ -6,7 +6,7 @@
; RUN: -polly-target-1st-cache-level-size=32768 \
; RUN: -polly-target-vector-register-bitwidth=256 \
; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -polly-opt-isl -disable-output < %s
+; RUN: -passes=polly-opt-isl -disable-output < %s
;
; Test whether isolation works as expected.
;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll
index e086dd3..0e4540e 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
; RUN: -polly-target-throughput-vector-fma=2 \
; RUN: -polly-target-latency-vector-fma=8 \
; RUN: -polly-target-1st-cache-level-associativity=8 \
@@ -6,7 +6,7 @@
; RUN: -polly-target-1st-cache-level-size=32768 \
; RUN: -polly-target-vector-register-bitwidth=128 \
; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
;
; Test whether isolation works as expected.
;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_14.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_14.ll
index a4c71c2..9678ad83 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_14.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_14.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-opt-isl \
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-opt-isl,polly-codegen' \
; RUN: -polly-target-throughput-vector-fma=1 \
; RUN: -polly-target-latency-vector-fma=8 \
; RUN: -polly-target-1st-cache-level-associativity=8 \
@@ -6,7 +6,7 @@
; RUN: -polly-target-1st-cache-level-size=32768 \
; RUN: -polly-target-vector-register-bitwidth=256 \
; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -polly-import-jscop-postfix=transformed -polly-codegen -S < %s \
+; RUN: -polly-import-jscop-postfix=transformed -S < %s \
; RUN: | FileCheck %s
;
; Check that we disable the Loop Vectorizer.
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll
index a8da219..e74884d 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
; RUN: -debug-only=polly-opt-isl -disable-output \
; RUN: -polly-tc-opt=true < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll
index c1ad301..9c99a09 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll
index 002816a..8e14035 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll
index d5679c7..4f562c3 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll
index 4e1620a..32ded89 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll
index 01e336eb..f0c0177 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll
index 0be08d8..155177b 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_21.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_21.ll
index 9b2df49..3d21ac3 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_21.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_21.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_22.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_22.ll
index 3d3641d..00a4bf8 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_22.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_22.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_24.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_24.ll
index 8959614..bfe5c52 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_24.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_24.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-reschedule=0 -polly-opt-isl \
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -passes=polly-opt-isl \
; RUN: -polly-pattern-matching-based-opts=true -polly-tc-opt=true \
; RUN: -debug -disable-output < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_25.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_25.ll
index 8a39579..a2e1ced 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_25.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_25.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
;
@@ -53,4 +53,4 @@ for.body8: ; preds = %for.body8, %for.con
br i1 %exitcond.not, label %for.cond.cleanup7, label %for.body8
}
-declare double @llvm.fmuladd.f64(double, double, double) \ No newline at end of file
+declare double @llvm.fmuladd.f64(double, double, double)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
index fab3ac5..9844d37 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
@@ -1,11 +1,11 @@
-; RUN: opt %loadPolly -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
; RUN: -polly-target-throughput-vector-fma=1 \
; RUN: -polly-target-latency-vector-fma=8 \
; RUN: -polly-target-1st-cache-level-size=0 \
; RUN: -polly-target-vector-register-bitwidth=256 \
-; RUN: -polly-opt-isl -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadPolly -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
; RUN: -polly-target-throughput-vector-fma=1 \
; RUN: -polly-target-latency-vector-fma=8 \
; RUN: -polly-target-1st-cache-level-associativity=8 \
@@ -13,7 +13,7 @@
; RUN: -polly-target-1st-cache-level-size=32768 \
; RUN: -polly-target-vector-register-bitwidth=256 \
; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -polly-opt-isl -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=EXTRACTION-OF-MACRO-KERNEL
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=EXTRACTION-OF-MACRO-KERNEL
;
; /* C := alpha*A*B + beta*C */
; for (i = 0; i < _PB_NI; i++)
@@ -24,7 +24,7 @@
; C[i][j] += alpha * A[i][k] * B[k][j];
; }
;
-; CHECK-LABEL: Printing analysis 'Polly - Generate an AST from the SCoP (isl)' for region: 'bb8 => bb32' in function 'kernel_gemm':
+; CHECK-LABEL: :: isl ast :: kernel_gemm :: %bb8---%bb32
; CHECK: {
; CHECK-NEXT: // 1st level tiling - Tiles
; CHECK-NEXT: for (int c0 = 0; c0 <= 32; c0 += 1)
@@ -76,7 +76,7 @@
; CHECK-NEXT: }
; CHECK-NEXT: }
;
-; EXTRACTION-OF-MACRO-KERNEL-LABEL: Printing analysis 'Polly - Generate an AST from the SCoP (isl)' for region: 'bb8 => bb32' in function 'kernel_gemm':
+; EXTRACTION-OF-MACRO-KERNEL-LABEL: :: isl ast :: kernel_gemm :: %bb8---%bb32
; EXTRACTION-OF-MACRO-KERNEL: {
; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Tiles
; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c0 = 0; c0 <= 32; c0 += 1)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll
index dc0edc6..250641d 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll
@@ -1,12 +1,12 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
; RUN: -debug -polly-tc-opt=true -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly '-passes=polly-opt-isl,print<polly-ast>' -polly-pattern-matching-based-opts=true \
; RUN: -polly-target-throughput-vector-fma=1 \
; RUN: -polly-target-latency-vector-fma=8 \
; RUN: -polly-target-1st-cache-level-size=32768 \
; RUN: -polly-target-vector-register-bitwidth=256 \
-; RUN: -polly-target-2nd-cache-level-size=262144 -polly-print-ast \
-; RUN: -polly-tc-opt=true -disable-output -polly-opt-isl < %s | \
+; RUN: -polly-target-2nd-cache-level-size=262144 \
+; RUN: -polly-tc-opt=true -disable-output < %s | \
; RUN: FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
; REQUIRES: asserts
;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_5.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_5.ll
index 6581566..ad2c195 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_5.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_5.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
; RUN: -polly-target-throughput-vector-fma=1 \
; RUN: -polly-target-latency-vector-fma=8 \
; RUN: -polly-target-1st-cache-level-associativity=8 \
@@ -6,12 +6,12 @@
; RUN: -polly-target-1st-cache-level-size=32768 \
; RUN: -polly-target-vector-register-bitwidth=256 \
; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
;
-; opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
; -polly-target-throughput-vector-fma=1 \
; -polly-target-latency-vector-fma=8 \
-; -polly-codegen -polly-target-1st-cache-level-associativity=8 \
+; -passes=polly-codegen -polly-target-1st-cache-level-associativity=8 \
; -polly-target-2nd-cache-level-associativity=8 \
; -polly-target-1st-cache-level-size=32768 \
; -polly-target-vector-register-bitwidth=256 \
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_6.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_6.ll
index bcf1fc9..1d3cdbd 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_6.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_6.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
; RUN: -polly-target-throughput-vector-fma=1 \
; RUN: -polly-target-latency-vector-fma=8 \
; RUN: -polly-target-1st-cache-level-associativity=8 \
@@ -6,12 +6,12 @@
; RUN: -polly-target-1st-cache-level-size=32768 \
; RUN: -polly-target-vector-register-bitwidth=256 \
; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
;
-; opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
; -polly-target-throughput-vector-fma=1 \
; -polly-target-latency-vector-fma=8 \
-; -polly-codegen -polly-target-1st-cache-level-associativity=8 \
+; -passes=polly-codegen -polly-target-1st-cache-level-associativity=8 \
; -polly-target-2nd-cache-level-associativity=8 \
; -polly-target-1st-cache-level-size=32768 \
; -polly-target-vector-register-bitwidth=256 \
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_7.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_7.ll
index 77a3e02..59eaa4a 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_7.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_7.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
; RUN: -polly-target-throughput-vector-fma=1 \
; RUN: -polly-target-latency-vector-fma=8 \
; RUN: -polly-target-1st-cache-level-associativity=8 \
@@ -6,7 +6,7 @@
; RUN: -polly-target-1st-cache-level-size=32768 \
; RUN: -polly-target-vector-register-bitwidth=256 \
; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
;
; /* C := A * B + C */
; /* Elements of the matrices A, B, C have the float type. */
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_8.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_8.ll
index d02bc35..2544d50 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_8.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_8.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
; RUN: -polly-target-throughput-vector-fma=1 \
; RUN: -polly-target-latency-vector-fma=8 \
; RUN: -polly-target-1st-cache-level-associativity=8 \
@@ -6,7 +6,7 @@
; RUN: -polly-target-1st-cache-level-size=32768 \
; RUN: -polly-target-vector-register-bitwidth=256 \
; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
;
; /* C := A * B + C */
; /* Elements of the matrices B, C have the double type. */
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_9.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_9.ll
index 144abfd..85c1435 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_9.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_9.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
; RUN: -polly-target-throughput-vector-fma=1 \
; RUN: -polly-target-latency-vector-fma=8 \
; RUN: -polly-target-1st-cache-level-associativity=8 \
@@ -6,9 +6,9 @@
; RUN: -polly-target-1st-cache-level-size=32768 \
; RUN: -polly-target-vector-register-bitwidth=256 \
; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -polly-opt-isl -disable-output < %s
+; RUN: -passes=polly-opt-isl -disable-output < %s
;
-; RUN: opt %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s --check-prefix=DEPENDENCES
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s --check-prefix=DEPENDENCES
;
; /* C := A * B + C */
; /* Elements of the matrices A, B, C have the char type. */
diff --git a/polly/test/ScheduleOptimizer/pattern_matching_based_opts_splitmap.ll b/polly/test/ScheduleOptimizer/pattern_matching_based_opts_splitmap.ll
index 5b9783d..6428589 100644
--- a/polly/test/ScheduleOptimizer/pattern_matching_based_opts_splitmap.ll
+++ b/polly/test/ScheduleOptimizer/pattern_matching_based_opts_splitmap.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-opt-isl -debug-only=polly-opt-isl -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -passes=polly-opt-isl -debug-only=polly-opt-isl -disable-output < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
;
; void pattern_matching_based_opts_splitmap(double C[static const restrict 2][2], double A[static const restrict 2][784], double B[static const restrict 784][2]) {
diff --git a/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll b/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll
index fea2155..a18ba1d 100644
--- a/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll
+++ b/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S %loadPolly -basic-aa -polly-tiling=false -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-tiling=false -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@C = common global [1536 x [1536 x float]] zeroinitializer, align 16
diff --git a/polly/test/ScheduleOptimizer/prevectorization.ll b/polly/test/ScheduleOptimizer/prevectorization.ll
index 385ebf1..4db61ad 100644
--- a/polly/test/ScheduleOptimizer/prevectorization.ll
+++ b/polly/test/ScheduleOptimizer/prevectorization.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S %loadPolly -basic-aa -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt -S %loadPolly -basic-aa -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine -polly-prevect-width=16 -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=VEC16
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine -polly-prevect-width=16 '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=VEC16
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/ScheduleOptimizer/rectangular-tiling.ll b/polly/test/ScheduleOptimizer/rectangular-tiling.ll
index b527255..e1d768b 100644
--- a/polly/test/ScheduleOptimizer/rectangular-tiling.ll
+++ b/polly/test/ScheduleOptimizer/rectangular-tiling.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadPolly -polly-tile-sizes=256,16 -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-tile-sizes=256,16 -polly-tiling=false -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=NOTILING
-; RUN: opt %loadPolly -polly-tile-sizes=256,16 -polly-2nd-level-tiling -polly-2nd-level-tile-sizes=16,8 -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=TWOLEVEL
-; RUN: opt %loadPolly -polly-tile-sizes=256,16 -polly-2nd-level-tiling -polly-2nd-level-tile-sizes=16,8 -polly-register-tiling -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=TWO-PLUS-REGISTER
+; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 -polly-tiling=false '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=NOTILING
+; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 -polly-2nd-level-tiling -polly-2nd-level-tile-sizes=16,8 '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=TWOLEVEL
+; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 -polly-2nd-level-tiling -polly-2nd-level-tile-sizes=16,8 -polly-register-tiling '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=TWO-PLUS-REGISTER
; CHECK: // 1st level tiling - Tiles
; CHECK: for (int c0 = 0; c0 <= 3; c0 += 1)
diff --git a/polly/test/ScheduleOptimizer/schedule_computeout.ll b/polly/test/ScheduleOptimizer/schedule_computeout.ll
index acc8601..1e1359e 100644
--- a/polly/test/ScheduleOptimizer/schedule_computeout.ll
+++ b/polly/test/ScheduleOptimizer/schedule_computeout.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-optree -polly-delicm -polly-opt-isl -polly-schedule-computeout=10000 -debug-only="polly-opt-isl" < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-optree -passes=polly-delicm -passes=polly-opt-isl -polly-schedule-computeout=10000 -debug-only="polly-opt-isl" < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
; Bail out if the schedule computation exceeds the max scheduling quota.
diff --git a/polly/test/ScheduleOptimizer/statistics.ll b/polly/test/ScheduleOptimizer/statistics.ll
index 472febe..84eb593 100644
--- a/polly/test/ScheduleOptimizer/statistics.ll
+++ b/polly/test/ScheduleOptimizer/statistics.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -stats -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -stats -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
; REQUIRES: asserts
diff --git a/polly/test/ScheduleOptimizer/tile_after_fusion.ll b/polly/test/ScheduleOptimizer/tile_after_fusion.ll
index 8e58492..50a46d6 100644
--- a/polly/test/ScheduleOptimizer/tile_after_fusion.ll
+++ b/polly/test/ScheduleOptimizer/tile_after_fusion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-isl-arg=--no-schedule-serialize-sccs -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-isl-arg=--no-schedule-serialize-sccs '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
;
;
; void tf(int C[256][256][256], int A0[256][256][256], int A1[256][256][256]) {
@@ -17,7 +17,7 @@
; checks whether they are tiled after being fused when polly-opt-fusion equals
; "max".
;
-; CHECK-LABEL: Printing analysis 'Polly - Generate an AST from the SCoP (isl)' for region: 'for.cond => for.end56' in function 'tf':
+; CHECK-LABEL: :: isl ast :: tf :: %for.cond---%for.end56
; CHECK: 1st level tiling - Tiles
; CHECK-NEXT: for (int c0 = 0; c0 <= 7; c0 += 1)
; CHECK-NEXT: for (int c1 = 0; c1 <= 7; c1 += 1)
diff --git a/polly/test/ScheduleOptimizer/vivid-vbi-gen-vivid_vbi_gen_sliced-before-llvmreduced.ll b/polly/test/ScheduleOptimizer/vivid-vbi-gen-vivid_vbi_gen_sliced-before-llvmreduced.ll
index d08595d..e59a316 100644
--- a/polly/test/ScheduleOptimizer/vivid-vbi-gen-vivid_vbi_gen_sliced-before-llvmreduced.ll
+++ b/polly/test/ScheduleOptimizer/vivid-vbi-gen-vivid_vbi_gen_sliced-before-llvmreduced.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-vectorizer=stripmine -polly-isl-arg=--no-schedule-serialize-sccs -polly-tiling=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-vectorizer=stripmine -polly-isl-arg=--no-schedule-serialize-sccs -polly-tiling=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
; isl_schedule_node_band_sink may sink into multiple children.
; https://llvm.org/PR52637
diff --git a/polly/test/ScopDetect/aliasing_parametric_simple_1.ll b/polly/test/ScopDetect/aliasing_parametric_simple_1.ll
index 2eddbd4..cee1c06 100644
--- a/polly/test/ScopDetect/aliasing_parametric_simple_1.ll
+++ b/polly/test/ScopDetect/aliasing_parametric_simple_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Valid Region for Scop:
;
diff --git a/polly/test/ScopDetect/aliasing_parametric_simple_2.ll b/polly/test/ScopDetect/aliasing_parametric_simple_2.ll
index c111f68..5506b3c 100644
--- a/polly/test/ScopDetect/aliasing_parametric_simple_2.ll
+++ b/polly/test/ScopDetect/aliasing_parametric_simple_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Valid Region for Scop:
;
diff --git a/polly/test/ScopDetect/aliasing_simple_1.ll b/polly/test/ScopDetect/aliasing_simple_1.ll
index 524ca19..5f43ec1 100644
--- a/polly/test/ScopDetect/aliasing_simple_1.ll
+++ b/polly/test/ScopDetect/aliasing_simple_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Valid Region for Scop:
;
diff --git a/polly/test/ScopDetect/aliasing_simple_2.ll b/polly/test/ScopDetect/aliasing_simple_2.ll
index 457df99..e853dfc 100644
--- a/polly/test/ScopDetect/aliasing_simple_2.ll
+++ b/polly/test/ScopDetect/aliasing_simple_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Valid Region for Scop:
;
diff --git a/polly/test/ScopDetect/base_pointer_load_setNewAccessRelation.ll b/polly/test/ScopDetect/base_pointer_load_setNewAccessRelation.ll
index 0411aed..eeb9e11 100644
--- a/polly/test/ScopDetect/base_pointer_load_setNewAccessRelation.ll
+++ b/polly/test/ScopDetect/base_pointer_load_setNewAccessRelation.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true -polly-scops -polly-print-import-jscop -polly-codegen -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>,scop(polly-import-jscop,polly-codegen)' -disable-output < %s 2>&1 | FileCheck %s
;
; This violated an assertion in setNewAccessRelation that assumed base pointers
; to be load-hoisted. Without this assertion, codegen would generate invalid
diff --git a/polly/test/ScopDetect/base_pointer_setNewAccessRelation.ll b/polly/test/ScopDetect/base_pointer_setNewAccessRelation.ll
index ff9be6e..16976e6 100644
--- a/polly/test/ScopDetect/base_pointer_setNewAccessRelation.ll
+++ b/polly/test/ScopDetect/base_pointer_setNewAccessRelation.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -disable-basic-aa -polly-detect -polly-print-import-jscop -polly-codegen -disable-output < %s | FileCheck %s --allow-empty
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,scop(polly-import-jscop,polly-codegen)' -disable-output < %s 2>&1 | FileCheck %s --allow-empty
;
; Polly codegen used to generate invalid code (referring to %ptr from the
; original region) when regeneration of the access function is necessary.
diff --git a/polly/test/ScopDetect/callbr.ll b/polly/test/ScopDetect/callbr.ll
index d65ab93..4182974 100644
--- a/polly/test/ScopDetect/callbr.ll
+++ b/polly/test/ScopDetect/callbr.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-detect -polly-detect-track-failures -disable-output -pass-remarks-missed=polly-detect < %s 2>&1 | FileCheck %s --check-prefix=REMARK
-; RUN: opt %loadPolly -polly-detect -polly-detect-track-failures -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=STAT
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -polly-detect-track-failures -disable-output -pass-remarks-missed=polly-detect < %s 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -polly-detect-track-failures -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=STAT
; REQUIRES: asserts
; REMARK: Branch from indirect terminator.
diff --git a/polly/test/ScopDetect/collective_invariant_loads.ll b/polly/test/ScopDetect/collective_invariant_loads.ll
index f1d2eea..f451bcc 100644
--- a/polly/test/ScopDetect/collective_invariant_loads.ll
+++ b/polly/test/ScopDetect/collective_invariant_loads.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting -disable-output< %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting -disable-output< %s 2>&1 | FileCheck %s
;CHECK: Function: test_init_chpl
;CHECK-NEXT: Region: %bb1---%bb16
diff --git a/polly/test/ScopDetect/cross_loop_non_single_exit.ll b/polly/test/ScopDetect/cross_loop_non_single_exit.ll
index ae23930..fe39221 100644
--- a/polly/test/ScopDetect/cross_loop_non_single_exit.ll
+++ b/polly/test/ScopDetect/cross_loop_non_single_exit.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; void f(long A[], long N) {
; long i;
diff --git a/polly/test/ScopDetect/cross_loop_non_single_exit_2.ll b/polly/test/ScopDetect/cross_loop_non_single_exit_2.ll
index 5c25da6..4cac173 100644
--- a/polly/test/ScopDetect/cross_loop_non_single_exit_2.ll
+++ b/polly/test/ScopDetect/cross_loop_non_single_exit_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; void f(long A[], long N) {
; long i;
diff --git a/polly/test/ScopDetect/dependency_to_phi_node_outside_of_region.ll b/polly/test/ScopDetect/dependency_to_phi_node_outside_of_region.ll
index 12983d2..7d74764 100644
--- a/polly/test/ScopDetect/dependency_to_phi_node_outside_of_region.ll
+++ b/polly/test/ScopDetect/dependency_to_phi_node_outside_of_region.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-detect -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
define void @f(ptr %A, i64 %N, i64 %M) nounwind {
diff --git a/polly/test/ScopDetect/dot-scops-npm.ll b/polly/test/ScopDetect/dot-scops-npm.ll
index 7c8be03..d14bf8a 100644
--- a/polly/test/ScopDetect/dot-scops-npm.ll
+++ b/polly/test/ScopDetect/dot-scops-npm.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly "-passes=polly-scop-printer" -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=polly-scop-printer' -disable-output < %s
; RUN: FileCheck %s -input-file=scops.func_npm.dot
;
; Check that the ScopPrinter does not crash.
diff --git a/polly/test/ScopDetect/dot-scops.ll b/polly/test/ScopDetect/dot-scops.ll
index c31562e..63163b2 100644
--- a/polly/test/ScopDetect/dot-scops.ll
+++ b/polly/test/ScopDetect/dot-scops.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-scops -dot-scops -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>,polly-scop-printer' -disable-output < %s
;
; Check that the ScopPrinter does not crash.
; ScopPrinter needs the ScopDetection pass, which should depend on
diff --git a/polly/test/ScopDetect/error-block-always-executed.ll b/polly/test/ScopDetect/error-block-always-executed.ll
index 894be21..d799d57 100644
--- a/polly/test/ScopDetect/error-block-always-executed.ll
+++ b/polly/test/ScopDetect/error-block-always-executed.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK-NOT: Valid Region for Scop:
diff --git a/polly/test/ScopDetect/error-block-referenced-from-scop.ll b/polly/test/ScopDetect/error-block-referenced-from-scop.ll
index 0853514..ba271f3 100644
--- a/polly/test/ScopDetect/error-block-referenced-from-scop.ll
+++ b/polly/test/ScopDetect/error-block-referenced-from-scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK-NOT: Valid Region for Scop:
diff --git a/polly/test/ScopDetect/error-block-unreachable.ll b/polly/test/ScopDetect/error-block-unreachable.ll
index 48f6fe8..6ba7698a 100644
--- a/polly/test/ScopDetect/error-block-unreachable.ll
+++ b/polly/test/ScopDetect/error-block-unreachable.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-detect -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s
; Verify that the scop detection does not crash on inputs with unreachable
; blocks. Earlier we crashed when detecting error blocks.
diff --git a/polly/test/ScopDetect/expand-region-correctly-2.ll b/polly/test/ScopDetect/expand-region-correctly-2.ll
index fadb503..df35d056 100644
--- a/polly/test/ScopDetect/expand-region-correctly-2.ll
+++ b/polly/test/ScopDetect/expand-region-correctly-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Valid Region for Scop: if.end.1631 => for.cond.1647.outer
;
diff --git a/polly/test/ScopDetect/expand-region-correctly.ll b/polly/test/ScopDetect/expand-region-correctly.ll
index 72082a3..a8c90c0 100644
--- a/polly/test/ScopDetect/expand-region-correctly.ll
+++ b/polly/test/ScopDetect/expand-region-correctly.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; CHECK: Valid Region for Scop: if.end.1631 => for.cond.1647.outer
diff --git a/polly/test/ScopDetect/ignore_func_flag_regex.ll b/polly/test/ScopDetect/ignore_func_flag_regex.ll
index 224126e..a75e705 100644
--- a/polly/test/ScopDetect/ignore_func_flag_regex.ll
+++ b/polly/test/ScopDetect/ignore_func_flag_regex.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-ignore-func=f.*,g.* -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-ignore-func=f.*,g.* '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that the flag `-polly-ignore-func` works with regexes.
;
diff --git a/polly/test/ScopDetect/index_from_unpredictable_loop.ll b/polly/test/ScopDetect/index_from_unpredictable_loop.ll
index 27ed64d..f6d6cfa 100644
--- a/polly/test/ScopDetect/index_from_unpredictable_loop.ll
+++ b/polly/test/ScopDetect/index_from_unpredictable_loop.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=AFFINE
-; RUN: opt %loadPolly -polly-allow-nonaffine -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s | FileCheck %s --check-prefix=AFFINE
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE
; The SCoP contains a loop with multiple exit blocks (BBs after leaving
; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopDetect/index_from_unpredictable_loop2.ll b/polly/test/ScopDetect/index_from_unpredictable_loop2.ll
index 9b5a3a4..16d4761 100644
--- a/polly/test/ScopDetect/index_from_unpredictable_loop2.ll
+++ b/polly/test/ScopDetect/index_from_unpredictable_loop2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=AFFINE
-; RUN: opt %loadPolly -polly-allow-nonaffine -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s | FileCheck %s --check-prefix=AFFINE
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE
; The SCoP contains a loop with multiple exit blocks (BBs after leaving
; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopDetect/indvars.ll b/polly/test/ScopDetect/indvars.ll
index 2ba4d1f..3fbc4d6 100644
--- a/polly/test/ScopDetect/indvars.ll
+++ b/polly/test/ScopDetect/indvars.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -polly-codegen -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,scop(polly-codegen)' -disable-output < %s 2>&1 | FileCheck %s
;
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/ScopDetect/intrinsics_1.ll b/polly/test/ScopDetect/intrinsics_1.ll
index 65d3968..0f9c700 100644
--- a/polly/test/ScopDetect/intrinsics_1.ll
+++ b/polly/test/ScopDetect/intrinsics_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Valid Region for Scop: for.cond => for.end
;
diff --git a/polly/test/ScopDetect/intrinsics_2.ll b/polly/test/ScopDetect/intrinsics_2.ll
index f057551..1db9807 100644
--- a/polly/test/ScopDetect/intrinsics_2.ll
+++ b/polly/test/ScopDetect/intrinsics_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; Verify that we allow the lifetime markers for the tmp array.
;
diff --git a/polly/test/ScopDetect/intrinsics_3.ll b/polly/test/ScopDetect/intrinsics_3.ll
index bce90d1..a230d0a 100644
--- a/polly/test/ScopDetect/intrinsics_3.ll
+++ b/polly/test/ScopDetect/intrinsics_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; Verify that we allow the misc intrinsics.
;
diff --git a/polly/test/ScopDetect/invalid-latch-conditions.ll b/polly/test/ScopDetect/invalid-latch-conditions.ll
index eb80974..db4898c 100644
--- a/polly/test/ScopDetect/invalid-latch-conditions.ll
+++ b/polly/test/ScopDetect/invalid-latch-conditions.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-process-unprofitable=false -polly-print-detect -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-allow-nonaffine-loops -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=NALOOPS
-; RUN: opt %loadPolly -polly-allow-nonaffine-loops -polly-process-unprofitable=false -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=PROFIT
+; RUN: opt %loadNPMPolly -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NALOOPS
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
; The latch conditions of the outer loop are not affine, thus the loop cannot be
; handled by the domain generation and needs to be overapproximated.
diff --git a/polly/test/ScopDetect/invalidate_scalar_evolution.ll b/polly/test/ScopDetect/invalidate_scalar_evolution.ll
index 01d34c4..ddef510 100644
--- a/polly/test/ScopDetect/invalidate_scalar_evolution.ll
+++ b/polly/test/ScopDetect/invalidate_scalar_evolution.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s -check-prefix=PHI
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PHI
; void f(long A[], long N) {
; long i;
diff --git a/polly/test/ScopDetect/invariant-load-before-scop.ll b/polly/test/ScopDetect/invariant-load-before-scop.ll
index f72085f..1047964 100644
--- a/polly/test/ScopDetect/invariant-load-before-scop.ll
+++ b/polly/test/ScopDetect/invariant-load-before-scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
;
; The LoadInst %.b761 is defined outside the SCoP, hence is always constant
; within it. It is no "required invariant load".
diff --git a/polly/test/ScopDetect/keep_going_expansion.ll b/polly/test/ScopDetect/keep_going_expansion.ll
index 9bcfb39..074aae9 100644
--- a/polly/test/ScopDetect/keep_going_expansion.ll
+++ b/polly/test/ScopDetect/keep_going_expansion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-detect-track-failures -polly-detect-keep-going -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-detect-track-failures -polly-detect-keep-going '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/ScopDetect/mod_ref_read_pointer.ll b/polly/test/ScopDetect/mod_ref_read_pointer.ll
index 95a4649..64535d8 100644
--- a/polly/test/ScopDetect/mod_ref_read_pointer.ll
+++ b/polly/test/ScopDetect/mod_ref_read_pointer.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -basic-aa -polly-allow-modref-calls -polly-print-detect -disable-output < %s | FileCheck %s -check-prefix=MODREF
-; RUN: opt %loadPolly -basic-aa -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-modref-calls '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=MODREF
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK-NOT: Valid Region for Scop: for.body => for.end
; MODREF: Valid Region for Scop: for.body => for.end
diff --git a/polly/test/ScopDetect/more-than-one-loop.ll b/polly/test/ScopDetect/more-than-one-loop.ll
index bfd226c..3009065 100644
--- a/polly/test/ScopDetect/more-than-one-loop.ll
+++ b/polly/test/ScopDetect/more-than-one-loop.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-process-unprofitable=false -polly-print-detect -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-process-unprofitable=true -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable=true '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; CHECK: Valid Region for Scop:
diff --git a/polly/test/ScopDetect/multidim-with-undef-size.ll b/polly/test/ScopDetect/multidim-with-undef-size.ll
index 9973c6c..2a5f8b1 100644
--- a/polly/test/ScopDetect/multidim-with-undef-size.ll
+++ b/polly/test/ScopDetect/multidim-with-undef-size.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; CHECK: Valid Region for Scop: bb14 => bb17
diff --git a/polly/test/ScopDetect/multidim.ll b/polly/test/ScopDetect/multidim.ll
index f436988..9120237 100644
--- a/polly/test/ScopDetect/multidim.ll
+++ b/polly/test/ScopDetect/multidim.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; CHECK: Valid Region for Scop: bb19 => bb20
diff --git a/polly/test/ScopDetect/multidim_indirect_access.ll b/polly/test/ScopDetect/multidim_indirect_access.ll
index 3e06251..a9cd446 100644
--- a/polly/test/ScopDetect/multidim_indirect_access.ll
+++ b/polly/test/ScopDetect/multidim_indirect_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that we will recognize this SCoP.
;
diff --git a/polly/test/ScopDetect/multidim_two_accesses_different_delinearization.ll b/polly/test/ScopDetect/multidim_two_accesses_different_delinearization.ll
index ed554a2..9c91fbf 100644
--- a/polly/test/ScopDetect/multidim_two_accesses_different_delinearization.ll
+++ b/polly/test/ScopDetect/multidim_two_accesses_different_delinearization.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; Derived from the following code:
diff --git a/polly/test/ScopDetect/nested_loop_single_exit.ll b/polly/test/ScopDetect/nested_loop_single_exit.ll
index 377e808..a074211 100644
--- a/polly/test/ScopDetect/nested_loop_single_exit.ll
+++ b/polly/test/ScopDetect/nested_loop_single_exit.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s
; void f(long A[], long N) {
; long i, j;
diff --git a/polly/test/ScopDetect/non-affine-conditional.ll b/polly/test/ScopDetect/non-affine-conditional.ll
index fc2d0c0..e74619c 100644
--- a/polly/test/ScopDetect/non-affine-conditional.ll
+++ b/polly/test/ScopDetect/non-affine-conditional.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine-branches -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int *A) {
; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/ScopDetect/non-affine-float-compare.ll b/polly/test/ScopDetect/non-affine-float-compare.ll
index 984f14a..9326cd4 100644
--- a/polly/test/ScopDetect/non-affine-float-compare.ll
+++ b/polly/test/ScopDetect/non-affine-float-compare.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine-branches -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(float *A) {
; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access.ll b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access.ll
index 068367f..1ab6b35 100644
--- a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access.ll
+++ b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-process-unprofitable=false -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=PROFIT
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
;
; Here we have a non-affine loop but also a non-affine access which should
; be rejected as long as -polly-allow-nonaffine isn't given.
diff --git a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_2.ll b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_2.ll
index cd21405..921f6ab 100644
--- a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_2.ll
+++ b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_2.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES
;
; Here we have a non-affine loop (in the context of the loop nest)
; and also a non-affine access (A[k]). While we can always detect the
diff --git a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_3.ll b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_3.ll
index fb93621..78774d9 100644
--- a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_3.ll
+++ b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_3.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES
;
; Here we have a non-affine loop (in the context of the loop nest)
; and also a non-affine access (A[k]). While we can always detect the
diff --git a/polly/test/ScopDetect/non-affine-loop.ll b/polly/test/ScopDetect/non-affine-loop.ll
index d5f7ea1..5136b3b 100644
--- a/polly/test/ScopDetect/non-affine-loop.ll
+++ b/polly/test/ScopDetect/non-affine-loop.ll
@@ -1,8 +1,8 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
-; RUN: opt %loadPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
-; RUN: opt %loadPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false -polly-allow-nonaffine -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=ALLOWNONAFFINEREGIONSANDACCESSES
-; RUN: opt %loadPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES
-; RUN: opt %loadPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-process-unprofitable=false -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=PROFIT
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINEREGIONSANDACCESSES
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
;
; This function/region does contain a loop, however it is non-affine, hence the access
; A[i] is also. Furthermore, it is the only loop, thus when we over approximate
diff --git a/polly/test/ScopDetect/non-beneficial-loops-small-trip-count.ll b/polly/test/ScopDetect/non-beneficial-loops-small-trip-count.ll
index 43af168..fd52c5d 100644
--- a/polly/test/ScopDetect/non-beneficial-loops-small-trip-count.ll
+++ b/polly/test/ScopDetect/non-beneficial-loops-small-trip-count.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-process-unprofitable=false -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK-NOT: Valid
;
diff --git a/polly/test/ScopDetect/non-constant-add-rec-start-expr.ll b/polly/test/ScopDetect/non-constant-add-rec-start-expr.ll
index 4cddcc9..d0c1f7a 100644
--- a/polly/test/ScopDetect/non-constant-add-rec-start-expr.ll
+++ b/polly/test/ScopDetect/non-constant-add-rec-start-expr.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; CHECK: Valid Region for Scop: bb11 => bb25
diff --git a/polly/test/ScopDetect/non-simple-memory-accesses.ll b/polly/test/ScopDetect/non-simple-memory-accesses.ll
index a822289..bdc4898 100644
--- a/polly/test/ScopDetect/non-simple-memory-accesses.ll
+++ b/polly/test/ScopDetect/non-simple-memory-accesses.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; Verify that we do not model atomic memory accesses. We did not reason about
; how to handle them correctly and the Alias Set Tracker models some of them
diff --git a/polly/test/ScopDetect/non_affine_loop_condition.ll b/polly/test/ScopDetect/non_affine_loop_condition.ll
index f268442..63bd7b3 100644
--- a/polly/test/ScopDetect/non_affine_loop_condition.ll
+++ b/polly/test/ScopDetect/non_affine_loop_condition.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine-loops -polly-print-detect -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-allow-nonaffine-loops -polly-process-unprofitable=false -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=PROFIT
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
;
; void f(int *A) {
; for (int i = 0; i < 1024; i++) {
diff --git a/polly/test/ScopDetect/only-one-affine-loop.ll b/polly/test/ScopDetect/only-one-affine-loop.ll
index d6d50bb..1d36f4d 100644
--- a/polly/test/ScopDetect/only-one-affine-loop.ll
+++ b/polly/test/ScopDetect/only-one-affine-loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-process-unprofitable=false -polly-allow-nonaffine-loops -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable=false -polly-allow-nonaffine-loops '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; Even if we allow non-affine loops we can only model the outermost loop, all
; other loops are boxed in non-affine regions. However, the inner loops can be
diff --git a/polly/test/ScopDetect/only_func_flag.ll b/polly/test/ScopDetect/only_func_flag.ll
index d465cd0..4742375 100644
--- a/polly/test/ScopDetect/only_func_flag.ll
+++ b/polly/test/ScopDetect/only_func_flag.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-only-func=f,g -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-only-func=f,g '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that the flag `-polly-only-func` limits analysis to `f` and `g`.
;
diff --git a/polly/test/ScopDetect/only_func_flag_regex.ll b/polly/test/ScopDetect/only_func_flag_regex.ll
index e667579..2ad22c9 100644
--- a/polly/test/ScopDetect/only_func_flag_regex.ll
+++ b/polly/test/ScopDetect/only_func_flag_regex.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-only-func=f.*,g.* -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-only-func=f.*,g.* '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that the flag `-polly-only-func` works with regexes.
;
diff --git a/polly/test/ScopDetect/parametric-multiply-in-scev-2.ll b/polly/test/ScopDetect/parametric-multiply-in-scev-2.ll
index fc957a7..271825a 100644
--- a/polly/test/ScopDetect/parametric-multiply-in-scev-2.ll
+++ b/polly/test/ScopDetect/parametric-multiply-in-scev-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; CHECK-NOT: Valid Region
diff --git a/polly/test/ScopDetect/parametric-multiply-in-scev.ll b/polly/test/ScopDetect/parametric-multiply-in-scev.ll
index 9c6e5ccc8f..2ab8997 100644
--- a/polly/test/ScopDetect/parametric-multiply-in-scev.ll
+++ b/polly/test/ScopDetect/parametric-multiply-in-scev.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; foo(float *A, long n, long k) {
; if (true)
diff --git a/polly/test/ScopDetect/phi_with_multi_exiting_edges.ll b/polly/test/ScopDetect/phi_with_multi_exiting_edges.ll
index 054de16..248bb43 100644
--- a/polly/test/ScopDetect/phi_with_multi_exiting_edges.ll
+++ b/polly/test/ScopDetect/phi_with_multi_exiting_edges.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; Region with an exit node that has a PHI node multiple incoming edges from
; inside the region. Motivation for supporting such cases in Polly.
diff --git a/polly/test/ScopDetect/profitability-large-basic-blocks.ll b/polly/test/ScopDetect/profitability-large-basic-blocks.ll
index e1650fe..d74185b 100644
--- a/polly/test/ScopDetect/profitability-large-basic-blocks.ll
+++ b/polly/test/ScopDetect/profitability-large-basic-blocks.ll
@@ -1,12 +1,12 @@
-; RUN: opt %loadPolly -polly-process-unprofitable=false \
+; RUN: opt %loadNPMPolly -polly-process-unprofitable=false \
; RUN: -polly-detect-profitability-min-per-loop-insts=40 \
-; RUN: -polly-print-detect -disable-output < %s | FileCheck %s -check-prefix=PROFITABLE
+; RUN: '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PROFITABLE
-; RUN: opt %loadPolly -polly-process-unprofitable=true \
-; RUN: -polly-print-detect -disable-output < %s | FileCheck %s -check-prefix=PROFITABLE
+; RUN: opt %loadNPMPolly -polly-process-unprofitable=true \
+; RUN: '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PROFITABLE
-; RUN: opt %loadPolly -polly-process-unprofitable=false \
-; RUN: -polly-print-detect -disable-output < %s | FileCheck %s -check-prefix=UNPROFITABLE
+; RUN: opt %loadNPMPolly -polly-process-unprofitable=false \
+; RUN: '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=UNPROFITABLE
; UNPROFITABLE-NOT: Valid Region for Scop:
; PROFITABLE: Valid Region for Scop:
diff --git a/polly/test/ScopDetect/profitability-two-nested-loops.ll b/polly/test/ScopDetect/profitability-two-nested-loops.ll
index 525f91c..0291d3b 100644
--- a/polly/test/ScopDetect/profitability-two-nested-loops.ll
+++ b/polly/test/ScopDetect/profitability-two-nested-loops.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; CHECK: Valid Region for Scop: next => bb3
;
diff --git a/polly/test/ScopDetect/remove_all_children.ll b/polly/test/ScopDetect/remove_all_children.ll
index 6d5097b..d95e9bd 100644
--- a/polly/test/ScopDetect/remove_all_children.ll
+++ b/polly/test/ScopDetect/remove_all_children.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
diff --git a/polly/test/ScopDetect/report-scop-location.ll b/polly/test/ScopDetect/report-scop-location.ll
index 750699c..a99a2ef 100644
--- a/polly/test/ScopDetect/report-scop-location.ll
+++ b/polly/test/ScopDetect/report-scop-location.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-detect -polly-report -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -polly-report -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-i64:64-f80:128-s:64-n8:16:32:64-S128"
; Function Attrs: nounwind uwtable
diff --git a/polly/test/ScopDetect/restrict-undef-size-scopdetect.ll b/polly/test/ScopDetect/restrict-undef-size-scopdetect.ll
index e94f1e7..f49190b3 100644
--- a/polly/test/ScopDetect/restrict-undef-size-scopdetect.ll
+++ b/polly/test/ScopDetect/restrict-undef-size-scopdetect.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; CHECK-NOT: Valid Region for Scop:
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/ScopDetect/run_time_alias_check.ll b/polly/test/ScopDetect/run_time_alias_check.ll
index 672f3df..74cbedb 100644
--- a/polly/test/ScopDetect/run_time_alias_check.ll
+++ b/polly/test/ScopDetect/run_time_alias_check.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
diff --git a/polly/test/ScopDetect/scev_remove_max.ll b/polly/test/ScopDetect/scev_remove_max.ll
index 5353e06b..caf55bf 100644
--- a/polly/test/ScopDetect/scev_remove_max.ll
+++ b/polly/test/ScopDetect/scev_remove_max.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-detect < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' < %s
; This test case helps to determine whether SCEVRemoveMax::remove produces
; an infinite loop and a segmentation fault, if it processes, for example,
diff --git a/polly/test/ScopDetect/sequential_loops.ll b/polly/test/ScopDetect/sequential_loops.ll
index e6ac38a..4a84f35 100644
--- a/polly/test/ScopDetect/sequential_loops.ll
+++ b/polly/test/ScopDetect/sequential_loops.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
@@ -13,7 +13,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; }
define void @f1(ptr %A, i64 %N) nounwind {
-; CHECK-LABEL: 'Polly - Detect static control parts (SCoPs)' for function 'f1'
+; CHECK-LABEL: Detected Scops in Function f1
entry:
fence seq_cst
br label %for.i.1
@@ -60,7 +60,7 @@ return:
; }
define void @f2(ptr %A, i64 %N) nounwind {
-; CHECK-LABEL: 'Polly - Detect static control parts (SCoPs)' for function 'f2'
+; CHECK-LABEL: Detected Scops in Function f2
entry:
fence seq_cst
br label %for.i.1
diff --git a/polly/test/ScopDetect/simple_loop.ll b/polly/test/ScopDetect/simple_loop.ll
index c8ed89a..33823b2 100644
--- a/polly/test/ScopDetect/simple_loop.ll
+++ b/polly/test/ScopDetect/simple_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; void f(long A[], long N) {
; long i;
diff --git a/polly/test/ScopDetect/simple_loop_non_single_entry.ll b/polly/test/ScopDetect/simple_loop_non_single_entry.ll
index 22adec5..1bba2c2 100644
--- a/polly/test/ScopDetect/simple_loop_non_single_entry.ll
+++ b/polly/test/ScopDetect/simple_loop_non_single_entry.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; void f(long A[], long N) {
; long i;
diff --git a/polly/test/ScopDetect/simple_loop_non_single_exit.ll b/polly/test/ScopDetect/simple_loop_non_single_exit.ll
index 71ac830..93ec84e 100644
--- a/polly/test/ScopDetect/simple_loop_non_single_exit.ll
+++ b/polly/test/ScopDetect/simple_loop_non_single_exit.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; void f(long A[], long N) {
; long i;
diff --git a/polly/test/ScopDetect/simple_loop_non_single_exit_2.ll b/polly/test/ScopDetect/simple_loop_non_single_exit_2.ll
index d9915dc..33b0d8d 100644
--- a/polly/test/ScopDetect/simple_loop_non_single_exit_2.ll
+++ b/polly/test/ScopDetect/simple_loop_non_single_exit_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; void f(long A[], long N) {
; long i;
diff --git a/polly/test/ScopDetect/simple_loop_two_phi_nodes.ll b/polly/test/ScopDetect/simple_loop_two_phi_nodes.ll
index 867bd50..9b47b7c 100644
--- a/polly/test/ScopDetect/simple_loop_two_phi_nodes.ll
+++ b/polly/test/ScopDetect/simple_loop_two_phi_nodes.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; void f(long A[], long N) {
; long i;
diff --git a/polly/test/ScopDetect/simple_loop_with_param.ll b/polly/test/ScopDetect/simple_loop_with_param.ll
index 1ae5c66..4a0a3ad 100644
--- a/polly/test/ScopDetect/simple_loop_with_param.ll
+++ b/polly/test/ScopDetect/simple_loop_with_param.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-detect -disable-output < %s | FileCheck %s -check-prefix=PHI
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PHI
; void f(long A[], long N, long *init_ptr) {
; long i, j;
diff --git a/polly/test/ScopDetect/simple_loop_with_param_2.ll b/polly/test/ScopDetect/simple_loop_with_param_2.ll
index 1a47506..670936b 100644
--- a/polly/test/ScopDetect/simple_loop_with_param_2.ll
+++ b/polly/test/ScopDetect/simple_loop_with_param_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; void f(long A[], int N, int *init_ptr) {
; long i, j;
diff --git a/polly/test/ScopDetect/simple_non_single_entry.ll b/polly/test/ScopDetect/simple_non_single_entry.ll
index a1995a4..6ace3b6 100644
--- a/polly/test/ScopDetect/simple_non_single_entry.ll
+++ b/polly/test/ScopDetect/simple_non_single_entry.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; void f(long A[], long N) {
; long i;
diff --git a/polly/test/ScopDetect/skip_function_attribute.ll b/polly/test/ScopDetect/skip_function_attribute.ll
index e85dbd4..2150a3e 100644
--- a/polly/test/ScopDetect/skip_function_attribute.ll
+++ b/polly/test/ScopDetect/skip_function_attribute.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; Verify polly skips this function
;
diff --git a/polly/test/ScopDetect/srem_with_parametric_divisor.ll b/polly/test/ScopDetect/srem_with_parametric_divisor.ll
index 4b5c3b0..66c3b04 100644
--- a/polly/test/ScopDetect/srem_with_parametric_divisor.ll
+++ b/polly/test/ScopDetect/srem_with_parametric_divisor.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK-NOT: Valid Region for Scop:
;
diff --git a/polly/test/ScopDetect/statistics.ll b/polly/test/ScopDetect/statistics.ll
index 64df3d0..a1dcebe 100644
--- a/polly/test/ScopDetect/statistics.ll
+++ b/polly/test/ScopDetect/statistics.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-detect -stats -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -stats -disable-output < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
diff --git a/polly/test/ScopDetect/switch-in-loop-patch.ll b/polly/test/ScopDetect/switch-in-loop-patch.ll
index ab4729f..2f9b670 100644
--- a/polly/test/ScopDetect/switch-in-loop-patch.ll
+++ b/polly/test/ScopDetect/switch-in-loop-patch.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; CHECK-NOT: Valid
diff --git a/polly/test/ScopDetectionDiagnostics/ReportAlias-01.ll b/polly/test/ScopDetectionDiagnostics/ReportAlias-01.ll
index 97ba7f9..4ae86a9 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportAlias-01.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportAlias-01.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-use-runtime-alias-checks=false -pass-remarks-missed="polly-detect" -polly-detect-track-failures -polly-print-detect -disable-output < %s 2>&1| FileCheck %s
+; RUN: opt %loadNPMPolly -polly-use-runtime-alias-checks=false -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1| FileCheck %s
;void f(int A[], int B[]) {
; for (int i=0; i<42; i++)
diff --git a/polly/test/ScopDetectionDiagnostics/ReportEntry.ll b/polly/test/ScopDetectionDiagnostics/ReportEntry.ll
index fc21e19..adb14b5 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportEntry.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportEntry.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-detect -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s
; CHECK: remark: <unknown>:0:0: Scop contains function entry (not yet supported).
diff --git a/polly/test/ScopDetectionDiagnostics/ReportFuncCall-01.ll b/polly/test/ScopDetectionDiagnostics/ReportFuncCall-01.ll
index abace4b..428a7cf8 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportFuncCall-01.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportFuncCall-01.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -pass-remarks-missed="polly-detect" -polly-detect-track-failures -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; #define N 1024
; double invalidCall(double A[N]);
diff --git a/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegion.ll b/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegion.ll
index 8368a68..d22c3b6 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegion.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
;void foo(int a, int b) {
diff --git a/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegionWithoutDebugLoc.ll b/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegionWithoutDebugLoc.ll
index 82c6c33..2bc515e 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegionWithoutDebugLoc.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegionWithoutDebugLoc.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-detect -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s
; CHECK: remark: <unknown>:0:0: Irreducible region encountered in control flow.
diff --git a/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll b/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll
index 35986b5..cb91300 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll
@@ -1,15 +1,15 @@
-; RUN: opt %loadPolly \
+; RUN: opt %loadNPMPolly \
; RUN: -pass-remarks-missed="polly-detect" -polly-detect-track-failures \
-; RUN: -polly-allow-nonaffine-loops=false -polly-print-detect -disable-output \
+; RUN: -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output \
; RUN: < %s 2>&1| FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
-; RUN: opt %loadPolly \
+; RUN: opt %loadNPMPolly \
; RUN: -pass-remarks-missed="polly-detect" -polly-detect-track-failures \
-; RUN: -polly-allow-nonaffine-loops=true -polly-print-detect -disable-output \
+; RUN: -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>' -disable-output \
; RUN: < %s 2>&1| FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
-; RUN: opt %loadPolly -pass-remarks-missed="polly-detect" \
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" \
; RUN: -polly-process-unprofitable=false \
; RUN: -polly-detect-track-failures -polly-allow-nonaffine-loops=true \
-; RUN: -polly-allow-nonaffine -polly-print-detect -disable-output < %s 2>&1 \
+; RUN: -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 \
; RUN: | FileCheck %s --check-prefix=ALLOWNONAFFINEALL
; void f(int A[], int n) {
diff --git a/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll b/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll
index 5dbeade..9202809 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll
@@ -4,8 +4,8 @@
; the PostDominatorTree. Infinite loops are postdominated only by the virtual
; root, which causes them not to appear in regions in ScopDetection anymore.
-; RUN: opt %loadPolly -pass-remarks-missed="polly-detect" -polly-allow-nonaffine-loops -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadPolly -pass-remarks-missed="polly-detect" -polly-allow-nonaffine-loops=false -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-allow-nonaffine-loops '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; void func (int param0, int N, int *A)
; {
diff --git a/polly/test/ScopDetectionDiagnostics/ReportMultipleNonAffineAccesses.ll b/polly/test/ScopDetectionDiagnostics/ReportMultipleNonAffineAccesses.ll
index 634b63e..dd95bd6 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportMultipleNonAffineAccesses.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportMultipleNonAffineAccesses.ll
@@ -1,9 +1,9 @@
-; RUN: opt %loadPolly -basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures -polly-print-detect -disable-output < %s 2>&1| FileCheck %s
-; RUN: opt %loadPolly -basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures -polly-print-detect -polly-delinearize=false -polly-detect-keep-going -disable-output < %s 2>&1| FileCheck %s -check-prefix=ALL
-; RUN: opt %loadPolly -basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures -polly-print-detect -disable-output < %s 2>&1| FileCheck %s -check-prefix=DELIN
-; RUN: opt %loadPolly -basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures -polly-print-detect -polly-detect-keep-going -disable-output < %s 2>&1| FileCheck %s -check-prefix=DELIN-ALL
-; RUN: opt %loadPolly -basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures -polly-print-detect -polly-allow-nonaffine -disable-output < %s 2>&1| FileCheck %s -check-prefix=NONAFFINE
-; RUN: opt %loadPolly -basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures -polly-print-detect -polly-allow-nonaffine -disable-output < %s 2>&1| FileCheck %s -check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1| FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -polly-delinearize=false -polly-detect-keep-going -disable-output < %s 2>&1| FileCheck %s -check-prefix=ALL
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1| FileCheck %s -check-prefix=DELIN
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -polly-detect-keep-going -disable-output < %s 2>&1| FileCheck %s -check-prefix=DELIN-ALL
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -polly-allow-nonaffine -disable-output < %s 2>&1| FileCheck %s -check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -polly-allow-nonaffine -disable-output < %s 2>&1| FileCheck %s -check-prefix=NONAFFINE
; 1 void manyaccesses(float A[restrict], long n, float B[restrict][n])
; 2 {
diff --git a/polly/test/ScopDetectionDiagnostics/ReportNonAffineAccess-01.ll b/polly/test/ScopDetectionDiagnostics/ReportNonAffineAccess-01.ll
index 23d8c9c..832045f 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportNonAffineAccess-01.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportNonAffineAccess-01.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -pass-remarks-missed="polly-detect" -polly-detect-track-failures -polly-print-detect -disable-output < %s 2>&1| FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1| FileCheck %s
; void f(int A[]) {
; for(int i=0; i<42; ++i)
diff --git a/polly/test/ScopDetectionDiagnostics/ReportUnprofitable.ll b/polly/test/ScopDetectionDiagnostics/ReportUnprofitable.ll
index d35b7a2..b951487 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportUnprofitable.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportUnprofitable.ll
@@ -1,9 +1,9 @@
-; RUN: opt %loadPolly -pass-remarks-missed="polly-detect" \
-; RUN: -polly-detect-track-failures -polly-print-detect -disable-output \
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" \
+; RUN: -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output \
; RUN: -polly-process-unprofitable=false < %s 2>&1| FileCheck %s
-; RUN: opt %loadPolly -pass-remarks-missed="polly-detect" \
-; RUN: -polly-detect-track-failures -polly-print-detect -disable-output \
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" \
+; RUN: -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output \
; RUN: -polly-process-unprofitable=false < %s 2>&1 -pass-remarks-output=%t.yaml
; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/ScopDetectionDiagnostics/ReportUnreachableInExit.ll b/polly/test/ScopDetectionDiagnostics/ReportUnreachableInExit.ll
index 6c868db..d110cfe 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportUnreachableInExit.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportUnreachableInExit.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s \
; RUN: -pass-remarks-missed="polly-detect" 2>&1 | FileCheck %s
; void f(long A[], long N) {
diff --git a/polly/test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll b/polly/test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll
index a82f56b..c2efd61 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -pass-remarks-missed="polly-detect" -polly-detect-track-failures -polly-print-detect -disable-output < %s 2>&1| FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1| FileCheck %s
; struct b {
; double **b;
diff --git a/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll b/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll
index a0f2704..3cdeed1 100644
--- a/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll
+++ b/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -pass-remarks-missed="polly-detect" -polly-detect-track-failures -polly-detect -disable-output 2>&1 < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output 2>&1 < %s | FileCheck %s -match-full-lines
;
; Derived from test-suite/MultiSource/Benchmarks/BitBench/uuencode/uuencode.c
;
diff --git a/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop-2.ll b/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop-2.ll
index 667ed7d..4a9a200 100644
--- a/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop-2.ll
+++ b/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s
; CHECK: remark: <unknown>:0:0: Loop cannot be handled because not all latches are part of loop region.
diff --git a/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop.ll b/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop.ll
index 9dce56a..61ff033 100644
--- a/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop.ll
+++ b/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s
; CHECK: remark: <unknown>:0:0: Loop cannot be handled because not all latches are part of loop region.
; CHECK: remark: <unknown>:0:0: Loop cannot be handled because not all latches are part of loop region.
diff --git a/polly/test/ScopInfo/20110312-Fail-without-basicaa.ll b/polly/test/ScopInfo/20110312-Fail-without-basicaa.ll
index 94dd582..c5efec3 100644
--- a/polly/test/ScopInfo/20110312-Fail-without-basicaa.ll
+++ b/polly/test/ScopInfo/20110312-Fail-without-basicaa.ll
@@ -1,5 +1,5 @@
; This should be run without alias analysis enabled.
-;RUN: opt %loadPolly -polly-scops -disable-output < %s
+;RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32"
define i32 @main() nounwind {
diff --git a/polly/test/ScopInfo/20111108-Parameter-not-detected.ll b/polly/test/ScopInfo/20111108-Parameter-not-detected.ll
index f80177c..81c7efb 100644
--- a/polly/test/ScopInfo/20111108-Parameter-not-detected.ll
+++ b/polly/test/ScopInfo/20111108-Parameter-not-detected.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
declare void @foo()
diff --git a/polly/test/ScopInfo/2012-03-16-Crash-because-of-unsigned-in-scev.ll b/polly/test/ScopInfo/2012-03-16-Crash-because-of-unsigned-in-scev.ll
index b55d635..5abf8ff 100644
--- a/polly/test/ScopInfo/2012-03-16-Crash-because-of-unsigned-in-scev.ll
+++ b/polly/test/ScopInfo/2012-03-16-Crash-because-of-unsigned-in-scev.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-a0:0-n32"
diff --git a/polly/test/ScopInfo/2015-10-04-Crash-in-domain-generation.ll b/polly/test/ScopInfo/2015-10-04-Crash-in-domain-generation.ll
index d4d931f..d16ba45 100644
--- a/polly/test/ScopInfo/2015-10-04-Crash-in-domain-generation.ll
+++ b/polly/test/ScopInfo/2015-10-04-Crash-in-domain-generation.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine-loops -polly-scops -disable-output < %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=print<polly-function-scops>' -disable-output < %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/ScopInfo/Alias-0.ll b/polly/test/ScopInfo/Alias-0.ll
index 0fc4ad9..ebbe744 100644
--- a/polly/test/ScopInfo/Alias-0.ll
+++ b/polly/test/ScopInfo/Alias-0.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA
-; RUN: opt %loadPolly -polly-print-scops -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA
; REQUIRES: asserts
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/ScopInfo/Alias-1.ll b/polly/test/ScopInfo/Alias-1.ll
index eab8c06..b1711c2 100644
--- a/polly/test/ScopInfo/Alias-1.ll
+++ b/polly/test/ScopInfo/Alias-1.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA
-; RUN: opt %loadPolly -polly-print-scops -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA
; REQUIRES: asserts
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/ScopInfo/Alias-2.ll b/polly/test/ScopInfo/Alias-2.ll
index 64f1e0b..b94f130 100644
--- a/polly/test/ScopInfo/Alias-2.ll
+++ b/polly/test/ScopInfo/Alias-2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA
-; RUN: opt %loadPolly -polly-print-scops -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA
; REQUIRES: asserts
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/ScopInfo/Alias-3.ll b/polly/test/ScopInfo/Alias-3.ll
index 5e9b94e..af78165 100644
--- a/polly/test/ScopInfo/Alias-3.ll
+++ b/polly/test/ScopInfo/Alias-3.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA
-; RUN: opt %loadPolly -polly-print-scops -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA
; REQUIRES: asserts
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/ScopInfo/Alias-4.ll b/polly/test/ScopInfo/Alias-4.ll
index 4d5a91a..fe651c8 100644
--- a/polly/test/ScopInfo/Alias-4.ll
+++ b/polly/test/ScopInfo/Alias-4.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -disable-basic-aa -polly-print-scops -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA
-; RUN: opt %loadPolly -disable-basic-aa -polly-print-scops -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA
+; RUN: opt %loadNPMPolly -aa-pipeline= '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA
+; RUN: opt %loadNPMPolly -aa-pipeline= '-passes=print<polly-detect>,print<polly-function-scops>' -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA
; REQUIRES: asserts
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/ScopInfo/BoundChecks/single-loop.ll b/polly/test/ScopInfo/BoundChecks/single-loop.ll
index bc96c90..10a0a58 100644
--- a/polly/test/ScopInfo/BoundChecks/single-loop.ll
+++ b/polly/test/ScopInfo/BoundChecks/single-loop.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
;
; This only works after the post-dominator tree has been fixed.
;
diff --git a/polly/test/ScopInfo/BoundChecks/two-loops.ll b/polly/test/ScopInfo/BoundChecks/two-loops.ll
index 14e07f4..c85ac5b 100644
--- a/polly/test/ScopInfo/BoundChecks/two-loops.ll
+++ b/polly/test/ScopInfo/BoundChecks/two-loops.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output< %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output< %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
;
; This only works after the post-dominator tree has been fixed.
; XFAIL: *
diff --git a/polly/test/ScopInfo/NonAffine/div_backedge.ll b/polly/test/ScopInfo/NonAffine/div_backedge.ll
index a6aca03..3b0c673 100644
--- a/polly/test/ScopInfo/NonAffine/div_backedge.ll
+++ b/polly/test/ScopInfo/NonAffine/div_backedge.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void foo(float *A) {
; for (long i = 1;; i++) {
diff --git a/polly/test/ScopInfo/NonAffine/div_domain.ll b/polly/test/ScopInfo/NonAffine/div_domain.ll
index f61c4eb..34a5cec 100644
--- a/polly/test/ScopInfo/NonAffine/div_domain.ll
+++ b/polly/test/ScopInfo/NonAffine/div_domain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void foo(float *A) {
; for (long i = 0; i < 16; i++) {
diff --git a/polly/test/ScopInfo/NonAffine/invariant_loads_dependent_in_non_affine_region.ll b/polly/test/ScopInfo/NonAffine/invariant_loads_dependent_in_non_affine_region.ll
index f5d63df..7d02fae 100644
--- a/polly/test/ScopInfo/NonAffine/invariant_loads_dependent_in_non_affine_region.ll
+++ b/polly/test/ScopInfo/NonAffine/invariant_loads_dependent_in_non_affine_region.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-invariant-load-hoisting=true -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int *A, int *B, int *C) {
; for (int i = 0; i < 1000; i++)
diff --git a/polly/test/ScopInfo/NonAffine/modulo_backedge.ll b/polly/test/ScopInfo/NonAffine/modulo_backedge.ll
index dec63ca..d5c808d 100644
--- a/polly/test/ScopInfo/NonAffine/modulo_backedge.ll
+++ b/polly/test/ScopInfo/NonAffine/modulo_backedge.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Domain :=
; CHECK: { Stmt_for_body[i0] : 0 <= i0 <= 6 };
diff --git a/polly/test/ScopInfo/NonAffine/modulo_domain.ll b/polly/test/ScopInfo/NonAffine/modulo_domain.ll
index f5ebec2..13fe53f 100644
--- a/polly/test/ScopInfo/NonAffine/modulo_domain.ll
+++ b/polly/test/ScopInfo/NonAffine/modulo_domain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; TODO: The new domain generation cannot handle modulo domain constraints,
; hence modulo handling has been disabled completely. Once this is
diff --git a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_1.ll b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_1.ll
index 837d9b2..2b8427d 100644
--- a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_1.ll
+++ b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_1.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCALAR
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-process-unprofitable=false -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=PROFIT
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCALAR
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-process-unprofitable=false '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PROFIT
;
; SCALAR: Function: f
; SCALAR-NEXT: Region: %bb1---%bb13
diff --git a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_2.ll b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_2.ll
index e39569ab..30f756e 100644
--- a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_2.ll
+++ b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_2.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=INNERMOST
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=INNERMOST
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=ALL
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALL
;
; Here we have a non-affine loop (in the context of the loop nest)
; and also a non-affine access (A[k]). While we can always model the
diff --git a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_3.ll b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_3.ll
index 75dd7ac..6dacd71 100644
--- a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_3.ll
+++ b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_3.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=INNERMOST
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=INNERMOST
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=ALL
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALL
;
; Here we have a non-affine loop (in the context of the loop nest)
; and also a non-affine access (A[k]). While we can always model the
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_access_with_range_2.ll b/polly/test/ScopInfo/NonAffine/non_affine_access_with_range_2.ll
index 34b0493..8a13f79 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_access_with_range_2.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_access_with_range_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int *A) {
; for (int i = 0; i < 128; i++)
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_but_sdiv.ll b/polly/test/ScopInfo/NonAffine/non_affine_but_sdiv.ll
index 9955c88..1e70d2c 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_but_sdiv.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_but_sdiv.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Statements {
; CHECK-NEXT: Stmt_for_body
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_but_srem.ll b/polly/test/ScopInfo/NonAffine/non_affine_but_srem.ll
index b194ee7..dcfaa92 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_but_srem.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_but_srem.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void pos(float *A, long n) {
; for (long i = 0; i < 100; i++)
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_conditional_nested.ll b/polly/test/ScopInfo/NonAffine/non_affine_conditional_nested.ll
index 1f55530..24bfe60 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_conditional_nested.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_conditional_nested.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine-branches -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int *A) {
; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_affine_loop.ll b/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_affine_loop.ll
index 3511362..931ad36 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_affine_loop.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_affine_loop.ll
@@ -1,11 +1,11 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine-branches \
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches \
; RUN: -polly-invariant-load-hoisting=true \
; RUN: -polly-allow-nonaffine-loops=true \
-; RUN: -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=INNERMOST
-; RUN: opt %loadPolly -polly-allow-nonaffine \
+; RUN: '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine \
; RUN: -polly-invariant-load-hoisting=true \
; RUN: -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true \
-; RUN: -polly-print-scops -disable-output < %s | FileCheck %s \
+; RUN: '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s \
; RUN: --check-prefix=ALL
;
; Negative test for INNERMOST.
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_non_affine_loop.ll b/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_non_affine_loop.ll
index c2e1e46..37b51ceb 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_non_affine_loop.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_non_affine_loop.ll
@@ -1,16 +1,16 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine-branches \
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches \
; RUN: -polly-invariant-load-hoisting=true \
; RUN: -polly-allow-nonaffine-loops=true \
-; RUN: -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=INNERMOST
-; RUN: opt %loadPolly -polly-allow-nonaffine \
+; RUN: '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine \
; RUN: -polly-invariant-load-hoisting=true \
; RUN: -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true \
-; RUN: -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=ALL
-; RUN: opt %loadPolly -polly-allow-nonaffine \
+; RUN: '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALL
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine \
; RUN: -polly-invariant-load-hoisting=true \
; RUN: -polly-process-unprofitable=false \
; RUN: -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true \
-; RUN: -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=PROFIT
+; RUN: '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
;
; Negative test for INNERMOST.
; At the moment we will optimistically assume A[i] in the conditional before the inner
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_float_compare.ll b/polly/test/ScopInfo/NonAffine/non_affine_float_compare.ll
index c62447b..7bfd7f8 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_float_compare.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_float_compare.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine-branches -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(float *A) {
; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_loop_condition.ll b/polly/test/ScopInfo/NonAffine/non_affine_loop_condition.ll
index 873b44b..fc779d5 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_loop_condition.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_loop_condition.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-process-unprofitable=false -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=PROFIT
-; RUN: opt %loadPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-detect-reductions=false -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=NO-REDUCTION
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-process-unprofitable=false '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-detect-reductions=false '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NO-REDUCTION
;
; void f(int *A, int *C) {
; for (int i = 0; i < 1024; i++) {
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll b/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll
index 127bf80b..79b61ec 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-unprofitable-scalar-accs=true -polly-process-unprofitable=false -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=PROFIT
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-unprofitable-scalar-accs=true -polly-process-unprofitable=false '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
;
; Verify that we over-approximate the read access of A[j] in the last statement as j is
; computed in a non-affine loop we do not model.
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_parametric_loop.ll b/polly/test/ScopInfo/NonAffine/non_affine_parametric_loop.ll
index de011e2..d33befe 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_parametric_loop.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_parametric_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; void foo(long n, double A[], int INDEX[]) {
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_region_guaranteed_non-entry.ll b/polly/test/ScopInfo/NonAffine/non_affine_region_guaranteed_non-entry.ll
index 7303b4e..77c2df4 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_region_guaranteed_non-entry.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_region_guaranteed_non-entry.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine-loops -polly-detect -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -polly-detect '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; The SCoP contains a loop with multiple exit blocks (BBs after leaving
; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/NonAffine/whole-scop-non-affine-subregion-in-loop.ll b/polly/test/ScopInfo/NonAffine/whole-scop-non-affine-subregion-in-loop.ll
index 4f54d03..9ed340d 100644
--- a/polly/test/ScopInfo/NonAffine/whole-scop-non-affine-subregion-in-loop.ll
+++ b/polly/test/ScopInfo/NonAffine/whole-scop-non-affine-subregion-in-loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s
;
; Regression test that triggered a memory leak at some point (24947).
;
diff --git a/polly/test/ScopInfo/aliasing_conditional_alias_groups_1.ll b/polly/test/ScopInfo/aliasing_conditional_alias_groups_1.ll
index dc59fbf..cbd024b 100644
--- a/polly/test/ScopInfo/aliasing_conditional_alias_groups_1.ll
+++ b/polly/test/ScopInfo/aliasing_conditional_alias_groups_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that there is no alias group because we access either A or B, never both.
;
diff --git a/polly/test/ScopInfo/aliasing_conditional_alias_groups_2.ll b/polly/test/ScopInfo/aliasing_conditional_alias_groups_2.ll
index a19d60d..3858d8a 100644
--- a/polly/test/ScopInfo/aliasing_conditional_alias_groups_2.ll
+++ b/polly/test/ScopInfo/aliasing_conditional_alias_groups_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that we create two alias groups since the minimal/maximal accesses
; depend on %b.
diff --git a/polly/test/ScopInfo/aliasing_dead_access.ll b/polly/test/ScopInfo/aliasing_dead_access.ll
index 2a725cf..7baa3dc 100644
--- a/polly/test/ScopInfo/aliasing_dead_access.ll
+++ b/polly/test/ScopInfo/aliasing_dead_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that we do not create a SCoP if there is no statement executed.
;
diff --git a/polly/test/ScopInfo/aliasing_many_arrays_to_compare.ll b/polly/test/ScopInfo/aliasing_many_arrays_to_compare.ll
index 937d4ad..7265aab 100644
--- a/polly/test/ScopInfo/aliasing_many_arrays_to_compare.ll
+++ b/polly/test/ScopInfo/aliasing_many_arrays_to_compare.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output \
-; RUN: < %s | FileCheck %s --check-prefix=FOUND
-; RUN: opt %loadPolly -polly-print-scops -disable-output \
-; RUN: -polly-rtc-max-arrays-per-group=3 < %s | FileCheck %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output \
+; RUN: < %s 2>&1 | FileCheck %s --check-prefix=FOUND
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output \
+; RUN: -polly-rtc-max-arrays-per-group=3 < %s 2>&1 | FileCheck %s \
; RUN: --check-prefix=IGNORED
;
; FOUND: Function: foo
diff --git a/polly/test/ScopInfo/aliasing_many_read_only_acesses.ll b/polly/test/ScopInfo/aliasing_many_read_only_acesses.ll
index c22cfe5..d66a10b 100644
--- a/polly/test/ScopInfo/aliasing_many_read_only_acesses.ll
+++ b/polly/test/ScopInfo/aliasing_many_read_only_acesses.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Assumed Context:
; CHECK-NEXT: { : }
diff --git a/polly/test/ScopInfo/aliasing_multiple_alias_groups.ll b/polly/test/ScopInfo/aliasing_multiple_alias_groups.ll
index 16cb3dc..9943802 100644
--- a/polly/test/ScopInfo/aliasing_multiple_alias_groups.ll
+++ b/polly/test/ScopInfo/aliasing_multiple_alias_groups.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=NOAA
-; RUN: opt %loadPolly -polly-print-scops -disable-output -tbaa < %s | FileCheck %s --check-prefix=TBAA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -aa-pipeline= < %s 2>&1 | FileCheck %s --check-prefix=NOAA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -aa-pipeline=tbaa < %s 2>&1 | FileCheck %s --check-prefix=TBAA
;
; void jd(int *Int0, int *Int1, float *Float0, float *Float1) {
; for (int i = 0; i < 1024; i++) {
diff --git a/polly/test/ScopInfo/aliasing_with_non_affine_access.ll b/polly/test/ScopInfo/aliasing_with_non_affine_access.ll
index 056b644..900d5d4 100644
--- a/polly/test/ScopInfo/aliasing_with_non_affine_access.ll
+++ b/polly/test/ScopInfo/aliasing_with_non_affine_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -polly-process-unprofitable -polly-allow-nonaffine -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-process-unprofitable -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s
;
; @test1
; Make sure we generate the correct aliasing check for a fixed-size memset operation.
diff --git a/polly/test/ScopInfo/allow-all-parameters-dereferencable.ll b/polly/test/ScopInfo/allow-all-parameters-dereferencable.ll
index d170a50..cb06e35 100644
--- a/polly/test/ScopInfo/allow-all-parameters-dereferencable.ll
+++ b/polly/test/ScopInfo/allow-all-parameters-dereferencable.ll
@@ -1,14 +1,14 @@
-; RUN: opt %loadPolly -disable-output -polly-invariant-load-hoisting \
+; RUN: opt %loadNPMPolly -disable-output -polly-invariant-load-hoisting \
; RUN: -polly-allow-dereference-of-all-function-parameters \
-; RUN: -polly-print-scops < %s | FileCheck %s --check-prefix=SCOP
+; RUN: '-passes=print<polly-function-scops>' < %s 2>&1 | FileCheck %s --check-prefix=SCOP
-; RUN: opt %loadPolly -S -polly-invariant-load-hoisting \
-; RUN: -polly-codegen < %s | FileCheck %s --check-prefix=CODE-RTC
+; RUN: opt %loadNPMPolly -S -polly-invariant-load-hoisting \
+; RUN: -passes=polly-codegen < %s 2>&1 | FileCheck %s --check-prefix=CODE-RTC
-; RUN: opt %loadPolly -S -polly-invariant-load-hoisting \
+; RUN: opt %loadNPMPolly -S -polly-invariant-load-hoisting \
; RUN: -polly-allow-dereference-of-all-function-parameters \
-; RUN: -polly-codegen < %s | FileCheck %s --check-prefix=CODE
+; RUN: -passes=polly-codegen < %s 2>&1 | FileCheck %s --check-prefix=CODE
; SCOP: Function: hoge
; SCOP-NEXT: Region: %bb15---%bb37
diff --git a/polly/test/ScopInfo/assume_gep_bounds.ll b/polly/test/ScopInfo/assume_gep_bounds.ll
index d0ce471..bd14e38 100644
--- a/polly/test/ScopInfo/assume_gep_bounds.ll
+++ b/polly/test/ScopInfo/assume_gep_bounds.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-function-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; void foo(float A[][20][30], long n, long m, long p) {
; for (long i = 0; i < n; i++)
diff --git a/polly/test/ScopInfo/assume_gep_bounds_2.ll b/polly/test/ScopInfo/assume_gep_bounds_2.ll
index e327195..7a8c187 100644
--- a/polly/test/ScopInfo/assume_gep_bounds_2.ll
+++ b/polly/test/ScopInfo/assume_gep_bounds_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output < %s \
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \
; RUN: -polly-precise-inbounds | FileCheck %s
;
; void foo(float A[restrict][20], float B[restrict][20], long n, long m,
diff --git a/polly/test/ScopInfo/assume_gep_bounds_many.ll b/polly/test/ScopInfo/assume_gep_bounds_many.ll
index 2614915..01fc12c 100644
--- a/polly/test/ScopInfo/assume_gep_bounds_many.ll
+++ b/polly/test/ScopInfo/assume_gep_bounds_many.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -disable-output -polly-print-scops -polly-ignore-aliasing \
-; RUN: < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -disable-output '-passes=print<polly-function-scops>' -polly-ignore-aliasing \
+; RUN: < %s 2>&1 | FileCheck %s
; CHECK: Assumed Context:
; CHECK-NEXT: [n1_a, n1_b, n1_c, n1_d, n2_a, n2_b, n2_c, n2_d, n3_a, n3_b, n3_c, n3_d, n4_a, n4_b, n4_c, n4_d, n5_a, n5_b, n5_c, n5_d, n6_a, n6_b, n6_c, n6_d, n7_a, n7_b, n7_c, n7_d, n8_a, n8_b, n8_c, n8_d, n9_a, n9_b, n9_c, n9_d, p1_b, p1_c, p1_d, p2_b, p2_c, p2_d, p3_b, p3_c, p3_d, p4_b, p4_c, p4_d, p5_b, p5_c, p5_d, p6_b, p6_c, p6_d, p7_b, p7_c, p7_d, p8_b, p8_c, p8_d, p9_b, p9_c, p9_d] -> { : p1_b >= n1_b and p1_c >= n1_c and p1_d >= n1_d and p2_b >= n2_b and p2_c >= n2_c and p2_d >= n2_d and p3_b >= n3_b and p3_c >= n3_c and p3_d >= n3_d and p4_b >= n4_b and p4_c >= n4_c and p4_d >= n4_d and p5_b >= n5_b and p5_c >= n5_c and p5_d >= n5_d and p6_b >= n6_b and p6_c >= n6_c and p6_d >= n6_d and p7_b >= n7_b and p7_c >= n7_c and p7_d >= n7_d and p8_b >= n8_b and p8_c >= n8_c and p8_d >= n8_d and p9_b >= n9_b and p9_c >= n9_c and p9_d >= n9_d }
diff --git a/polly/test/ScopInfo/avoid_new_parameters_from_geps.ll b/polly/test/ScopInfo/avoid_new_parameters_from_geps.ll
index 0e17eb1..3fb7a13 100644
--- a/polly/test/ScopInfo/avoid_new_parameters_from_geps.ll
+++ b/polly/test/ScopInfo/avoid_new_parameters_from_geps.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that we do not introduce a parameter here that is actually not needed.
;
diff --git a/polly/test/ScopInfo/bool-addrec.ll b/polly/test/ScopInfo/bool-addrec.ll
index 1924a4b..81fcade 100644
--- a/polly/test/ScopInfo/bool-addrec.ll
+++ b/polly/test/ScopInfo/bool-addrec.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -disable-output -polly-print-ast -polly-process-unprofitable < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -disable-output '-passes=print<polly-ast>' -polly-process-unprofitable < %s 2>&1 | FileCheck %s
; CHECK: for (int c0 = 0; c0 <= 19999; c0 += 1) {
; CHECK-NEXT: if (c0 % 2 == 0)
diff --git a/polly/test/ScopInfo/bounded_loop_assumptions.ll b/polly/test/ScopInfo/bounded_loop_assumptions.ll
index d472c75..5628092 100644
--- a/polly/test/ScopInfo/bounded_loop_assumptions.ll
+++ b/polly/test/ScopInfo/bounded_loop_assumptions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; The assumed context is tricky here as the equality test for the inner loop
; allows an "unbounded" loop trip count. We assume that does not happen, thus
diff --git a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-2.ll b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-2.ll
index 5c5f264..83743e4 100644
--- a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-2.ll
+++ b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-2.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | \
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | \
; RUN: FileCheck %s -check-prefix=DETECT
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | \
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | \
; RUN: FileCheck %s -check-prefix=SCOP
; DETECT: Valid Region for Scop: loop => barrier
diff --git a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-3.ll b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-3.ll
index d69d3a1..9685ba3 100644
--- a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-3.ll
+++ b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-3.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | \
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | \
; RUN: FileCheck %s -check-prefix=NONAFFINE
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output \
-; RUN: -polly-allow-nonaffine-branches=false < %s | \
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output \
+; RUN: -polly-allow-nonaffine-branches=false < %s 2>&1 | \
; RUN: FileCheck %s -check-prefix=NO-NONEAFFINE
; NONAFFINE: Statements {
diff --git a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations.ll b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations.ll
index 57918fa..f41e650 100644
--- a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations.ll
+++ b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | \
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | \
; RUN: FileCheck %s -check-prefix=NONAFFINE
-; RUN: opt %loadPolly -polly-print-scops -disable-output \
-; RUN: -polly-allow-nonaffine-branches=false < %s | \
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output \
+; RUN: -polly-allow-nonaffine-branches=false < %s 2>&1 | \
; RUN: FileCheck %s -check-prefix=NO-NONEAFFINE
; NONAFFINE-NOT: Statements
diff --git a/polly/test/ScopInfo/bug_2010_10_22.ll b/polly/test/ScopInfo/bug_2010_10_22.ll
index 7ba996b..71e7051 100644
--- a/polly/test/ScopInfo/bug_2010_10_22.ll
+++ b/polly/test/ScopInfo/bug_2010_10_22.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-scops -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/ScopInfo/bug_2011_1_5.ll b/polly/test/ScopInfo/bug_2011_1_5.ll
index 95c25f9..f4a24e0 100644
--- a/polly/test/ScopInfo/bug_2011_1_5.ll
+++ b/polly/test/ScopInfo/bug_2011_1_5.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-scops -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s
; Bug description: Alias Analysis thinks IntToPtrInst aliases with alloca instructions created by IndependentBlocks Pass.
; This will trigger the assertion when we are verifying the SCoP after IndependentBlocks.
diff --git a/polly/test/ScopInfo/bug_scev_not_fully_eval.ll b/polly/test/ScopInfo/bug_scev_not_fully_eval.ll
index 89d5f31..ed6bbafd 100644
--- a/polly/test/ScopInfo/bug_scev_not_fully_eval.ll
+++ b/polly/test/ScopInfo/bug_scev_not_fully_eval.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | not FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | not FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
@edge.8265 = external global [72 x i32], align 32 ; <ptr> [#uses=1]
diff --git a/polly/test/ScopInfo/cfg_consequences.ll b/polly/test/ScopInfo/cfg_consequences.ll
index 84f94b1..9161d3d 100644
--- a/polly/test/ScopInfo/cfg_consequences.ll
+++ b/polly/test/ScopInfo/cfg_consequences.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void consequences(int *A, int bool_cond, int lhs, int rhs) {
;
diff --git a/polly/test/ScopInfo/complex-branch-structure.ll b/polly/test/ScopInfo/complex-branch-structure.ll
index 24ebdcf..de79c22 100644
--- a/polly/test/ScopInfo/complex-branch-structure.ll
+++ b/polly/test/ScopInfo/complex-branch-structure.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -pass-remarks-analysis="polly-scops" -polly-print-scops \
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \
; RUN: -disable-output < %s 2>&1 | FileCheck %s
; We build a scop of the following form to check that the domain construction
diff --git a/polly/test/ScopInfo/complex-condition.ll b/polly/test/ScopInfo/complex-condition.ll
index 31d34b0..c3b8d2b 100644
--- a/polly/test/ScopInfo/complex-condition.ll
+++ b/polly/test/ScopInfo/complex-condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -pass-remarks-analysis="polly-scops" -polly-print-scops \
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \
; RUN: -polly-invariant-load-hoisting=true \
; RUN: -disable-output < %s 2>&1 | FileCheck %s
;
diff --git a/polly/test/ScopInfo/complex-expression.ll b/polly/test/ScopInfo/complex-expression.ll
index 1822c9d..6a6dde6 100644
--- a/polly/test/ScopInfo/complex-expression.ll
+++ b/polly/test/ScopInfo/complex-expression.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -pass-remarks-analysis="polly-scops" -polly-print-scops \
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \
; RUN: -polly-invariant-load-hoisting=true \
; RUN: -disable-output < %s 2>&1 | FileCheck %s
;
diff --git a/polly/test/ScopInfo/complex-loop-nesting.ll b/polly/test/ScopInfo/complex-loop-nesting.ll
index 97a9bfd..36cb078 100644
--- a/polly/test/ScopInfo/complex-loop-nesting.ll
+++ b/polly/test/ScopInfo/complex-loop-nesting.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; The SCoP contains a loop with multiple exit blocks (BBs after leaving
; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/complex-successor-structure-2.ll b/polly/test/ScopInfo/complex-successor-structure-2.ll
index 6bb7bb1..f4a78bf 100644
--- a/polly/test/ScopInfo/complex-successor-structure-2.ll
+++ b/polly/test/ScopInfo/complex-successor-structure-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -pass-remarks-analysis="polly-scops" -polly-print-scops \
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \
; RUN: -polly-invariant-load-hoisting=true \
; RUN: -disable-output < %s 2>&1 | FileCheck %s
diff --git a/polly/test/ScopInfo/complex-successor-structure-3.ll b/polly/test/ScopInfo/complex-successor-structure-3.ll
index 14c3fc1..6da1fe3 100644
--- a/polly/test/ScopInfo/complex-successor-structure-3.ll
+++ b/polly/test/ScopInfo/complex-successor-structure-3.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -disable-output -polly-print-scops \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -disable-output '-passes=print<polly-function-scops>' \
+; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
;
; Check that propagation of domains from A(X) to A(X+1) will keep the
; domains small and concise.
diff --git a/polly/test/ScopInfo/complex-successor-structure.ll b/polly/test/ScopInfo/complex-successor-structure.ll
index 3643440..6c87ba3 100644
--- a/polly/test/ScopInfo/complex-successor-structure.ll
+++ b/polly/test/ScopInfo/complex-successor-structure.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -pass-remarks-analysis="polly-scops" -polly-print-scops \
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \
; RUN: -polly-invariant-load-hoisting=true \
; RUN: -disable-output < %s 2>&1 | FileCheck %s
diff --git a/polly/test/ScopInfo/complex_domain_binary_condition.ll b/polly/test/ScopInfo/complex_domain_binary_condition.ll
index cec2685..6091e3b 100644
--- a/polly/test/ScopInfo/complex_domain_binary_condition.ll
+++ b/polly/test/ScopInfo/complex_domain_binary_condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -pass-remarks-analysis="polly-scops" -polly-scops \
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \
; RUN: -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Low complexity assumption: { : false }
diff --git a/polly/test/ScopInfo/complex_execution_context.ll b/polly/test/ScopInfo/complex_execution_context.ll
index 1642543..9880a1d 100644
--- a/polly/test/ScopInfo/complex_execution_context.ll
+++ b/polly/test/ScopInfo/complex_execution_context.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -pass-remarks-analysis="polly-scops" -polly-print-scops \
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \
; RUN: -polly-invariant-load-hoisting=true \
; RUN: -disable-output < %s 2>&1 | FileCheck %s
;
diff --git a/polly/test/ScopInfo/cond_constant_in_loop.ll b/polly/test/ScopInfo/cond_constant_in_loop.ll
index ef7d857..552fddc 100644
--- a/polly/test/ScopInfo/cond_constant_in_loop.ll
+++ b/polly/test/ScopInfo/cond_constant_in_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;void f(long a[], long N, long M) {
; long i, j, k;
diff --git a/polly/test/ScopInfo/cond_in_loop.ll b/polly/test/ScopInfo/cond_in_loop.ll
index 2d435f6..c06dcd9 100644
--- a/polly/test/ScopInfo/cond_in_loop.ll
+++ b/polly/test/ScopInfo/cond_in_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;void f(long a[], long N, long M) {
; long i, j, k;
diff --git a/polly/test/ScopInfo/condition-after-error-block-2.ll b/polly/test/ScopInfo/condition-after-error-block-2.ll
index 695d864..8c4b217 100644
--- a/polly/test/ScopInfo/condition-after-error-block-2.ll
+++ b/polly/test/ScopInfo/condition-after-error-block-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; Verify that we do not allow PHI nodes such as %phi, if they reference an error
; block and are used by anything other than a terminator instruction.
diff --git a/polly/test/ScopInfo/condition-after-error-block-before-scop.ll b/polly/test/ScopInfo/condition-after-error-block-before-scop.ll
index 184be36..d5069da9 100644
--- a/polly/test/ScopInfo/condition-after-error-block-before-scop.ll
+++ b/polly/test/ScopInfo/condition-after-error-block-before-scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/polly/test/ScopInfo/condtion-after-error-block.ll b/polly/test/ScopInfo/condtion-after-error-block.ll
index 92e743e..d9de4fc 100644
--- a/polly/test/ScopInfo/condtion-after-error-block.ll
+++ b/polly/test/ScopInfo/condtion-after-error-block.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; Verify that we allow scops containing uniform branch conditions, where all
; but one incoming block comes from an error condition.
diff --git a/polly/test/ScopInfo/const_srem_sdiv.ll b/polly/test/ScopInfo/const_srem_sdiv.ll
index 3acca98..b4c2f11 100644
--- a/polly/test/ScopInfo/const_srem_sdiv.ll
+++ b/polly/test/ScopInfo/const_srem_sdiv.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output \
+; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
;
; See http://research.microsoft.com/pubs/151917/divmodnote-letter.pdf
;
diff --git a/polly/test/ScopInfo/constant-non-integer-branch-condition.ll b/polly/test/ScopInfo/constant-non-integer-branch-condition.ll
index fc95a4c..42c3b83 100644
--- a/polly/test/ScopInfo/constant-non-integer-branch-condition.ll
+++ b/polly/test/ScopInfo/constant-non-integer-branch-condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; At some point this caused a problem in the domain generation as we
; assumed any constant branch condition to be valid. However, only constant
diff --git a/polly/test/ScopInfo/constant_factor_in_parameter.ll b/polly/test/ScopInfo/constant_factor_in_parameter.ll
index 1f0173c..b58d413 100644
--- a/polly/test/ScopInfo/constant_factor_in_parameter.ll
+++ b/polly/test/ScopInfo/constant_factor_in_parameter.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -disable-output -polly-print-scops < %s | FileCheck %s
-; RUN: opt %loadPolly -disable-output -polly-print-function-scops < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -disable-output '-passes=print<polly-function-scops>' < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -disable-output '-passes=print<polly-function-scops>' < %s 2>&1 | FileCheck %s
;
; Check that the constant part of the N * M * 4 expression is not part of the
; parameter but explicit in the access function. This can avoid existentially
diff --git a/polly/test/ScopInfo/constant_functions_outside_scop_as_unknown.ll b/polly/test/ScopInfo/constant_functions_outside_scop_as_unknown.ll
index 38b2b89..62e6cd4 100644
--- a/polly/test/ScopInfo/constant_functions_outside_scop_as_unknown.ll
+++ b/polly/test/ScopInfo/constant_functions_outside_scop_as_unknown.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-process-unprofitable -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
diff --git a/polly/test/ScopInfo/constant_start_integer.ll b/polly/test/ScopInfo/constant_start_integer.ll
index aa6640c..8991f82 100644
--- a/polly/test/ScopInfo/constant_start_integer.ll
+++ b/polly/test/ScopInfo/constant_start_integer.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; void foo(float *input) {
diff --git a/polly/test/ScopInfo/debug_call.ll b/polly/test/ScopInfo/debug_call.ll
index 93b5bc5..a6761ec 100644
--- a/polly/test/ScopInfo/debug_call.ll
+++ b/polly/test/ScopInfo/debug_call.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-debug-func=dbg_printf -polly-print-scops -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-debug-func=dbg_printf '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
;
; Check that the call to dbg_printf is accepted as a debug-function.
;
diff --git a/polly/test/ScopInfo/delinearize-together-all-data-refs.ll b/polly/test/ScopInfo/delinearize-together-all-data-refs.ll
index 108392b..676c8a2 100644
--- a/polly/test/ScopInfo/delinearize-together-all-data-refs.ll
+++ b/polly/test/ScopInfo/delinearize-together-all-data-refs.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; void foo(long n, long m, long o, double A[n][m][o]) {
; for (long i = 0; i < n-3; i++)
diff --git a/polly/test/ScopInfo/div_by_zero.ll b/polly/test/ScopInfo/div_by_zero.ll
index 2205b85..aecd168 100644
--- a/polly/test/ScopInfo/div_by_zero.ll
+++ b/polly/test/ScopInfo/div_by_zero.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int *A, int N) {
; for (int i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/do-not-model-error-block-accesses.ll b/polly/test/ScopInfo/do-not-model-error-block-accesses.ll
index 997e0d4..baa423f 100644
--- a/polly/test/ScopInfo/do-not-model-error-block-accesses.ll
+++ b/polly/test/ScopInfo/do-not-model-error-block-accesses.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s
; Check that we do not crash on this input. Earlier this indeed crashed as
; we tried to model the access functions in an error block.
diff --git a/polly/test/ScopInfo/eager-binary-and-or-conditions.ll b/polly/test/ScopInfo/eager-binary-and-or-conditions.ll
index e9ad63c..a988b3f 100644
--- a/polly/test/ScopInfo/eager-binary-and-or-conditions.ll
+++ b/polly/test/ScopInfo/eager-binary-and-or-conditions.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output< %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output< %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s
;
; void or(float *A, long n, long m) {
; for (long i = 0; i < 100; i++) {
diff --git a/polly/test/ScopInfo/early_exit_for_complex_domains.ll b/polly/test/ScopInfo/early_exit_for_complex_domains.ll
index a72ea03..eed19b3 100644
--- a/polly/test/ScopInfo/early_exit_for_complex_domains.ll
+++ b/polly/test/ScopInfo/early_exit_for_complex_domains.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-scops -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s
;
; Check we do not crash.
;
diff --git a/polly/test/ScopInfo/error-blocks-1.ll b/polly/test/ScopInfo/error-blocks-1.ll
index 03353ed..047b095 100644
--- a/polly/test/ScopInfo/error-blocks-1.ll
+++ b/polly/test/ScopInfo/error-blocks-1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Context:
; CHECK-NEXT: [N] -> { : -2147483648 <= N <= 2147483647 }
diff --git a/polly/test/ScopInfo/error-blocks-2.ll b/polly/test/ScopInfo/error-blocks-2.ll
index 29095da..6fa1294 100644
--- a/polly/test/ScopInfo/error-blocks-2.ll
+++ b/polly/test/ScopInfo/error-blocks-2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \
+; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
;
; CHECK: Invariant Accesses: {
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/escaping_empty_scop.ll b/polly/test/ScopInfo/escaping_empty_scop.ll
index 8837e19..2efaef3 100644
--- a/polly/test/ScopInfo/escaping_empty_scop.ll
+++ b/polly/test/ScopInfo/escaping_empty_scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void g();
; int f(int *A) {
diff --git a/polly/test/ScopInfo/exit-phi-1.ll b/polly/test/ScopInfo/exit-phi-1.ll
index 8e6c5fb..cbd6c28 100644
--- a/polly/test/ScopInfo/exit-phi-1.ll
+++ b/polly/test/ScopInfo/exit-phi-1.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-codegen -S < %s | FileCheck %s --check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -passes=polly-codegen -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN
;
; Check for correct code generation of exit PHIs, even if the same PHI value
; is used again inside the SCoP.
diff --git a/polly/test/ScopInfo/exit-phi-2.ll b/polly/test/ScopInfo/exit-phi-2.ll
index d218d5f..695c617 100644
--- a/polly/test/ScopInfo/exit-phi-2.ll
+++ b/polly/test/ScopInfo/exit-phi-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that there is no MK_ExitPHI READ access.
;
diff --git a/polly/test/ScopInfo/exit_phi_accesses-2.ll b/polly/test/ScopInfo/exit_phi_accesses-2.ll
index e376f0d..b3b7cb1 100644
--- a/polly/test/ScopInfo/exit_phi_accesses-2.ll
+++ b/polly/test/ScopInfo/exit_phi_accesses-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; CHECK-LABEL: Function: foo
;
diff --git a/polly/test/ScopInfo/exit_phi_accesses.ll b/polly/test/ScopInfo/exit_phi_accesses.ll
index f4fbe31..77b038e 100644
--- a/polly/test/ScopInfo/exit_phi_accesses.ll
+++ b/polly/test/ScopInfo/exit_phi_accesses.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; Check that PHI nodes only create PHI access and nothing else (e.g. unnecessary
; SCALAR accesses). In this case, for a PHI in the exit node, hence there is no
diff --git a/polly/test/ScopInfo/expensive-boundary-context.ll b/polly/test/ScopInfo/expensive-boundary-context.ll
index 7001b96..1a8858d 100644
--- a/polly/test/ScopInfo/expensive-boundary-context.ll
+++ b/polly/test/ScopInfo/expensive-boundary-context.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output \
-; RUN: < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output \
+; RUN: < %s 2>&1 | FileCheck %s
; CHECK-NOT: Assumed Context:
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/ScopInfo/extract_constant_factor_introduces_new_parameter.ll b/polly/test/ScopInfo/extract_constant_factor_introduces_new_parameter.ll
index 89ca344..5e833e7 100644
--- a/polly/test/ScopInfo/extract_constant_factor_introduces_new_parameter.ll
+++ b/polly/test/ScopInfo/extract_constant_factor_introduces_new_parameter.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-scops -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s
; CHECK: Valid Region for Scop: bb10 => bb16
diff --git a/polly/test/ScopInfo/full-function.ll b/polly/test/ScopInfo/full-function.ll
index 6704725..596c3d0 100644
--- a/polly/test/ScopInfo/full-function.ll
+++ b/polly/test/ScopInfo/full-function.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output -polly-detect-full-functions < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-detect-full-functions < %s 2>&1 \
; RUN: | FileCheck %s -check-prefix=FULL
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \
; RUN: | FileCheck %s -check-prefix=WITHOUT-FULL
; FULL: Region: %bb---FunctionExit
diff --git a/polly/test/ScopInfo/granularity_same_name.ll b/polly/test/ScopInfo/granularity_same_name.ll
index 1ebf5c6f..17f75fb 100644
--- a/polly/test/ScopInfo/granularity_same_name.ll
+++ b/polly/test/ScopInfo/granularity_same_name.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-use-llvm-names=0 -polly-print-scops -disable-output < %s | FileCheck %s -match-full-lines -check-prefix=IDX
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-use-llvm-names=1 -polly-print-scops -disable-output < %s | FileCheck %s -match-full-lines -check-prefix=BB
-; RUN: opt %loadPolly -polly-stmt-granularity=scalar-indep -polly-use-llvm-names=0 -polly-print-scops -disable-output < %s | FileCheck %s -match-full-lines -check-prefix=IDX
-; RUN: opt %loadPolly -polly-stmt-granularity=scalar-indep -polly-use-llvm-names=1 -polly-print-scops -disable-output < %s | FileCheck %s -match-full-lines -check-prefix=BB
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-use-llvm-names=0 '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=IDX
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-use-llvm-names=1 '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=BB
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-use-llvm-names=0 '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=IDX
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-use-llvm-names=1 '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=BB
;
; Check that the statement has the same name, regardless of how the
; basic block is split into multiple statements.
diff --git a/polly/test/ScopInfo/granularity_scalar-indep.ll b/polly/test/ScopInfo/granularity_scalar-indep.ll
index fe509b4..5c4484f 100644
--- a/polly/test/ScopInfo/granularity_scalar-indep.ll
+++ b/polly/test/ScopInfo/granularity_scalar-indep.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
;
; Split a block into two independent statements that share no scalar.
; This case has the instructions of the two statements interleaved, such that
diff --git a/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi1.ll b/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi1.ll
index 56bc11a..7ae0d96 100644
--- a/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi1.ll
+++ b/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
;
; Two PHIs, cross-referencing each other. The PHI READs must be carried-out
; before the PHI WRITEs to ensure that the value when entering the block is
diff --git a/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi2.ll b/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi2.ll
index f46cf4e..7839e51 100644
--- a/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi2.ll
+++ b/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
;
; Two PHIs, cross-referencing each other. The PHI READs must be carried-out
; before the PHI WRITEs to ensure that the value when entering the block is
diff --git a/polly/test/ScopInfo/granularity_scalar-indep_epilogue.ll b/polly/test/ScopInfo/granularity_scalar-indep_epilogue.ll
index e202e38..8643e85 100644
--- a/polly/test/ScopInfo/granularity_scalar-indep_epilogue.ll
+++ b/polly/test/ScopInfo/granularity_scalar-indep_epilogue.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
;
; Split a block into two independent statements that share no scalar.
; This case has an independent statement just for PHI writes.
diff --git a/polly/test/ScopInfo/granularity_scalar-indep_epilogue_last.ll b/polly/test/ScopInfo/granularity_scalar-indep_epilogue_last.ll
index 40af34b..bc71cbe4 100644
--- a/polly/test/ScopInfo/granularity_scalar-indep_epilogue_last.ll
+++ b/polly/test/ScopInfo/granularity_scalar-indep_epilogue_last.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
;
; Check that the PHI Write of a value that is defined in the same basic
; block is in the statement where it is defined.
diff --git a/polly/test/ScopInfo/granularity_scalar-indep_noepilogue.ll b/polly/test/ScopInfo/granularity_scalar-indep_noepilogue.ll
index 9a0d207..f3864ba 100644
--- a/polly/test/ScopInfo/granularity_scalar-indep_noepilogue.ll
+++ b/polly/test/ScopInfo/granularity_scalar-indep_noepilogue.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
;
; This case has no explicit epilogue for PHI writes because it would
; have a scalar dependency on the previous statement.
diff --git a/polly/test/ScopInfo/granularity_scalar-indep_ordered-2.ll b/polly/test/ScopInfo/granularity_scalar-indep_ordered-2.ll
index d093806..43101a8 100644
--- a/polly/test/ScopInfo/granularity_scalar-indep_ordered-2.ll
+++ b/polly/test/ScopInfo/granularity_scalar-indep_ordered-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
;
; This case should be split into two statements because {X[0], Y[0]}
; and {A[0], B[0]} do not intersect.
diff --git a/polly/test/ScopInfo/granularity_scalar-indep_ordered.ll b/polly/test/ScopInfo/granularity_scalar-indep_ordered.ll
index b1d2936..4974f7e 100644
--- a/polly/test/ScopInfo/granularity_scalar-indep_ordered.ll
+++ b/polly/test/ScopInfo/granularity_scalar-indep_ordered.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
;
; This case cannot be split into two statements because the order of
; loads and stores would be violated.
diff --git a/polly/test/ScopInfo/i1_params.ll b/polly/test/ScopInfo/i1_params.ll
index 1cb1329..be3e287 100644
--- a/polly/test/ScopInfo/i1_params.ll
+++ b/polly/test/ScopInfo/i1_params.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that both a signed and an unsigned extended i1 parameter
; are represented correctly.
diff --git a/polly/test/ScopInfo/infeasible-rtc.ll b/polly/test/ScopInfo/infeasible-rtc.ll
index ef96627..7a0bfe0 100644
--- a/polly/test/ScopInfo/infeasible-rtc.ll
+++ b/polly/test/ScopInfo/infeasible-rtc.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 \
; RUN: | FileCheck %s -check-prefix=DETECT
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \
; RUN: | FileCheck %s -check-prefix=SCOPS
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
diff --git a/polly/test/ScopInfo/infeasible_invalid_context.ll b/polly/test/ScopInfo/infeasible_invalid_context.ll
index 2c299f0..006901a 100644
--- a/polly/test/ScopInfo/infeasible_invalid_context.ll
+++ b/polly/test/ScopInfo/infeasible_invalid_context.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 \
; RUN: | FileCheck %s -check-prefix=DETECT
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 \
; RUN: | FileCheck %s -check-prefix=SCOPS
; DETECT: Valid Region for Scop: if.end116 => for.inc216
diff --git a/polly/test/ScopInfo/int2ptr_ptr2int.ll b/polly/test/ScopInfo/int2ptr_ptr2int.ll
index 9fadc5a..f6668ecd 100644
--- a/polly/test/ScopInfo/int2ptr_ptr2int.ll
+++ b/polly/test/ScopInfo/int2ptr_ptr2int.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s 2>&1 | FileCheck %s --check-prefix=IR
;
; void f(long *A, long *ptr, long val) {
; for (long i = 0; i < 100; i++) {
diff --git a/polly/test/ScopInfo/int2ptr_ptr2int_2.ll b/polly/test/ScopInfo/int2ptr_ptr2int_2.ll
index 97878f7..361bf5a 100644
--- a/polly/test/ScopInfo/int2ptr_ptr2int_2.ll
+++ b/polly/test/ScopInfo/int2ptr_ptr2int_2.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadPolly -polly-print-scops \
-; RUN: -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -S -polly-codegen \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' \
+; RUN: -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen \
+; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s --check-prefix=IR
;
; void f(long *A, long *B, long *ptr, long val) {
; for (long i = 0; i < 100; i++) {
diff --git a/polly/test/ScopInfo/integers.ll b/polly/test/ScopInfo/integers.ll
index b608bf8..4f6d111 100644
--- a/polly/test/ScopInfo/integers.ll
+++ b/polly/test/ScopInfo/integers.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; Check that we correctly convert integers to isl values.
diff --git a/polly/test/ScopInfo/inter-error-bb-dependence.ll b/polly/test/ScopInfo/inter-error-bb-dependence.ll
index 4e23de7..761fcbb 100644
--- a/polly/test/ScopInfo/inter-error-bb-dependence.ll
+++ b/polly/test/ScopInfo/inter-error-bb-dependence.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -pass-remarks-analysis="polly-scops" -polly-print-scops -disable-output < %s 2>&1 > /dev/null | FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 > /dev/null | FileCheck %s
;
; Error statements (%bb33) do not require their uses to be verified.
; In this case it uses %tmp32 from %bb31 which is not available because
diff --git a/polly/test/ScopInfo/inter_bb_scalar_dep.ll b/polly/test/ScopInfo/inter_bb_scalar_dep.ll
index 456f7a7..7313618 100644
--- a/polly/test/ScopInfo/inter_bb_scalar_dep.ll
+++ b/polly/test/ScopInfo/inter_bb_scalar_dep.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-scops \
-; RUN: -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' \
+; RUN: -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
; void f(long A[], int N, int *init_ptr) {
; long i, j;
diff --git a/polly/test/ScopInfo/intra-non-affine-stmt-phi-node.ll b/polly/test/ScopInfo/intra-non-affine-stmt-phi-node.ll
index 859972b..d2ed3c1 100644
--- a/polly/test/ScopInfo/intra-non-affine-stmt-phi-node.ll
+++ b/polly/test/ScopInfo/intra-non-affine-stmt-phi-node.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output \
-; RUN: < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \
+; RUN: < %s 2>&1 | FileCheck %s
; CHECK: Statements {
; CHECK-NEXT: Stmt_loop__TO__backedge
diff --git a/polly/test/ScopInfo/intra_and_inter_bb_scalar_dep.ll b/polly/test/ScopInfo/intra_and_inter_bb_scalar_dep.ll
index 37f4e05..b3286cd 100644
--- a/polly/test/ScopInfo/intra_and_inter_bb_scalar_dep.ll
+++ b/polly/test/ScopInfo/intra_and_inter_bb_scalar_dep.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output \
+; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
; void f(long A[], int N, int *init_ptr) {
; long i, j;
diff --git a/polly/test/ScopInfo/intra_bb_scalar_dep.ll b/polly/test/ScopInfo/intra_bb_scalar_dep.ll
index 0252273..86855e7 100644
--- a/polly/test/ScopInfo/intra_bb_scalar_dep.ll
+++ b/polly/test/ScopInfo/intra_bb_scalar_dep.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output \
+; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
; void f(long A[], int N, int *init_ptr) {
; long i, j;
diff --git a/polly/test/ScopInfo/intrinsics.ll b/polly/test/ScopInfo/intrinsics.ll
index 8534293..c5bbacb 100644
--- a/polly/test/ScopInfo/intrinsics.ll
+++ b/polly/test/ScopInfo/intrinsics.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-print-instructions -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-print-instructions -disable-output < %s 2>&1 | FileCheck %s
;
; Verify that we remove the ignored intrinsics from the instruction list.
;
diff --git a/polly/test/ScopInfo/invalid_add_rec_after_invariant_load_remapping.ll b/polly/test/ScopInfo/invalid_add_rec_after_invariant_load_remapping.ll
index 8d0de03..7239426 100644
--- a/polly/test/ScopInfo/invalid_add_rec_after_invariant_load_remapping.ll
+++ b/polly/test/ScopInfo/invalid_add_rec_after_invariant_load_remapping.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s
;
; This crashed at some point as we place %1 and %4 in the same equivalence class
; for invariant loads and when we remap SCEVs to use %4 instead of %1 AddRec SCEVs
diff --git a/polly/test/ScopInfo/invalidate_iterator_during_MA_removal.ll b/polly/test/ScopInfo/invalidate_iterator_during_MA_removal.ll
index dcb0ad3..c493c22 100644
--- a/polly/test/ScopInfo/invalidate_iterator_during_MA_removal.ll
+++ b/polly/test/ScopInfo/invalidate_iterator_during_MA_removal.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s
;
; Check that no invalidated iterator is accessed while elements from
; the list of MemoryAccesses are removed.
diff --git a/polly/test/ScopInfo/invariant-load-instlist.ll b/polly/test/ScopInfo/invariant-load-instlist.ll
index 7f4cf05..ecb80e4 100644
--- a/polly/test/ScopInfo/invariant-load-instlist.ll
+++ b/polly/test/ScopInfo/invariant-load-instlist.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s
; The load is a required invariant load and at the same time used in a store.
; Polly used to add two MemoryAccesses for it which caused an assertion to fail.
diff --git a/polly/test/ScopInfo/invariant-loads-leave-read-only-statements.ll b/polly/test/ScopInfo/invariant-loads-leave-read-only-statements.ll
index b97fe22..89eac6c 100644
--- a/polly/test/ScopInfo/invariant-loads-leave-read-only-statements.ll
+++ b/polly/test/ScopInfo/invariant-loads-leave-read-only-statements.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -disable-output < %s
; CHECK: Statements {
; CHECK-NEXT: Stmt_L_4
diff --git a/polly/test/ScopInfo/invariant_load.ll b/polly/test/ScopInfo/invariant_load.ll
index fcea77e..9dc0642 100644
--- a/polly/test/ScopInfo/invariant_load.ll
+++ b/polly/test/ScopInfo/invariant_load.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Invariant Accesses:
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type.ll b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type.ll
index 100a8db..40aa309 100644
--- a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type.ll
+++ b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s --check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN
;
; struct {
; int a;
diff --git a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_escaping.ll b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_escaping.ll
index e31deb6..2876760 100644
--- a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_escaping.ll
+++ b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_escaping.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s --check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN
;
; struct {
; int a;
diff --git a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer.ll b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer.ll
index bbf6d69..cb745b4 100644
--- a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer.ll
+++ b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s --check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN
;
; int U;
; void f(int *A) {
diff --git a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer_escaping.ll b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer_escaping.ll
index 011c2fe..fa5429d 100644
--- a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer_escaping.ll
+++ b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer_escaping.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s --check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN
;
; int U;
; int f(int *A) {
diff --git a/polly/test/ScopInfo/invariant_load_addrec_sum.ll b/polly/test/ScopInfo/invariant_load_addrec_sum.ll
index 09b158d..2e639f7 100644
--- a/polly/test/ScopInfo/invariant_load_addrec_sum.ll
+++ b/polly/test/ScopInfo/invariant_load_addrec_sum.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Region: %entry.split---%if.end
; CHECK: Invariant Accesses: {
diff --git a/polly/test/ScopInfo/invariant_load_base_pointer.ll b/polly/test/ScopInfo/invariant_load_base_pointer.ll
index ddf11d8..f2539af 100644
--- a/polly/test/ScopInfo/invariant_load_base_pointer.ll
+++ b/polly/test/ScopInfo/invariant_load_base_pointer.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Invariant Accesses:
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/invariant_load_base_pointer_conditional.ll b/polly/test/ScopInfo/invariant_load_base_pointer_conditional.ll
index 07f2c37..f854b1f 100644
--- a/polly/test/ScopInfo/invariant_load_base_pointer_conditional.ll
+++ b/polly/test/ScopInfo/invariant_load_base_pointer_conditional.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Invariant Accesses:
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/invariant_load_base_pointer_in_conditional.ll b/polly/test/ScopInfo/invariant_load_base_pointer_in_conditional.ll
index d66d718..5a9c5c6 100644
--- a/polly/test/ScopInfo/invariant_load_base_pointer_in_conditional.ll
+++ b/polly/test/ScopInfo/invariant_load_base_pointer_in_conditional.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -polly-ignore-aliasing -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Invariant Accesses:
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/invariant_load_branch_condition.ll b/polly/test/ScopInfo/invariant_load_branch_condition.ll
index 4f49d29..d12750c 100644
--- a/polly/test/ScopInfo/invariant_load_branch_condition.ll
+++ b/polly/test/ScopInfo/invariant_load_branch_condition.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output \
-; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \
+; RUN: -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s
; CHECK: Invariant Accesses: {
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs.ll
index c6a7faf..34d50a1 100644
--- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs.ll
+++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \
; RUN: -polly-invariant-load-hoisting \
; RUN: | FileCheck %s
diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_2.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_2.ll
index 921dd4f..51f3cf6 100644
--- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_2.ll
+++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \
; RUN: -polly-invariant-load-hoisting \
; RUN: | FileCheck %s
diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_3.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_3.ll
index c15d11c..3a742bb 100644
--- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_3.ll
+++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \
; RUN: -polly-invariant-load-hoisting \
; RUN: | FileCheck %s
diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4.ll
index 0495a33..6bd8b31 100644
--- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4.ll
+++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \
; RUN: -polly-invariant-load-hoisting \
; RUN: | FileCheck %s
diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4b.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4b.ll
index 9144fcf..cb7e564 100644
--- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4b.ll
+++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4b.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \
; RUN: -polly-invariant-load-hoisting \
; RUN: | FileCheck %s
diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4c.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4c.ll
index aefacff..6f7fbacc 100644
--- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4c.ll
+++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4c.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \
; RUN: -polly-invariant-load-hoisting \
; RUN: | FileCheck %s
diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_5.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_5.ll
index ecc0c0a..4458328 100644
--- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_5.ll
+++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_5.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \
; RUN: -polly-invariant-load-hoisting \
; RUN: | FileCheck %s
diff --git a/polly/test/ScopInfo/invariant_load_complex_condition.ll b/polly/test/ScopInfo/invariant_load_complex_condition.ll
index e721c22..11e7088 100644
--- a/polly/test/ScopInfo/invariant_load_complex_condition.ll
+++ b/polly/test/ScopInfo/invariant_load_complex_condition.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -S -polly-print-scops -disable-output \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -S '-passes=print<polly-function-scops>' -disable-output \
+; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/ScopInfo/invariant_load_condition.ll b/polly/test/ScopInfo/invariant_load_condition.ll
index 8454698..c7d7b3c 100644
--- a/polly/test/ScopInfo/invariant_load_condition.ll
+++ b/polly/test/ScopInfo/invariant_load_condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-process-unprofitable -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Invariant Accesses:
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/invariant_load_dereferenceable.ll b/polly/test/ScopInfo/invariant_load_dereferenceable.ll
index adba32d..526bdc6 100644
--- a/polly/test/ScopInfo/invariant_load_dereferenceable.ll
+++ b/polly/test/ScopInfo/invariant_load_dereferenceable.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-print-detect -polly-print-scops \
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' '-passes=print<polly-function-scops>' \
; RUN: -polly-invariant-load-hoisting=true \
-; RUN: -disable-output < %s | FileCheck %s
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
; CHECK-NOT: Function: foo_undereferanceable
diff --git a/polly/test/ScopInfo/invariant_load_distinct_parameter_valuations.ll b/polly/test/ScopInfo/invariant_load_distinct_parameter_valuations.ll
index 60b4a1d..eb14806 100644
--- a/polly/test/ScopInfo/invariant_load_distinct_parameter_valuations.ll
+++ b/polly/test/ScopInfo/invariant_load_distinct_parameter_valuations.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
; Check that we do not consolidate the invariant loads to smp[order - 1] and
; smp[order - 2] in the blocks %0 and %16. While they have the same pointer
diff --git a/polly/test/ScopInfo/invariant_load_in_non_affine.ll b/polly/test/ScopInfo/invariant_load_in_non_affine.ll
index d00bc2d6..5261113 100644
--- a/polly/test/ScopInfo/invariant_load_in_non_affine.ll
+++ b/polly/test/ScopInfo/invariant_load_in_non_affine.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output \
+; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
;
; CHECK-NOT: Valid Region for Scop
;
diff --git a/polly/test/ScopInfo/invariant_load_loop_ub.ll b/polly/test/ScopInfo/invariant_load_loop_ub.ll
index 856b6e4..ee889e6 100644
--- a/polly/test/ScopInfo/invariant_load_loop_ub.ll
+++ b/polly/test/ScopInfo/invariant_load_loop_ub.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -polly-process-unprofitable -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-function-scops -polly-invariant-load-hoisting=true -polly-process-unprofitable -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Invariant Accesses:
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/invariant_load_ptr_ptr_noalias.ll b/polly/test/ScopInfo/invariant_load_ptr_ptr_noalias.ll
index 69463d4..6af7cae 100644
--- a/polly/test/ScopInfo/invariant_load_ptr_ptr_noalias.ll
+++ b/polly/test/ScopInfo/invariant_load_ptr_ptr_noalias.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -tbaa -polly-print-scops -polly-invariant-load-hoisting=true -polly-ignore-aliasing \
-; RUN: -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=tbaa '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
;
; Note: The order of the invariant accesses is important because A is the
; base pointer of tmp3 and we will generate code in the same order as
diff --git a/polly/test/ScopInfo/invariant_load_scalar_dep.ll b/polly/test/ScopInfo/invariant_load_scalar_dep.ll
index 79a1042..319f24b 100644
--- a/polly/test/ScopInfo/invariant_load_scalar_dep.ll
+++ b/polly/test/ScopInfo/invariant_load_scalar_dep.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-process-unprofitable -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Invariant Accesses:
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/invariant_load_stmt_domain.ll b/polly/test/ScopInfo/invariant_load_stmt_domain.ll
index 6cd71c8..7159480 100644
--- a/polly/test/ScopInfo/invariant_load_stmt_domain.ll
+++ b/polly/test/ScopInfo/invariant_load_stmt_domain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
; This test case verifies that the statement domain of the invariant access
; is the universe. In earlier versions of Polly, we accidentally computed an
diff --git a/polly/test/ScopInfo/invariant_load_zext_parameter-2.ll b/polly/test/ScopInfo/invariant_load_zext_parameter-2.ll
index e775152..a610832 100644
--- a/polly/test/ScopInfo/invariant_load_zext_parameter-2.ll
+++ b/polly/test/ScopInfo/invariant_load_zext_parameter-2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -scalar-evolution-max-value-compare-depth=3 -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -scalar-evolution-max-value-compare-depth=3 -polly-codegen -polly-invariant-load-hoisting=true -disable-output < %s
+; RUN: opt %loadNPMPolly -scalar-evolution-max-value-compare-depth=3 '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -scalar-evolution-max-value-compare-depth=3 -passes=polly-codegen -polly-invariant-load-hoisting=true -disable-output < %s
;
; Stress test for the code generation of invariant accesses.
;
diff --git a/polly/test/ScopInfo/invariant_load_zext_parameter.ll b/polly/test/ScopInfo/invariant_load_zext_parameter.ll
index 1bde702..e3c183a 100644
--- a/polly/test/ScopInfo/invariant_load_zext_parameter.ll
+++ b/polly/test/ScopInfo/invariant_load_zext_parameter.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s --check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN
;
; void f(int *I0, int *I1, int *V) {
; for (int i = 0; i < 1000; i++) {
diff --git a/polly/test/ScopInfo/invariant_load_zextended_in_own_execution_context.ll b/polly/test/ScopInfo/invariant_load_zextended_in_own_execution_context.ll
index 775369e5..b5168e9 100644
--- a/polly/test/ScopInfo/invariant_load_zextended_in_own_execution_context.ll
+++ b/polly/test/ScopInfo/invariant_load_zextended_in_own_execution_context.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -disable-output < %s
;
; CHECK: Execution Context: [p_0_loaded_from_currpc] -> { : }
;
diff --git a/polly/test/ScopInfo/invariant_loads_complicated_dependences.ll b/polly/test/ScopInfo/invariant_loads_complicated_dependences.ll
index 1d54ccc..8536082 100644
--- a/polly/test/ScopInfo/invariant_loads_complicated_dependences.ll
+++ b/polly/test/ScopInfo/invariant_loads_complicated_dependences.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Invariant Accesses: {
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/invariant_loads_cyclic_dependences.ll b/polly/test/ScopInfo/invariant_loads_cyclic_dependences.ll
index e97de0c..134eac2 100644
--- a/polly/test/ScopInfo/invariant_loads_cyclic_dependences.ll
+++ b/polly/test/ScopInfo/invariant_loads_cyclic_dependences.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
; Negative test. If we assume UB[*V] to be invariant we get a cyclic
; dependence in the invariant loads that needs to be resolved by
diff --git a/polly/test/ScopInfo/invariant_loop_bounds.ll b/polly/test/ScopInfo/invariant_loop_bounds.ll
index 4e1fd88..f22199c 100644
--- a/polly/test/ScopInfo/invariant_loop_bounds.ll
+++ b/polly/test/ScopInfo/invariant_loop_bounds.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Invariant Accesses: {
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll b/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll
index 3d5737b..a473ef3 100644
--- a/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll
+++ b/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
; Verify that we only have one parameter and one invariant load for all
; three loads that occur in the region but actually access the same
diff --git a/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll b/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll
index e2de503..66a0bc6 100644
--- a/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll
+++ b/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
; Verify that we only have one parameter and one invariant load for all
; three loads that occur in the region but actually access the same
diff --git a/polly/test/ScopInfo/isl_aff_out_of_bounds.ll b/polly/test/ScopInfo/isl_aff_out_of_bounds.ll
index ca1b235..2df96fa 100644
--- a/polly/test/ScopInfo/isl_aff_out_of_bounds.ll
+++ b/polly/test/ScopInfo/isl_aff_out_of_bounds.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-detect < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' < %s 2>&1
; Used to fail with:
; ../../isl/isl_aff.c:591: position out of bounds
diff --git a/polly/test/ScopInfo/isl_trip_count_01.ll b/polly/test/ScopInfo/isl_trip_count_01.ll
index fc6b79c..480b6e9 100644
--- a/polly/test/ScopInfo/isl_trip_count_01.ll
+++ b/polly/test/ScopInfo/isl_trip_count_01.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: [M, N] -> { Stmt_while_body[i0] : i0 > 0 and 4i0 <= -M + N; Stmt_while_body[0] };
;
diff --git a/polly/test/ScopInfo/isl_trip_count_02.ll b/polly/test/ScopInfo/isl_trip_count_02.ll
index 9376cb4..b78fb83 100644
--- a/polly/test/ScopInfo/isl_trip_count_02.ll
+++ b/polly/test/ScopInfo/isl_trip_count_02.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; TODO: We do not allow unbounded loops at the moment.
;
diff --git a/polly/test/ScopInfo/isl_trip_count_03.ll b/polly/test/ScopInfo/isl_trip_count_03.ll
index f5b0048..96df05f 100644
--- a/polly/test/ScopInfo/isl_trip_count_03.ll
+++ b/polly/test/ScopInfo/isl_trip_count_03.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Test comes from a bug (15771), or rather a feature request. It was not allowed
; in Polly in the old domain generation as ScalarEvolution cannot figure out the
diff --git a/polly/test/ScopInfo/isl_trip_count_multiple_exiting_blocks.ll b/polly/test/ScopInfo/isl_trip_count_multiple_exiting_blocks.ll
index 91bc19e..fd310ec 100644
--- a/polly/test/ScopInfo/isl_trip_count_multiple_exiting_blocks.ll
+++ b/polly/test/ScopInfo/isl_trip_count_multiple_exiting_blocks.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; The SCoP contains a loop with multiple exit blocks (BBs after leaving
; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/licm_reduction_nested.ll b/polly/test/ScopInfo/licm_reduction_nested.ll
index a3ba478..c167603 100644
--- a/polly/test/ScopInfo/licm_reduction_nested.ll
+++ b/polly/test/ScopInfo/licm_reduction_nested.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -basic-aa -loop-rotate -indvars -polly-prepare -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -basic-aa -loop-rotate -indvars -licm -polly-prepare -polly-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -loop-rotate -indvars -passes=polly-prepare '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -loop-rotate -indvars -licm -passes=polly-prepare '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; XFAIL: *
;
diff --git a/polly/test/ScopInfo/long-compile-time-alias-analysis.ll b/polly/test/ScopInfo/long-compile-time-alias-analysis.ll
index 1cbecf0..f102518 100644
--- a/polly/test/ScopInfo/long-compile-time-alias-analysis.ll
+++ b/polly/test/ScopInfo/long-compile-time-alias-analysis.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s
; Verify that the compilation of this test case does not take infinite time.
; At some point Polly tried to model this test case and got stuck in
diff --git a/polly/test/ScopInfo/long-sequence-of-error-blocks-2.ll b/polly/test/ScopInfo/long-sequence-of-error-blocks-2.ll
index c88ea13..6027975 100644
--- a/polly/test/ScopInfo/long-sequence-of-error-blocks-2.ll
+++ b/polly/test/ScopInfo/long-sequence-of-error-blocks-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/polly/test/ScopInfo/long-sequence-of-error-blocks.ll b/polly/test/ScopInfo/long-sequence-of-error-blocks.ll
index 5b6ea9c..4ef5ef0 100644
--- a/polly/test/ScopInfo/long-sequence-of-error-blocks.ll
+++ b/polly/test/ScopInfo/long-sequence-of-error-blocks.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \
+; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/polly/test/ScopInfo/loop-multiexit-succ-cond.ll b/polly/test/ScopInfo/loop-multiexit-succ-cond.ll
index 350db05..431c907 100644
--- a/polly/test/ScopInfo/loop-multiexit-succ-cond.ll
+++ b/polly/test/ScopInfo/loop-multiexit-succ-cond.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s 2>&1 | FileCheck %s --check-prefix=IR
;
; The SCoP contains a loop with multiple exit blocks (BBs after leaving
; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/loop_affine_bound_0.ll b/polly/test/ScopInfo/loop_affine_bound_0.ll
index 33f49df..918d409 100644
--- a/polly/test/ScopInfo/loop_affine_bound_0.ll
+++ b/polly/test/ScopInfo/loop_affine_bound_0.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-function-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; void f(long a[][128], long N, long M) {
; long i, j;
diff --git a/polly/test/ScopInfo/loop_affine_bound_1.ll b/polly/test/ScopInfo/loop_affine_bound_1.ll
index 38e47b7..8f7a87f 100644
--- a/polly/test/ScopInfo/loop_affine_bound_1.ll
+++ b/polly/test/ScopInfo/loop_affine_bound_1.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output< %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-function-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output< %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;void f(long a[][128], long N, long M) {
; long i, j;
diff --git a/polly/test/ScopInfo/loop_affine_bound_2.ll b/polly/test/ScopInfo/loop_affine_bound_2.ll
index e34662f..2d9f997 100644
--- a/polly/test/ScopInfo/loop_affine_bound_2.ll
+++ b/polly/test/ScopInfo/loop_affine_bound_2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-function-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; void f(long a[][128], long N, long M) {
; long i, j;
diff --git a/polly/test/ScopInfo/loop_carry.ll b/polly/test/ScopInfo/loop_carry.ll
index f7c1dca..20ebbfb 100644
--- a/polly/test/ScopInfo/loop_carry.ll
+++ b/polly/test/ScopInfo/loop_carry.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/ScopInfo/many-scalar-dependences.ll b/polly/test/ScopInfo/many-scalar-dependences.ll
index aaa02f5..5b00332 100644
--- a/polly/test/ScopInfo/many-scalar-dependences.ll
+++ b/polly/test/ScopInfo/many-scalar-dependences.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(float a[100][100]) {
; float x;
diff --git a/polly/test/ScopInfo/max-loop-depth.ll b/polly/test/ScopInfo/max-loop-depth.ll
index 3c7db44..71e9c02 100644
--- a/polly/test/ScopInfo/max-loop-depth.ll
+++ b/polly/test/ScopInfo/max-loop-depth.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void bar();
; void foo(int *A, int *B, long int N, long int M) {
diff --git a/polly/test/ScopInfo/memcpy-raw-source.ll b/polly/test/ScopInfo/memcpy-raw-source.ll
index 137ab82..d9024cd 100644
--- a/polly/test/ScopInfo/memcpy-raw-source.ll
+++ b/polly/test/ScopInfo/memcpy-raw-source.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -scoped-noalias-aa -tbaa -polly-print-scops -disable-output < %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa,scoped-noalias-aa,tbaa '-passes=print<polly-function-scops>' -disable-output < %s
;
; Ensure that ScopInfo's alias analysis llvm.memcpy for,
; like the AliasSetTracker, preserves bitcasts.
diff --git a/polly/test/ScopInfo/memcpy.ll b/polly/test/ScopInfo/memcpy.ll
index 705dea7..95c455f 100644
--- a/polly/test/ScopInfo/memcpy.ll
+++ b/polly/test/ScopInfo/memcpy.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -basic-aa -polly-allow-differing-element-types -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -S -basic-aa -polly-allow-differing-element-types -polly-codegen < %s | FileCheck --check-prefix=IR %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-differing-element-types '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S -aa-pipeline=basic-aa -polly-allow-differing-element-types -passes=polly-codegen < %s 2>&1 | FileCheck --check-prefix=IR %s
;
; CHECK: Arrays {
; CHECK-NEXT: i8 MemRef_A[*]; // Element size 1
diff --git a/polly/test/ScopInfo/memmove.ll b/polly/test/ScopInfo/memmove.ll
index 1512342..8ff471a 100644
--- a/polly/test/ScopInfo/memmove.ll
+++ b/polly/test/ScopInfo/memmove.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -basic-aa -polly-allow-differing-element-types -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -S -basic-aa -polly-allow-differing-element-types -polly-codegen < %s | FileCheck --check-prefix=IR %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-differing-element-types '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S -aa-pipeline=basic-aa -polly-allow-differing-element-types -passes=polly-codegen < %s 2>&1 | FileCheck --check-prefix=IR %s
;
; CHECK: Arrays {
; CHECK-NEXT: i8 MemRef_A[*]; // Element size 1
diff --git a/polly/test/ScopInfo/memset.ll b/polly/test/ScopInfo/memset.ll
index ef86b4c..89b0487 100644
--- a/polly/test/ScopInfo/memset.ll
+++ b/polly/test/ScopInfo/memset.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-allow-differing-element-types -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -S -polly-allow-differing-element-types -polly-codegen < %s | FileCheck --check-prefix=IR %s
+; RUN: opt %loadNPMPolly -polly-allow-differing-element-types '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S -polly-allow-differing-element-types -passes=polly-codegen < %s 2>&1 | FileCheck --check-prefix=IR %s
;
; CHECK: Arrays {
; CHECK-NEXT: i8 MemRef_A[*]; // Element size 1
diff --git a/polly/test/ScopInfo/memset_null.ll b/polly/test/ScopInfo/memset_null.ll
index 1608ff6..9755cf1 100644
--- a/polly/test/ScopInfo/memset_null.ll
+++ b/polly/test/ScopInfo/memset_null.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-allow-modref-calls -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-allow-modref-calls -S -polly-codegen < %s
+; RUN: opt %loadNPMPolly -polly-allow-modref-calls '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-modref-calls -S -passes=polly-codegen < %s
;
; Verify we can handle a memset to "null" and that we do not model it.
; TODO: FIXME: We could use the undefined memset to optimize the code further,
diff --git a/polly/test/ScopInfo/mismatching-array-dimensions.ll b/polly/test/ScopInfo/mismatching-array-dimensions.ll
index a1c6d4e..ed1e28c 100644
--- a/polly/test/ScopInfo/mismatching-array-dimensions.ll
+++ b/polly/test/ScopInfo/mismatching-array-dimensions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; CHECK-NOT: AssumedContext
diff --git a/polly/test/ScopInfo/mod_ref_access_pointee_arguments.ll b/polly/test/ScopInfo/mod_ref_access_pointee_arguments.ll
index 7288932..6bc5f8d 100644
--- a/polly/test/ScopInfo/mod_ref_access_pointee_arguments.ll
+++ b/polly/test/ScopInfo/mod_ref_access_pointee_arguments.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -basic-aa -polly-stmt-granularity=bb -polly-print-scops -polly-allow-modref-calls \
-; RUN: -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -basic-aa -polly-stmt-granularity=bb -polly-codegen -polly-allow-modref-calls \
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb -passes=polly-codegen -polly-allow-modref-calls \
; RUN: -disable-output < %s
;
; Verify that we model the may-write access of the prefetch intrinsic
diff --git a/polly/test/ScopInfo/mod_ref_read_pointee_arguments.ll b/polly/test/ScopInfo/mod_ref_read_pointee_arguments.ll
index 2f6c679..21322bc 100644
--- a/polly/test/ScopInfo/mod_ref_read_pointee_arguments.ll
+++ b/polly/test/ScopInfo/mod_ref_read_pointee_arguments.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -basic-aa -polly-stmt-granularity=bb -polly-print-scops -polly-allow-modref-calls \
-; RUN: -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -basic-aa -polly-codegen -disable-output \
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -disable-output \
; RUN: -polly-allow-modref-calls < %s
;
; Verify that we model the read access of the gcread intrinsic
diff --git a/polly/test/ScopInfo/mod_ref_read_pointer.ll b/polly/test/ScopInfo/mod_ref_read_pointer.ll
index 657e37c..25e56a0 100644
--- a/polly/test/ScopInfo/mod_ref_read_pointer.ll
+++ b/polly/test/ScopInfo/mod_ref_read_pointer.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -basic-aa -polly-allow-modref-calls -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -basic-aa -polly-allow-modref-calls -polly-codegen -disable-output < %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-modref-calls '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-modref-calls -passes=polly-codegen -disable-output < %s
;
; Check that we assume the call to func has a read on the whole A array.
;
diff --git a/polly/test/ScopInfo/mod_ref_read_pointers.ll b/polly/test/ScopInfo/mod_ref_read_pointers.ll
index 7ed3423..5cc96cf 100644
--- a/polly/test/ScopInfo/mod_ref_read_pointers.ll
+++ b/polly/test/ScopInfo/mod_ref_read_pointers.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-scops -polly-allow-modref-calls \
-; RUN: -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -basic-aa -polly-codegen -disable-output \
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -disable-output \
; RUN: -polly-allow-modref-calls < %s
;
; Check that the call to func will "read" not only the A array but also the
diff --git a/polly/test/ScopInfo/modulo_zext_1.ll b/polly/test/ScopInfo/modulo_zext_1.ll
index d611ec4..0a8957d 100644
--- a/polly/test/ScopInfo/modulo_zext_1.ll
+++ b/polly/test/ScopInfo/modulo_zext_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Assumed Context:
; CHECK-NEXT: [N] -> { : }
diff --git a/polly/test/ScopInfo/modulo_zext_2.ll b/polly/test/ScopInfo/modulo_zext_2.ll
index 8d23218..7af2411 100644
--- a/polly/test/ScopInfo/modulo_zext_2.ll
+++ b/polly/test/ScopInfo/modulo_zext_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Assumed Context:
; CHECK-NEXT: [N] -> { : }
diff --git a/polly/test/ScopInfo/modulo_zext_3.ll b/polly/test/ScopInfo/modulo_zext_3.ll
index acb26dc..1dac723 100644
--- a/polly/test/ScopInfo/modulo_zext_3.ll
+++ b/polly/test/ScopInfo/modulo_zext_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Assumed Context:
; CHECK-NEXT: [N] -> { : }
diff --git a/polly/test/ScopInfo/multi-scop.ll b/polly/test/ScopInfo/multi-scop.ll
index e26c8c7..c6dc1f2 100644
--- a/polly/test/ScopInfo/multi-scop.ll
+++ b/polly/test/ScopInfo/multi-scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -polly-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; This test case contains two scops.
diff --git a/polly/test/ScopInfo/multidim_2d-diagonal-matrix.ll b/polly/test/ScopInfo/multidim_2d-diagonal-matrix.ll
index 278c06a..bd46532 100644
--- a/polly/test/ScopInfo/multidim_2d-diagonal-matrix.ll
+++ b/polly/test/ScopInfo/multidim_2d-diagonal-matrix.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-function-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; Derived from the following code:
diff --git a/polly/test/ScopInfo/multidim_2d_outer_parametric_offset.ll b/polly/test/ScopInfo/multidim_2d_outer_parametric_offset.ll
index 06a7646..cdd4630 100644
--- a/polly/test/ScopInfo/multidim_2d_outer_parametric_offset.ll
+++ b/polly/test/ScopInfo/multidim_2d_outer_parametric_offset.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; Derived from the following code:
diff --git a/polly/test/ScopInfo/multidim_2d_parametric_array_static_loop_bounds.ll b/polly/test/ScopInfo/multidim_2d_parametric_array_static_loop_bounds.ll
index bfbe568..0b735b9 100644
--- a/polly/test/ScopInfo/multidim_2d_parametric_array_static_loop_bounds.ll
+++ b/polly/test/ScopInfo/multidim_2d_parametric_array_static_loop_bounds.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; Derived from the following code:
diff --git a/polly/test/ScopInfo/multidim_2d_with_modref_call.ll b/polly/test/ScopInfo/multidim_2d_with_modref_call.ll
index ba934adb..befca87 100644
--- a/polly/test/ScopInfo/multidim_2d_with_modref_call.ll
+++ b/polly/test/ScopInfo/multidim_2d_with_modref_call.ll
@@ -1,9 +1,9 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -polly-allow-modref-calls \
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
; RUN: -polly-invariant-load-hoisting=true \
-; RUN: -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -polly-allow-nonaffine \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-nonaffine \
; RUN: -polly-invariant-load-hoisting=true \
-; RUN: -polly-allow-modref-calls -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE
+; RUN: -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
; TODO: We should delinearize the accesses despite the use in a call to a
; readonly function. For now we verify we do not delinearize them though.
diff --git a/polly/test/ScopInfo/multidim_2d_with_modref_call_2.ll b/polly/test/ScopInfo/multidim_2d_with_modref_call_2.ll
index 3da123f..cceb535 100644
--- a/polly/test/ScopInfo/multidim_2d_with_modref_call_2.ll
+++ b/polly/test/ScopInfo/multidim_2d_with_modref_call_2.ll
@@ -1,9 +1,9 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -polly-allow-modref-calls \
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
; RUN: -polly-invariant-load-hoisting=true \
-; RUN: -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -polly-allow-nonaffine \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-nonaffine \
; RUN: -polly-invariant-load-hoisting=true \
-; RUN: -polly-allow-modref-calls -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE
+; RUN: -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
; TODO: We should delinearize the accesses despite the use in a call to a
; readonly function. For now we verify we do not delinearize them though.
diff --git a/polly/test/ScopInfo/multidim_3d_parametric_array_static_loop_bounds.ll b/polly/test/ScopInfo/multidim_3d_parametric_array_static_loop_bounds.ll
index 988475575..c957dd1 100644
--- a/polly/test/ScopInfo/multidim_3d_parametric_array_static_loop_bounds.ll
+++ b/polly/test/ScopInfo/multidim_3d_parametric_array_static_loop_bounds.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; void foo(long n, long m, long o, double A[n][m][o]) {
diff --git a/polly/test/ScopInfo/multidim_fixedsize_different_dimensionality.ll b/polly/test/ScopInfo/multidim_fixedsize_different_dimensionality.ll
index ddc35a4..4a1ee3b 100644
--- a/polly/test/ScopInfo/multidim_fixedsize_different_dimensionality.ll
+++ b/polly/test/ScopInfo/multidim_fixedsize_different_dimensionality.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; #define N 400
;
diff --git a/polly/test/ScopInfo/multidim_fixedsize_multi_offset.ll b/polly/test/ScopInfo/multidim_fixedsize_multi_offset.ll
index 9c749f0..9a6d8fb 100644
--- a/polly/test/ScopInfo/multidim_fixedsize_multi_offset.ll
+++ b/polly/test/ScopInfo/multidim_fixedsize_multi_offset.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Context:
; CHECK-NEXT: { : }
diff --git a/polly/test/ScopInfo/multidim_fold_constant_dim.ll b/polly/test/ScopInfo/multidim_fold_constant_dim.ll
index e95d400..9f47694 100644
--- a/polly/test/ScopInfo/multidim_fold_constant_dim.ll
+++ b/polly/test/ScopInfo/multidim_fold_constant_dim.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; struct com {
; double Real;
diff --git a/polly/test/ScopInfo/multidim_fold_constant_dim_zero.ll b/polly/test/ScopInfo/multidim_fold_constant_dim_zero.ll
index 57275e4..5778126 100644
--- a/polly/test/ScopInfo/multidim_fold_constant_dim_zero.ll
+++ b/polly/test/ScopInfo/multidim_fold_constant_dim_zero.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -debug -disable-output < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
diff --git a/polly/test/ScopInfo/multidim_fortran_2d.ll b/polly/test/ScopInfo/multidim_fortran_2d.ll
index 29279a4..e5b005f 100644
--- a/polly/test/ScopInfo/multidim_fortran_2d.ll
+++ b/polly/test/ScopInfo/multidim_fortran_2d.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops \
-; RUN: -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' \
+; RUN: -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
; subroutine init_array(ni, nj, pi, pj, a)
; implicit none
diff --git a/polly/test/ScopInfo/multidim_fortran_2d_params.ll b/polly/test/ScopInfo/multidim_fortran_2d_params.ll
index 93145b3..a7f7ebc 100644
--- a/polly/test/ScopInfo/multidim_fortran_2d_params.ll
+++ b/polly/test/ScopInfo/multidim_fortran_2d_params.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output \
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \
; RUN: -polly-precise-fold-accesses \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
; subroutine init_array(ni, nj, pi, pj, a)
; implicit none
diff --git a/polly/test/ScopInfo/multidim_fortran_2d_with_modref_call.ll b/polly/test/ScopInfo/multidim_fortran_2d_with_modref_call.ll
index dff6a8b..5f3080a 100644
--- a/polly/test/ScopInfo/multidim_fortran_2d_with_modref_call.ll
+++ b/polly/test/ScopInfo/multidim_fortran_2d_with_modref_call.ll
@@ -1,9 +1,9 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -polly-allow-modref-calls \
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
; RUN: -polly-invariant-load-hoisting=true \
-; RUN: -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -polly-allow-nonaffine \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-nonaffine \
; RUN: -polly-invariant-load-hoisting=true \
-; RUN: -polly-allow-modref-calls -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE
+; RUN: -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
; TODO: We should delinearize the accesses despite the use in a call to a
; readonly function. For now we verify we do not delinearize them though.
diff --git a/polly/test/ScopInfo/multidim_fortran_srem.ll b/polly/test/ScopInfo/multidim_fortran_srem.ll
index 8c24c5b..31cc633 100644
--- a/polly/test/ScopInfo/multidim_fortran_srem.ll
+++ b/polly/test/ScopInfo/multidim_fortran_srem.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
; CHECK: Statements {
diff --git a/polly/test/ScopInfo/multidim_gep_pointercast.ll b/polly/test/ScopInfo/multidim_gep_pointercast.ll
index 20d59fa..fd8048b 100644
--- a/polly/test/ScopInfo/multidim_gep_pointercast.ll
+++ b/polly/test/ScopInfo/multidim_gep_pointercast.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; The load access to A has a pointer-bitcast to another elements size before the
; GetElementPtr. Verify that we do not the GEP delinearization because it
diff --git a/polly/test/ScopInfo/multidim_gep_pointercast2.ll b/polly/test/ScopInfo/multidim_gep_pointercast2.ll
index deed9c7..b31a0d0 100644
--- a/polly/test/ScopInfo/multidim_gep_pointercast2.ll
+++ b/polly/test/ScopInfo/multidim_gep_pointercast2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Verfy that we do not use the GetElementPtr information to delinearize A
; because of the cast in-between. Use the single-dimensional modeling instead.
diff --git a/polly/test/ScopInfo/multidim_ivs_and_integer_offsets_3d.ll b/polly/test/ScopInfo/multidim_ivs_and_integer_offsets_3d.ll
index 9f7e6bc..92b42a9 100644
--- a/polly/test/ScopInfo/multidim_ivs_and_integer_offsets_3d.ll
+++ b/polly/test/ScopInfo/multidim_ivs_and_integer_offsets_3d.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; void foo(long n, long m, long o, double A[n][m][o]) {
diff --git a/polly/test/ScopInfo/multidim_ivs_and_parameteric_offsets_3d.ll b/polly/test/ScopInfo/multidim_ivs_and_parameteric_offsets_3d.ll
index 131bb7b..261cba1 100644
--- a/polly/test/ScopInfo/multidim_ivs_and_parameteric_offsets_3d.ll
+++ b/polly/test/ScopInfo/multidim_ivs_and_parameteric_offsets_3d.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-precise-fold-accesses -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-precise-fold-accesses '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; void foo(long n, long m, long o, double A[n][m][o], long p, long q, long r) {
diff --git a/polly/test/ScopInfo/multidim_many_references.ll b/polly/test/ScopInfo/multidim_many_references.ll
index b0483b2..3801fda 100644
--- a/polly/test/ScopInfo/multidim_many_references.ll
+++ b/polly/test/ScopInfo/multidim_many_references.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-ignore-aliasing -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-function-scops -polly-ignore-aliasing -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-ignore-aliasing -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-ignore-aliasing -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/ScopInfo/multidim_nested_start_integer.ll b/polly/test/ScopInfo/multidim_nested_start_integer.ll
index 741a0ef..6ee9798 100644
--- a/polly/test/ScopInfo/multidim_nested_start_integer.ll
+++ b/polly/test/ScopInfo/multidim_nested_start_integer.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-function-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; void foo(long n, long m, long o, double A[n][m][o]) {
diff --git a/polly/test/ScopInfo/multidim_nested_start_share_parameter.ll b/polly/test/ScopInfo/multidim_nested_start_share_parameter.ll
index 692746b..e238bdd 100644
--- a/polly/test/ScopInfo/multidim_nested_start_share_parameter.ll
+++ b/polly/test/ScopInfo/multidim_nested_start_share_parameter.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; void foo(long n, long m, long o, double A[n][m][o]) {
diff --git a/polly/test/ScopInfo/multidim_only_ivs_2d.ll b/polly/test/ScopInfo/multidim_only_ivs_2d.ll
index 7124564..33b3217 100644
--- a/polly/test/ScopInfo/multidim_only_ivs_2d.ll
+++ b/polly/test/ScopInfo/multidim_only_ivs_2d.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; Derived from the following code:
diff --git a/polly/test/ScopInfo/multidim_only_ivs_3d.ll b/polly/test/ScopInfo/multidim_only_ivs_3d.ll
index a019d58..39ea424 100644
--- a/polly/test/ScopInfo/multidim_only_ivs_3d.ll
+++ b/polly/test/ScopInfo/multidim_only_ivs_3d.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; void foo(long n, long m, long o, double A[n][m][o]) {
diff --git a/polly/test/ScopInfo/multidim_only_ivs_3d_cast.ll b/polly/test/ScopInfo/multidim_only_ivs_3d_cast.ll
index 41577ef..7f7f7f9 100644
--- a/polly/test/ScopInfo/multidim_only_ivs_3d_cast.ll
+++ b/polly/test/ScopInfo/multidim_only_ivs_3d_cast.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; void foo(int n, int m, int o, double A[n][m][o]) {
;
diff --git a/polly/test/ScopInfo/multidim_only_ivs_3d_reverse.ll b/polly/test/ScopInfo/multidim_only_ivs_3d_reverse.ll
index 25907f2..1675110 100644
--- a/polly/test/ScopInfo/multidim_only_ivs_3d_reverse.ll
+++ b/polly/test/ScopInfo/multidim_only_ivs_3d_reverse.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; This test case checks for array access functions where the order in which the
diff --git a/polly/test/ScopInfo/multidim_param_in_subscript-2.ll b/polly/test/ScopInfo/multidim_param_in_subscript-2.ll
index 0790664..da9827f 100644
--- a/polly/test/ScopInfo/multidim_param_in_subscript-2.ll
+++ b/polly/test/ScopInfo/multidim_param_in_subscript-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-precise-fold-accesses -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-precise-fold-accesses '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void foo(long n, long m, float A[][n][m]) {
; for (long i = 0; i < 100; i++)
diff --git a/polly/test/ScopInfo/multidim_param_in_subscript.ll b/polly/test/ScopInfo/multidim_param_in_subscript.ll
index b8ec80b..c86b5f0 100644
--- a/polly/test/ScopInfo/multidim_param_in_subscript.ll
+++ b/polly/test/ScopInfo/multidim_param_in_subscript.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
;
; void foo(long n, float A[][n]) {
diff --git a/polly/test/ScopInfo/multidim_parameter_addrec_product.ll b/polly/test/ScopInfo/multidim_parameter_addrec_product.ll
index 7db3e9d..da563a0 100644
--- a/polly/test/ScopInfo/multidim_parameter_addrec_product.ll
+++ b/polly/test/ScopInfo/multidim_parameter_addrec_product.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-invariant-load-hoisting=true -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void foo(float *A, long *p) {
; for (long i = 0; i < 100; i++)
diff --git a/polly/test/ScopInfo/multidim_single_and_multidim_array.ll b/polly/test/ScopInfo/multidim_single_and_multidim_array.ll
index 1e302de..7059e53 100644
--- a/polly/test/ScopInfo/multidim_single_and_multidim_array.ll
+++ b/polly/test/ScopInfo/multidim_single_and_multidim_array.ll
@@ -1,11 +1,11 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-delinearize=false -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-scops -polly-delinearize=false -polly-allow-nonaffine -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=DELIN
-; RUN: opt %loadPolly -polly-print-scops -polly-allow-nonaffine -disable-output < %s | FileCheck %s --check-prefix=DELIN
-; RUN: opt %loadPolly -polly-print-function-scops -polly-delinearize=false -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-function-scops -polly-delinearize=false -polly-allow-nonaffine -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE
-; RUN: opt %loadPolly -polly-print-function-scops -disable-output < %s | FileCheck %s --check-prefix=DELIN
-; RUN: opt %loadPolly -polly-print-function-scops -polly-allow-nonaffine -disable-output < %s | FileCheck %s --check-prefix=DELIN
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-delinearize=false -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-delinearize=false -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-delinearize=false -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-delinearize=false -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/ScopInfo/multidim_srem.ll b/polly/test/ScopInfo/multidim_srem.ll
index f89843f..c965e2c 100644
--- a/polly/test/ScopInfo/multidim_srem.ll
+++ b/polly/test/ScopInfo/multidim_srem.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void foo(long n, float A[][n][n]) {
; for (long i = 0; i < 200; i++)
diff --git a/polly/test/ScopInfo/multidim_with_bitcast.ll b/polly/test/ScopInfo/multidim_with_bitcast.ll
index b77ff68..0ab9c2d 100644
--- a/polly/test/ScopInfo/multidim_with_bitcast.ll
+++ b/polly/test/ScopInfo/multidim_with_bitcast.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/ScopInfo/multiple-binary-or-conditions.ll b/polly/test/ScopInfo/multiple-binary-or-conditions.ll
index b905a11..65416e6 100644
--- a/polly/test/ScopInfo/multiple-binary-or-conditions.ll
+++ b/polly/test/ScopInfo/multiple-binary-or-conditions.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s
;
; void or(float *A, long n, long m) {
; for (long i = 0; i < 100; i++) {
diff --git a/polly/test/ScopInfo/multiple-types-access-offset-not-dividable-by-element-size.ll b/polly/test/ScopInfo/multiple-types-access-offset-not-dividable-by-element-size.ll
index 2d03ad9..910e624 100644
--- a/polly/test/ScopInfo/multiple-types-access-offset-not-dividable-by-element-size.ll
+++ b/polly/test/ScopInfo/multiple-types-access-offset-not-dividable-by-element-size.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -pass-remarks-analysis="polly-scops" \
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -pass-remarks-analysis="polly-scops" \
; RUN: -polly-allow-differing-element-types \
; RUN: -disable-output < %s 2>&1 | FileCheck %s
;
diff --git a/polly/test/ScopInfo/multiple-types-non-affine-2.ll b/polly/test/ScopInfo/multiple-types-non-affine-2.ll
index 5b0aa5d..cb0630d 100644
--- a/polly/test/ScopInfo/multiple-types-non-affine-2.ll
+++ b/polly/test/ScopInfo/multiple-types-non-affine-2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types -polly-print-scops -polly-allow-nonaffine -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types -polly-codegen -polly-allow-nonaffine -disable-output
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types '-passes=print<polly-function-scops>' -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types -passes=polly-codegen -polly-allow-nonaffine -disable-output
;
; // Check that accessing one array with different types works,
; // even though some accesses are non-affine.
diff --git a/polly/test/ScopInfo/multiple-types-non-affine.ll b/polly/test/ScopInfo/multiple-types-non-affine.ll
index 8e4be4c..7349c5a 100644
--- a/polly/test/ScopInfo/multiple-types-non-affine.ll
+++ b/polly/test/ScopInfo/multiple-types-non-affine.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types -polly-print-scops -polly-allow-nonaffine -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types -polly-codegen -polly-allow-nonaffine -disable-output
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types '-passes=print<polly-function-scops>' -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types -passes=polly-codegen -polly-allow-nonaffine -disable-output
;
; // Check that accessing one array with different types works,
; // even though some accesses are non-affine.
diff --git a/polly/test/ScopInfo/multiple-types-non-power-of-two-2.ll b/polly/test/ScopInfo/multiple-types-non-power-of-two-2.ll
index 01f5923..df280c8 100644
--- a/polly/test/ScopInfo/multiple-types-non-power-of-two-2.ll
+++ b/polly/test/ScopInfo/multiple-types-non-power-of-two-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-allow-differing-element-types -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s
;
; void multiple_types(i8 *A) {
; for (long i = 0; i < 100; i++) {
diff --git a/polly/test/ScopInfo/multiple-types-non-power-of-two.ll b/polly/test/ScopInfo/multiple-types-non-power-of-two.ll
index 142a5ac..b949418 100644
--- a/polly/test/ScopInfo/multiple-types-non-power-of-two.ll
+++ b/polly/test/ScopInfo/multiple-types-non-power-of-two.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-allow-differing-element-types -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s
;
; void multiple_types(i8 *A) {
; for (long i = 0; i < 100; i++) {
diff --git a/polly/test/ScopInfo/multiple-types-two-dimensional-2.ll b/polly/test/ScopInfo/multiple-types-two-dimensional-2.ll
index 1e2e53e..e971ccc 100644
--- a/polly/test/ScopInfo/multiple-types-two-dimensional-2.ll
+++ b/polly/test/ScopInfo/multiple-types-two-dimensional-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -pass-remarks-analysis="polly-scops" \
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -pass-remarks-analysis="polly-scops" \
; RUN: -polly-allow-differing-element-types \
; RUN: -disable-output < %s 2>&1 | FileCheck %s
;
diff --git a/polly/test/ScopInfo/multiple-types-two-dimensional.ll b/polly/test/ScopInfo/multiple-types-two-dimensional.ll
index 21dc96e..3417950 100644
--- a/polly/test/ScopInfo/multiple-types-two-dimensional.ll
+++ b/polly/test/ScopInfo/multiple-types-two-dimensional.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -pass-remarks-analysis="polly-scops" \
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -pass-remarks-analysis="polly-scops" \
; RUN: -polly-allow-differing-element-types \
; RUN: -disable-output < %s 2>&1 | FileCheck %s
;
diff --git a/polly/test/ScopInfo/multiple-types.ll b/polly/test/ScopInfo/multiple-types.ll
index 16db191..84d7d33 100644
--- a/polly/test/ScopInfo/multiple-types.ll
+++ b/polly/test/ScopInfo/multiple-types.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops \
-; RUN: -polly-allow-differing-element-types -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' \
+; RUN: -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s
;
; // Check that accessing one array with different types works.
; void multiple_types(char *Short, char *Float, char *Double) {
diff --git a/polly/test/ScopInfo/multiple_exiting_blocks.ll b/polly/test/ScopInfo/multiple_exiting_blocks.ll
index f8e5d410..b0c425e 100644
--- a/polly/test/ScopInfo/multiple_exiting_blocks.ll
+++ b/polly/test/ScopInfo/multiple_exiting_blocks.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; The SCoP contains a loop with multiple exit blocks (BBs after leaving
; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/multiple_exiting_blocks_two_loop.ll b/polly/test/ScopInfo/multiple_exiting_blocks_two_loop.ll
index c695f3c..ff0ec47 100644
--- a/polly/test/ScopInfo/multiple_exiting_blocks_two_loop.ll
+++ b/polly/test/ScopInfo/multiple_exiting_blocks_two_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; The SCoP contains a loop with multiple exit blocks (BBs after leaving
; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/multiple_latch_blocks.ll b/polly/test/ScopInfo/multiple_latch_blocks.ll
index d3949e7..e5085da 100644
--- a/polly/test/ScopInfo/multiple_latch_blocks.ll
+++ b/polly/test/ScopInfo/multiple_latch_blocks.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Domain :=
; CHECK: [N, P] -> { Stmt_if_end[i0] : 0 <= i0 < N and (i0 > P or i0 < P) };
diff --git a/polly/test/ScopInfo/nested-loops.ll b/polly/test/ScopInfo/nested-loops.ll
index ed814f8..9100297 100644
--- a/polly/test/ScopInfo/nested-loops.ll
+++ b/polly/test/ScopInfo/nested-loops.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
diff --git a/polly/test/ScopInfo/no-scalar-deps-in-non-affine-subregion.ll b/polly/test/ScopInfo/no-scalar-deps-in-non-affine-subregion.ll
index 7c55e24..df01084 100644
--- a/polly/test/ScopInfo/no-scalar-deps-in-non-affine-subregion.ll
+++ b/polly/test/ScopInfo/no-scalar-deps-in-non-affine-subregion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that we do not generate any scalar dependences regarding x. It is
; defined and used on the non-affine subregion only, thus we do not need
diff --git a/polly/test/ScopInfo/non-affine-region-phi.ll b/polly/test/ScopInfo/non-affine-region-phi.ll
index f99782b..3fb655e 100644
--- a/polly/test/ScopInfo/non-affine-region-phi.ll
+++ b/polly/test/ScopInfo/non-affine-region-phi.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine -S < %s | FileCheck %s --check-prefix=CODE
-; RUN: opt %loadPolly -polly-allow-nonaffine -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine -S < %s 2>&1 | FileCheck %s --check-prefix=CODE
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Verify there is a phi in the non-affine region but it is not represented in
; the SCoP as all operands as well as the uses are inside the region too.
diff --git a/polly/test/ScopInfo/non-affine-region-with-loop-2.ll b/polly/test/ScopInfo/non-affine-region-with-loop-2.ll
index b673fda..4c3ca4d 100644
--- a/polly/test/ScopInfo/non-affine-region-with-loop-2.ll
+++ b/polly/test/ScopInfo/non-affine-region-with-loop-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-allow-nonaffine-loops -polly-print-scops -polly-codegen -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-nonaffine-loops '-passes=print<polly-detect>,print<polly-function-scops>,scop(polly-codegen)' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Stmt_loop3
; CHECK: Domain :=
diff --git a/polly/test/ScopInfo/non-affine-region-with-loop.ll b/polly/test/ScopInfo/non-affine-region-with-loop.ll
index 32dde8b..f4c028a 100644
--- a/polly/test/ScopInfo/non-affine-region-with-loop.ll
+++ b/polly/test/ScopInfo/non-affine-region-with-loop.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine-loops -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-allow-nonaffine-loops -polly-codegen -disable-output
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -passes=polly-codegen -disable-output
;
; CHECK: Domain :=
; CHECK-NEXT: { Stmt_loop2__TO__loop[] };
diff --git a/polly/test/ScopInfo/non-precise-inv-load-1.ll b/polly/test/ScopInfo/non-precise-inv-load-1.ll
index 5394206..d55344b 100644
--- a/polly/test/ScopInfo/non-precise-inv-load-1.ll
+++ b/polly/test/ScopInfo/non-precise-inv-load-1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
; Verify we do hoist the invariant access to I with a execution context
; as the address computation might wrap in the original but not in our
diff --git a/polly/test/ScopInfo/non-precise-inv-load-2.ll b/polly/test/ScopInfo/non-precise-inv-load-2.ll
index 5c0c565..79ef3b8 100644
--- a/polly/test/ScopInfo/non-precise-inv-load-2.ll
+++ b/polly/test/ScopInfo/non-precise-inv-load-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
;
; CHECK: Invariant Accesses: {
diff --git a/polly/test/ScopInfo/non-precise-inv-load-3.ll b/polly/test/ScopInfo/non-precise-inv-load-3.ll
index 09d0931..aa92847 100644
--- a/polly/test/ScopInfo/non-precise-inv-load-3.ll
+++ b/polly/test/ScopInfo/non-precise-inv-load-3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Invariant Accesses: {
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/non-precise-inv-load-4.ll b/polly/test/ScopInfo/non-precise-inv-load-4.ll
index da5f656..2a2241c 100644
--- a/polly/test/ScopInfo/non-precise-inv-load-4.ll
+++ b/polly/test/ScopInfo/non-precise-inv-load-4.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
; Verify we hoist I[0] without execution context even though it
; is executed in a statement with an invalid domain.
diff --git a/polly/test/ScopInfo/non-precise-inv-load-5.ll b/polly/test/ScopInfo/non-precise-inv-load-5.ll
index bff5f59..a414c7c 100644
--- a/polly/test/ScopInfo/non-precise-inv-load-5.ll
+++ b/polly/test/ScopInfo/non-precise-inv-load-5.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
; Verify we do not hoist I[c] without execution context because it
; is executed in a statement with an invalid domain and it depends
diff --git a/polly/test/ScopInfo/non-precise-inv-load-6.ll b/polly/test/ScopInfo/non-precise-inv-load-6.ll
index 03540a8..1300617 100644
--- a/polly/test/ScopInfo/non-precise-inv-load-6.ll
+++ b/polly/test/ScopInfo/non-precise-inv-load-6.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
; Check that we model the execution context correctly.
;
diff --git a/polly/test/ScopInfo/non-pure-function-call.ll b/polly/test/ScopInfo/non-pure-function-call.ll
index 4ffb8d2..81d43db 100644
--- a/polly/test/ScopInfo/non-pure-function-call.ll
+++ b/polly/test/ScopInfo/non-pure-function-call.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Assumed Context:
; CHECK-NEXT: [N] -> { : }
diff --git a/polly/test/ScopInfo/non-pure-function-calls-causes-dead-blocks.ll b/polly/test/ScopInfo/non-pure-function-calls-causes-dead-blocks.ll
index 27998b5..6cbb410 100644
--- a/polly/test/ScopInfo/non-pure-function-calls-causes-dead-blocks.ll
+++ b/polly/test/ScopInfo/non-pure-function-calls-causes-dead-blocks.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Error blocks are skipped during SCoP detection. We skip them during
; SCoP formation too as they might contain instructions we can not handle.
diff --git a/polly/test/ScopInfo/non-pure-function-calls.ll b/polly/test/ScopInfo/non-pure-function-calls.ll
index 3ecf758..f976440 100644
--- a/polly/test/ScopInfo/non-pure-function-calls.ll
+++ b/polly/test/ScopInfo/non-pure-function-calls.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Allow the user to define function names that are treated as
; error functions and assumed not to be executed.
diff --git a/polly/test/ScopInfo/non_affine_access.ll b/polly/test/ScopInfo/non_affine_access.ll
index a83c948..0338edf 100644
--- a/polly/test/ScopInfo/non_affine_access.ll
+++ b/polly/test/ScopInfo/non_affine_access.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-scops -polly-allow-nonaffine -disable-output < %s | FileCheck %s -check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NONAFFINE
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; void foo(long *A) {
diff --git a/polly/test/ScopInfo/non_affine_region_1.ll b/polly/test/ScopInfo/non_affine_region_1.ll
index 7c43125..8980a71 100644
--- a/polly/test/ScopInfo/non_affine_region_1.ll
+++ b/polly/test/ScopInfo/non_affine_region_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Verify only the incoming scalar x is modeled as a read in the non-affine
; region.
diff --git a/polly/test/ScopInfo/non_affine_region_2.ll b/polly/test/ScopInfo/non_affine_region_2.ll
index 0bc467c..b2e072f 100644
--- a/polly/test/ScopInfo/non_affine_region_2.ll
+++ b/polly/test/ScopInfo/non_affine_region_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Verify the scalar x defined in a non-affine subregion is written as it
; escapes the region. In this test the two conditionals inside the region
diff --git a/polly/test/ScopInfo/non_affine_region_3.ll b/polly/test/ScopInfo/non_affine_region_3.ll
index 6d5f94df..d850cb5 100644
--- a/polly/test/ScopInfo/non_affine_region_3.ll
+++ b/polly/test/ScopInfo/non_affine_region_3.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-function-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Verify the scalar x defined in a non-affine subregion is written as it
; escapes the region. In this test the two conditionals inside the region
diff --git a/polly/test/ScopInfo/non_affine_region_4.ll b/polly/test/ScopInfo/non_affine_region_4.ll
index f37e0ec..c530973 100644
--- a/polly/test/ScopInfo/non_affine_region_4.ll
+++ b/polly/test/ScopInfo/non_affine_region_4.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Verify that both scalars (x and y) are properly written in the non-affine
; region and read afterwards.
diff --git a/polly/test/ScopInfo/nonaffine-buildMemoryAccess.ll b/polly/test/ScopInfo/nonaffine-buildMemoryAccess.ll
index 445dd16..b1ce00f0 100644
--- a/polly/test/ScopInfo/nonaffine-buildMemoryAccess.ll
+++ b/polly/test/ScopInfo/nonaffine-buildMemoryAccess.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine-loops -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Domain :=
; CHECK-NEXT: { Stmt_while_cond_i__TO__while_end_i[] };
diff --git a/polly/test/ScopInfo/not-a-reduction.ll b/polly/test/ScopInfo/not-a-reduction.ll
index 8790929..3a961b2 100644
--- a/polly/test/ScopInfo/not-a-reduction.ll
+++ b/polly/test/ScopInfo/not-a-reduction.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s 2>&1 | not FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | not FileCheck %s
;#define TYPE float
;#define NUM 4
diff --git a/polly/test/ScopInfo/opaque-struct.ll b/polly/test/ScopInfo/opaque-struct.ll
index 19fdd9b..f4f7952 100644
--- a/polly/test/ScopInfo/opaque-struct.ll
+++ b/polly/test/ScopInfo/opaque-struct.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-scops -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s
;
; Check that we do not crash with unsized (opaque) types.
;
diff --git a/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node-nonaffine-subregion.ll b/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node-nonaffine-subregion.ll
index 394173b..eed27b1 100644
--- a/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node-nonaffine-subregion.ll
+++ b/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node-nonaffine-subregion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s 2>&1 | FileCheck %s
;
; Check whether %newval is identified as an escaping value, even though it is used
; in a phi that is in the region. Non-affine subregion case.
diff --git a/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node.ll b/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node.ll
index e17164e..44da399 100644
--- a/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node.ll
+++ b/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; CHECK: MustWriteAccess := [Reduction Type: NONE] [Scalar: 1]
; CHECK-NEXT: [p_0] -> { Stmt_bb3[] -> MemRef_tmp5[] };
diff --git a/polly/test/ScopInfo/parameter-constant-division.ll b/polly/test/ScopInfo/parameter-constant-division.ll
index cd6b9e3..e5dd359 100644
--- a/polly/test/ScopInfo/parameter-constant-division.ll
+++ b/polly/test/ScopInfo/parameter-constant-division.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops \
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' \
; RUN: -polly-invariant-load-hoisting=true \
-; RUN: -disable-output < %s | FileCheck %s
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Invariant Accesses: {
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/parameter_in_dead_statement.ll b/polly/test/ScopInfo/parameter_in_dead_statement.ll
index 4b4a87f0..b295f17 100644
--- a/polly/test/ScopInfo/parameter_in_dead_statement.ll
+++ b/polly/test/ScopInfo/parameter_in_dead_statement.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -S \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \
+; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
+; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s --check-prefix=IR
;
; Verify we do not create assumptions based on the parameter p_1, which is the
; load %0 and, due to error-assumptions, not "part of the SCoP".
diff --git a/polly/test/ScopInfo/parameter_product.ll b/polly/test/ScopInfo/parameter_product.ll
index 1ba7280..2fe16f9 100644
--- a/polly/test/ScopInfo/parameter_product.ll
+++ b/polly/test/ScopInfo/parameter_product.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; int n, m;
; void foo(char* __restrict a)
diff --git a/polly/test/ScopInfo/parameter_with_constant_factor_in_add.ll b/polly/test/ScopInfo/parameter_with_constant_factor_in_add.ll
index 72d5808..6544aae 100644
--- a/polly/test/ScopInfo/parameter_with_constant_factor_in_add.ll
+++ b/polly/test/ScopInfo/parameter_with_constant_factor_in_add.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that the access function of the store is simple and concise
;
diff --git a/polly/test/ScopInfo/partially_invariant_load_1.ll b/polly/test/ScopInfo/partially_invariant_load_1.ll
index 274a787..f3923f61 100644
--- a/polly/test/ScopInfo/partially_invariant_load_1.ll
+++ b/polly/test/ScopInfo/partially_invariant_load_1.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=IR
;
; CHECK: Invariant Accesses: {
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/partially_invariant_load_2.ll b/polly/test/ScopInfo/partially_invariant_load_2.ll
index ee10928..d0d74ad 100644
--- a/polly/test/ScopInfo/partially_invariant_load_2.ll
+++ b/polly/test/ScopInfo/partially_invariant_load_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
; Check that we do not try to preload *I and assume p != 42.
;
diff --git a/polly/test/ScopInfo/phi-in-non-affine-region.ll b/polly/test/ScopInfo/phi-in-non-affine-region.ll
index 6ef24e3..fbbc158 100644
--- a/polly/test/ScopInfo/phi-in-non-affine-region.ll
+++ b/polly/test/ScopInfo/phi-in-non-affine-region.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; Verify that 'tmp' is stored in bb1 and read by bb3, as it is needed as
; an incoming value for the tmp11 PHI node.
diff --git a/polly/test/ScopInfo/phi_after_error_block.ll b/polly/test/ScopInfo/phi_after_error_block.ll
index 039fb86..a1eadff 100644
--- a/polly/test/ScopInfo/phi_after_error_block.ll
+++ b/polly/test/ScopInfo/phi_after_error_block.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
declare void @bar()
diff --git a/polly/test/ScopInfo/phi_condition_modeling_1.ll b/polly/test/ScopInfo/phi_condition_modeling_1.ll
index a879c200..a889ec9 100644
--- a/polly/test/ScopInfo/phi_condition_modeling_1.ll
+++ b/polly/test/ScopInfo/phi_condition_modeling_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int *A, int c, int N) {
; int tmp;
diff --git a/polly/test/ScopInfo/phi_condition_modeling_2.ll b/polly/test/ScopInfo/phi_condition_modeling_2.ll
index cedc140..b56b77e 100644
--- a/polly/test/ScopInfo/phi_condition_modeling_2.ll
+++ b/polly/test/ScopInfo/phi_condition_modeling_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int *A, int c, int N) {
; int tmp;
diff --git a/polly/test/ScopInfo/phi_conditional_simple_1.ll b/polly/test/ScopInfo/phi_conditional_simple_1.ll
index 90213a9..14fdc38 100644
--- a/polly/test/ScopInfo/phi_conditional_simple_1.ll
+++ b/polly/test/ScopInfo/phi_conditional_simple_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void jd(int *A, int c) {
; for (int i = 0; i < 1024; i++) {
diff --git a/polly/test/ScopInfo/phi_loop_carried_float.ll b/polly/test/ScopInfo/phi_loop_carried_float.ll
index d8d26083..76e5507 100644
--- a/polly/test/ScopInfo/phi_loop_carried_float.ll
+++ b/polly/test/ScopInfo/phi_loop_carried_float.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; float f(float *A, int N) {
; float tmp = 0;
diff --git a/polly/test/ScopInfo/phi_not_grouped_at_top.ll b/polly/test/ScopInfo/phi_not_grouped_at_top.ll
index be08216..c97d9a27 100644
--- a/polly/test/ScopInfo/phi_not_grouped_at_top.ll
+++ b/polly/test/ScopInfo/phi_not_grouped_at_top.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-prepare -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-prepare -disable-output < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
declare i32 @funa() align 2
diff --git a/polly/test/ScopInfo/phi_scalar_simple_1.ll b/polly/test/ScopInfo/phi_scalar_simple_1.ll
index d042613..ffd1a37 100644
--- a/polly/test/ScopInfo/phi_scalar_simple_1.ll
+++ b/polly/test/ScopInfo/phi_scalar_simple_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; The assumed context should be empty since the <nsw> flags on the IV
; increments already guarantee that there is no wrap in the loop trip
diff --git a/polly/test/ScopInfo/phi_scalar_simple_2.ll b/polly/test/ScopInfo/phi_scalar_simple_2.ll
index fb4292e..0d6d902 100644
--- a/polly/test/ScopInfo/phi_scalar_simple_2.ll
+++ b/polly/test/ScopInfo/phi_scalar_simple_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; int jd(int *restrict A, int x, int N, int c) {
; for (int i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/phi_with_invoke_edge.ll b/polly/test/ScopInfo/phi_with_invoke_edge.ll
index dbcf04c..9c98ec0 100644
--- a/polly/test/ScopInfo/phi_with_invoke_edge.ll
+++ b/polly/test/ScopInfo/phi_with_invoke_edge.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-detect -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
declare i32 @generic_personality_v0(i32, i64, ptr, ptr)
diff --git a/polly/test/ScopInfo/pointer-comparison-no-nsw.ll b/polly/test/ScopInfo/pointer-comparison-no-nsw.ll
index 094c5cc..18ba18c 100644
--- a/polly/test/ScopInfo/pointer-comparison-no-nsw.ll
+++ b/polly/test/ScopInfo/pointer-comparison-no-nsw.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int *A, int *B) {
; while (A != B) {
diff --git a/polly/test/ScopInfo/pointer-comparison.ll b/polly/test/ScopInfo/pointer-comparison.ll
index 15ce049..846640a 100644
--- a/polly/test/ScopInfo/pointer-comparison.ll
+++ b/polly/test/ScopInfo/pointer-comparison.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; TODO: FIXME: Investigate why we need an InvalidContext here.
;
diff --git a/polly/test/ScopInfo/pointer-type-expressions.ll b/polly/test/ScopInfo/pointer-type-expressions.ll
index ebbb644..89dce65 100644
--- a/polly/test/ScopInfo/pointer-type-expressions.ll
+++ b/polly/test/ScopInfo/pointer-type-expressions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; void f(int a[], int N, float *P) {
; int i;
diff --git a/polly/test/ScopInfo/pointer-used-as-base-pointer-and-scalar-read.ll b/polly/test/ScopInfo/pointer-used-as-base-pointer-and-scalar-read.ll
index 3ac86a3..7b6d0d5 100644
--- a/polly/test/ScopInfo/pointer-used-as-base-pointer-and-scalar-read.ll
+++ b/polly/test/ScopInfo/pointer-used-as-base-pointer-and-scalar-read.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; In this test case we pass a pointer %A into a PHI node and also use this
; pointer as base pointer of an array store. As a result, we get both scalar
diff --git a/polly/test/ScopInfo/polly-timeout-parameter-bounds.ll b/polly/test/ScopInfo/polly-timeout-parameter-bounds.ll
index 8152010..13087a5 100644
--- a/polly/test/ScopInfo/polly-timeout-parameter-bounds.ll
+++ b/polly/test/ScopInfo/polly-timeout-parameter-bounds.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; CHECK: Statements {
; CHECK-NEXT: Stmt_bb9
diff --git a/polly/test/ScopInfo/preserve-equiv-class-order-in-basic_block.ll b/polly/test/ScopInfo/preserve-equiv-class-order-in-basic_block.ll
index 4a68acd..33fa012 100644
--- a/polly/test/ScopInfo/preserve-equiv-class-order-in-basic_block.ll
+++ b/polly/test/ScopInfo/preserve-equiv-class-order-in-basic_block.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/ScopInfo/process_added_dimensions.ll b/polly/test/ScopInfo/process_added_dimensions.ll
index 6cb270a..2d06f4b 100644
--- a/polly/test/ScopInfo/process_added_dimensions.ll
+++ b/polly/test/ScopInfo/process_added_dimensions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; CHECK: Context:
; CHECK-NEXT: { : }
diff --git a/polly/test/ScopInfo/pwaff-complexity-bailout.ll b/polly/test/ScopInfo/pwaff-complexity-bailout.ll
index 19dd156..931e08f 100644
--- a/polly/test/ScopInfo/pwaff-complexity-bailout.ll
+++ b/polly/test/ScopInfo/pwaff-complexity-bailout.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-scops -pass-remarks-analysis=.* -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -pass-remarks-analysis=.* -disable-output < %s 2>&1 | FileCheck %s
; Make sure we hit the complexity bailout, and don't crash.
; CHECK: Low complexity assumption: { : false }
diff --git a/polly/test/ScopInfo/ranged_parameter.ll b/polly/test/ScopInfo/ranged_parameter.ll
index 4b04960e..03562b1 100644
--- a/polly/test/ScopInfo/ranged_parameter.ll
+++ b/polly/test/ScopInfo/ranged_parameter.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that the constraints on the parameter derived from the
; range metadata (see bottom of the file) are present:
diff --git a/polly/test/ScopInfo/ranged_parameter_2.ll b/polly/test/ScopInfo/ranged_parameter_2.ll
index cd7d2bf..18cbbf3 100644
--- a/polly/test/ScopInfo/ranged_parameter_2.ll
+++ b/polly/test/ScopInfo/ranged_parameter_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output -polly-allow-nonaffine -polly-invariant-load-hoisting=true < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-allow-nonaffine -polly-invariant-load-hoisting=true < %s \
; RUN: -debug 2>&1 | FileCheck %s
; REQUIRES: asserts
diff --git a/polly/test/ScopInfo/ranged_parameter_wrap.ll b/polly/test/ScopInfo/ranged_parameter_wrap.ll
index 1737463..d236eee 100644
--- a/polly/test/ScopInfo/ranged_parameter_wrap.ll
+++ b/polly/test/ScopInfo/ranged_parameter_wrap.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that the constraints on the parameter derived from the
; __wrapping__ range metadata (see bottom of the file) are present:
diff --git a/polly/test/ScopInfo/ranged_parameter_wrap_2.ll b/polly/test/ScopInfo/ranged_parameter_wrap_2.ll
index 33f57f3..fc0a737 100644
--- a/polly/test/ScopInfo/ranged_parameter_wrap_2.ll
+++ b/polly/test/ScopInfo/ranged_parameter_wrap_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that the context is built fast and does not explode due to us
; combining a large number of non-convex ranges. Instead, after a certain
diff --git a/polly/test/ScopInfo/read-only-scalar-used-in-phi-2.ll b/polly/test/ScopInfo/read-only-scalar-used-in-phi-2.ll
index 23c7aa2..7e6f240 100644
--- a/polly/test/ScopInfo/read-only-scalar-used-in-phi-2.ll
+++ b/polly/test/ScopInfo/read-only-scalar-used-in-phi-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; float foo(float sum, float A[]) {
;
diff --git a/polly/test/ScopInfo/read-only-scalar-used-in-phi.ll b/polly/test/ScopInfo/read-only-scalar-used-in-phi.ll
index 20f44c9..18e6c1f 100644
--- a/polly/test/ScopInfo/read-only-scalar-used-in-phi.ll
+++ b/polly/test/ScopInfo/read-only-scalar-used-in-phi.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; float foo(float sum, float A[]) {
;
diff --git a/polly/test/ScopInfo/read-only-scalars.ll b/polly/test/ScopInfo/read-only-scalars.ll
index 71c2d21..f04163e 100644
--- a/polly/test/ScopInfo/read-only-scalars.ll
+++ b/polly/test/ScopInfo/read-only-scalars.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-analyze-read-only-scalars=false -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-analyze-read-only-scalars=true -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCALARS
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-analyze-read-only-scalars=false '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-analyze-read-only-scalars=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCALARS
; CHECK-NOT: Memref_scalar
diff --git a/polly/test/ScopInfo/read-only-statements.ll b/polly/test/ScopInfo/read-only-statements.ll
index a93063e..7bac53a 100644
--- a/polly/test/ScopInfo/read-only-statements.ll
+++ b/polly/test/ScopInfo/read-only-statements.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that we remove read-only statements.
;
diff --git a/polly/test/ScopInfo/reduction_alternating_base.ll b/polly/test/ScopInfo/reduction_alternating_base.ll
index 854e280..e38ff60 100644
--- a/polly/test/ScopInfo/reduction_alternating_base.ll
+++ b/polly/test/ScopInfo/reduction_alternating_base.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
;
; void f(int *A) {
diff --git a/polly/test/ScopInfo/reduction_chain_partially_outside_the_scop.ll b/polly/test/ScopInfo/reduction_chain_partially_outside_the_scop.ll
index fb02749..17f9dc5 100644
--- a/polly/test/ScopInfo/reduction_chain_partially_outside_the_scop.ll
+++ b/polly/test/ScopInfo/reduction_chain_partially_outside_the_scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Reduction Type: NONE
;
diff --git a/polly/test/ScopInfo/reduction_different_index.ll b/polly/test/ScopInfo/reduction_different_index.ll
index 575e5a1..d2786d5 100644
--- a/polly/test/ScopInfo/reduction_different_index.ll
+++ b/polly/test/ScopInfo/reduction_different_index.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; Verify that the following case is not detected as a reduction.
;
; void f(int *A,int *sum) {
diff --git a/polly/test/ScopInfo/reduction_different_index1.ll b/polly/test/ScopInfo/reduction_different_index1.ll
index 39bd3c4..710ae3e 100644
--- a/polly/test/ScopInfo/reduction_different_index1.ll
+++ b/polly/test/ScopInfo/reduction_different_index1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; Verify that the following case is not detected as a reduction.
;
; void f(int *A, int *sum, int i1, int i2) {
diff --git a/polly/test/ScopInfo/reduction_disabled_multiplicative.ll b/polly/test/ScopInfo/reduction_disabled_multiplicative.ll
index 7120740..61228e0 100644
--- a/polly/test/ScopInfo/reduction_disabled_multiplicative.ll
+++ b/polly/test/ScopInfo/reduction_disabled_multiplicative.ll
@@ -1,4 +1,4 @@
-; RUN: opt -basic-aa %loadPolly -polly-stmt-granularity=bb -polly-print-scops -polly-disable-multiplicative-reductions -disable-output < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-disable-multiplicative-reductions -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: ReadAccess := [Reduction Type: +
; CHECK: { Stmt_for_body[i0] -> MemRef_sum[0] };
diff --git a/polly/test/ScopInfo/reduction_escaping_intermediate.ll b/polly/test/ScopInfo/reduction_escaping_intermediate.ll
index dde0910..c66a8be 100644
--- a/polly/test/ScopInfo/reduction_escaping_intermediate.ll
+++ b/polly/test/ScopInfo/reduction_escaping_intermediate.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int N, int * restrict sums, int * restrict escape) {
; int i, j;
diff --git a/polly/test/ScopInfo/reduction_escaping_intermediate_2.ll b/polly/test/ScopInfo/reduction_escaping_intermediate_2.ll
index 702fc56..c574d31 100644
--- a/polly/test/ScopInfo/reduction_escaping_intermediate_2.ll
+++ b/polly/test/ScopInfo/reduction_escaping_intermediate_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int N, int * restrict sums, int * restrict escape) {
; int i, j;
diff --git a/polly/test/ScopInfo/reduction_invalid_different_operators.ll b/polly/test/ScopInfo/reduction_invalid_different_operators.ll
index f47919d..9846f10 100644
--- a/polly/test/ScopInfo/reduction_invalid_different_operators.ll
+++ b/polly/test/ScopInfo/reduction_invalid_different_operators.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; int f() {
; int i, sum = 0, sth = 0;
diff --git a/polly/test/ScopInfo/reduction_invalid_overlapping_accesses.ll b/polly/test/ScopInfo/reduction_invalid_overlapping_accesses.ll
index be1d7b5..4d70e53 100644
--- a/polly/test/ScopInfo/reduction_invalid_overlapping_accesses.ll
+++ b/polly/test/ScopInfo/reduction_invalid_overlapping_accesses.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int *sums) {
; int i, j;
diff --git a/polly/test/ScopInfo/reduction_multiple_loops_array_sum.ll b/polly/test/ScopInfo/reduction_multiple_loops_array_sum.ll
index 8d20fa1..800eb20 100644
--- a/polly/test/ScopInfo/reduction_multiple_loops_array_sum.ll
+++ b/polly/test/ScopInfo/reduction_multiple_loops_array_sum.ll
@@ -1,4 +1,4 @@
-; RUN: opt -basic-aa %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Stmt_for_body
; CHECK: Reduction Type: *
diff --git a/polly/test/ScopInfo/reduction_multiple_loops_array_sum_1.ll b/polly/test/ScopInfo/reduction_multiple_loops_array_sum_1.ll
index 782332b..49ebdcb 100644
--- a/polly/test/ScopInfo/reduction_multiple_loops_array_sum_1.ll
+++ b/polly/test/ScopInfo/reduction_multiple_loops_array_sum_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt -basic-aa %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Stmt_for_body
; CHECK: Reduction Type: NONE
diff --git a/polly/test/ScopInfo/reduction_multiple_simple_binary.ll b/polly/test/ScopInfo/reduction_multiple_simple_binary.ll
index 0f1a3ad..77b71f4 100644
--- a/polly/test/ScopInfo/reduction_multiple_simple_binary.ll
+++ b/polly/test/ScopInfo/reduction_multiple_simple_binary.ll
@@ -1,4 +1,4 @@
-; RUN: opt -basic-aa %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: ReadAccess := [Reduction Type: NONE
; CHECK: { Stmt_for_body[i0] -> MemRef_A[1 + i0] };
diff --git a/polly/test/ScopInfo/reduction_non_overlapping_chains.ll b/polly/test/ScopInfo/reduction_non_overlapping_chains.ll
index 4e3f841..61aaa05 100644
--- a/polly/test/ScopInfo/reduction_non_overlapping_chains.ll
+++ b/polly/test/ScopInfo/reduction_non_overlapping_chains.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Reduction Type: +
; CHECK: Reduction Type: +
diff --git a/polly/test/ScopInfo/reduction_only_reduction_like_access.ll b/polly/test/ScopInfo/reduction_only_reduction_like_access.ll
index 0c61d63..fb6d236 100644
--- a/polly/test/ScopInfo/reduction_only_reduction_like_access.ll
+++ b/polly/test/ScopInfo/reduction_only_reduction_like_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Reduction Type: +
;
diff --git a/polly/test/ScopInfo/reduction_simple_fp.ll b/polly/test/ScopInfo/reduction_simple_fp.ll
index ba0a034..aa4cd00 100644
--- a/polly/test/ScopInfo/reduction_simple_fp.ll
+++ b/polly/test/ScopInfo/reduction_simple_fp.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Function: f_no_fast_math
; CHECK: Reduction Type: NONE
diff --git a/polly/test/ScopInfo/reduction_simple_w_constant.ll b/polly/test/ScopInfo/reduction_simple_w_constant.ll
index dc1f855..e385b66 100644
--- a/polly/test/ScopInfo/reduction_simple_w_constant.ll
+++ b/polly/test/ScopInfo/reduction_simple_w_constant.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Reduction Type: +
;
diff --git a/polly/test/ScopInfo/reduction_simple_w_iv.ll b/polly/test/ScopInfo/reduction_simple_w_iv.ll
index b6c3229..e22eccb 100644
--- a/polly/test/ScopInfo/reduction_simple_w_iv.ll
+++ b/polly/test/ScopInfo/reduction_simple_w_iv.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Reduction Type: +
;
diff --git a/polly/test/ScopInfo/reduction_two_identical_reads.ll b/polly/test/ScopInfo/reduction_two_identical_reads.ll
index 19d45a5..8f00954 100644
--- a/polly/test/ScopInfo/reduction_two_identical_reads.ll
+++ b/polly/test/ScopInfo/reduction_two_identical_reads.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-function-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Reduction Type: NONE
;
diff --git a/polly/test/ScopInfo/redundant_parameter_constraint.ll b/polly/test/ScopInfo/redundant_parameter_constraint.ll
index c9d9121..ad71f1f 100644
--- a/polly/test/ScopInfo/redundant_parameter_constraint.ll
+++ b/polly/test/ScopInfo/redundant_parameter_constraint.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; The constraint that r2 has to be bigger than r1 is implicitly contained in
; the domain, hence we do not want to see it explicitly.
diff --git a/polly/test/ScopInfo/region-with-instructions.ll b/polly/test/ScopInfo/region-with-instructions.ll
index 39d4a72..d472051 100644
--- a/polly/test/ScopInfo/region-with-instructions.ll
+++ b/polly/test/ScopInfo/region-with-instructions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-print-instructions -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-print-instructions -disable-output < %s 2>&1 | FileCheck %s
; CHECK: Statements {
; CHECK: Stmt_bb46
diff --git a/polly/test/ScopInfo/remarks.ll b/polly/test/ScopInfo/remarks.ll
index dcdeb58..2c173a3 100644
--- a/polly/test/ScopInfo/remarks.ll
+++ b/polly/test/ScopInfo/remarks.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -pass-remarks-analysis="polly-scops" -polly-scops \
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \
; RUN: -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: remark: test/ScopInfo/remarks.c:4:7: SCoP begins here.
diff --git a/polly/test/ScopInfo/required-invariant-loop-bounds.ll b/polly/test/ScopInfo/required-invariant-loop-bounds.ll
index 248acbe..abf0b0e 100644
--- a/polly/test/ScopInfo/required-invariant-loop-bounds.ll
+++ b/polly/test/ScopInfo/required-invariant-loop-bounds.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \
+; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
;
; CHECK: Invariant Accesses: {
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/restriction_in_dead_block.ll b/polly/test/ScopInfo/restriction_in_dead_block.ll
index 81d9b96..487c585 100644
--- a/polly/test/ScopInfo/restriction_in_dead_block.ll
+++ b/polly/test/ScopInfo/restriction_in_dead_block.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Verify we do not generate an empty invalid context only because the wrap
; in the second conditional will always happen if the block is executed.
diff --git a/polly/test/ScopInfo/run-time-check-many-array-disjuncts.ll b/polly/test/ScopInfo/run-time-check-many-array-disjuncts.ll
index d36da2b..702b7dc 100644
--- a/polly/test/ScopInfo/run-time-check-many-array-disjuncts.ll
+++ b/polly/test/ScopInfo/run-time-check-many-array-disjuncts.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 \
; RUN: | FileCheck %s -check-prefix=DETECT
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; DETECT: Valid Region for Scop: bb124 => bb176
;
diff --git a/polly/test/ScopInfo/run-time-check-many-parameters.ll b/polly/test/ScopInfo/run-time-check-many-parameters.ll
index 30f8d5f..559c38d 100644
--- a/polly/test/ScopInfo/run-time-check-many-parameters.ll
+++ b/polly/test/ScopInfo/run-time-check-many-parameters.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; A valid Scop would print the list of its statements; we check that we do not
; see that list.
diff --git a/polly/test/ScopInfo/run-time-check-many-piecewise-aliasing.ll b/polly/test/ScopInfo/run-time-check-many-piecewise-aliasing.ll
index 487c803..3cf4c40 100644
--- a/polly/test/ScopInfo/run-time-check-many-piecewise-aliasing.ll
+++ b/polly/test/ScopInfo/run-time-check-many-piecewise-aliasing.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 \
; RUN: | FileCheck %s -check-prefix=DETECT
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; DETECT: Valid Region for Scop: for => return
;
diff --git a/polly/test/ScopInfo/run-time-check-read-only-arrays.ll b/polly/test/ScopInfo/run-time-check-read-only-arrays.ll
index d590aaf..51ab814 100644
--- a/polly/test/ScopInfo/run-time-check-read-only-arrays.ll
+++ b/polly/test/ScopInfo/run-time-check-read-only-arrays.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void foo(float *A, float *B, float *C, long N) {
; for (long i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/same-base-address-scalar-and-array.ll b/polly/test/ScopInfo/same-base-address-scalar-and-array.ll
index a5f353e..dd809ba 100644
--- a/polly/test/ScopInfo/same-base-address-scalar-and-array.ll
+++ b/polly/test/ScopInfo/same-base-address-scalar-and-array.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Verify we introduce two ScopArrayInfo objects (or virtual arrays) for the %out variable
; as it is used as a memory base pointer (%0) but also as a scalar (%out.addr.0.lcssa).
diff --git a/polly/test/ScopInfo/scalar.ll b/polly/test/ScopInfo/scalar.ll
index c38eaa8..812d2fd 100644
--- a/polly/test/ScopInfo/scalar.ll
+++ b/polly/test/ScopInfo/scalar.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
diff --git a/polly/test/ScopInfo/scalar_dependence_cond_br.ll b/polly/test/ScopInfo/scalar_dependence_cond_br.ll
index 3303bfb..59549f3 100644
--- a/polly/test/ScopInfo/scalar_dependence_cond_br.ll
+++ b/polly/test/ScopInfo/scalar_dependence_cond_br.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output< %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output< %s 2>&1 | FileCheck %s
;
; void f(int *A, int c, int d) {
; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/ScopInfo/scalar_to_array.ll b/polly/test/ScopInfo/scalar_to_array.ll
index 5c27510..d64f169 100644
--- a/polly/test/ScopInfo/scalar_to_array.ll
+++ b/polly/test/ScopInfo/scalar_to_array.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -basic-aa -polly-print-function-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; ModuleID = 'scalar_to_array.ll'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll b/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll
index fc7a1bf..d14569c 100644
--- a/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll
+++ b/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; Derived from test-suite/SingleSource/UnitTests/Vector/SSE/sse.stepfft.c
diff --git a/polly/test/ScopInfo/scev-invalidated.ll b/polly/test/ScopInfo/scev-invalidated.ll
index 97fc5ec..6b9efd4 100644
--- a/polly/test/ScopInfo/scev-invalidated.ll
+++ b/polly/test/ScopInfo/scev-invalidated.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Region: %if.then6---%return
;
diff --git a/polly/test/ScopInfo/schedule-const-post-dominator-walk-2.ll b/polly/test/ScopInfo/schedule-const-post-dominator-walk-2.ll
index 2fdf7d6..6e2ed12 100644
--- a/polly/test/ScopInfo/schedule-const-post-dominator-walk-2.ll
+++ b/polly/test/ScopInfo/schedule-const-post-dominator-walk-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; The SCoP contains a loop with multiple exit blocks (BBs after leaving
; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/schedule-const-post-dominator-walk.ll b/polly/test/ScopInfo/schedule-const-post-dominator-walk.ll
index 9268585..d0e8a2a 100644
--- a/polly/test/ScopInfo/schedule-const-post-dominator-walk.ll
+++ b/polly/test/ScopInfo/schedule-const-post-dominator-walk.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; The SCoP contains a loop with multiple exit blocks (BBs after leaving
; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/schedule-constuction-endless-loop1.ll b/polly/test/ScopInfo/schedule-constuction-endless-loop1.ll
index 413d1d8..9ffc30f 100644
--- a/polly/test/ScopInfo/schedule-constuction-endless-loop1.ll
+++ b/polly/test/ScopInfo/schedule-constuction-endless-loop1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that we do not build a SCoP and do not crash.
;
diff --git a/polly/test/ScopInfo/schedule-constuction-endless-loop2.ll b/polly/test/ScopInfo/schedule-constuction-endless-loop2.ll
index be25447..65f2f99 100644
--- a/polly/test/ScopInfo/schedule-constuction-endless-loop2.ll
+++ b/polly/test/ScopInfo/schedule-constuction-endless-loop2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that we do not build a SCoP and do not crash.
;
diff --git a/polly/test/ScopInfo/schedule-incorrectly-contructed-in-case-of-infinite-loop.ll b/polly/test/ScopInfo/schedule-incorrectly-contructed-in-case-of-infinite-loop.ll
index ff339e0..7c36f8d 100644
--- a/polly/test/ScopInfo/schedule-incorrectly-contructed-in-case-of-infinite-loop.ll
+++ b/polly/test/ScopInfo/schedule-incorrectly-contructed-in-case-of-infinite-loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-process-unprofitable -polly-scops -disable-output < %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>' -disable-output < %s
;
; This test contains an infinite loop (bb13) and crashed the domain generation
; at some point. Just verify that it no longer does.
diff --git a/polly/test/ScopInfo/scop-affine-parameter-ordering.ll b/polly/test/ScopInfo/scop-affine-parameter-ordering.ll
index 24c028a..c8a234e 100644
--- a/polly/test/ScopInfo/scop-affine-parameter-ordering.ll
+++ b/polly/test/ScopInfo/scop-affine-parameter-ordering.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-i128:128-n8:16:32:64-S128"
target triple = "aarch64--linux-android"
diff --git a/polly/test/ScopInfo/sign_wrapped_set.ll b/polly/test/ScopInfo/sign_wrapped_set.ll
index 23c9c8a..93b63df 100644
--- a/polly/test/ScopInfo/sign_wrapped_set.ll
+++ b/polly/test/ScopInfo/sign_wrapped_set.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine -polly-process-unprofitable -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-process-unprofitable '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Domain :=
; CHECK-NEXT: [srcHeight] -> { Stmt_for_cond6_preheader_us[i0] : 0 <= i0 <= -3 + srcHeight };
diff --git a/polly/test/ScopInfo/simple_loop_1.ll b/polly/test/ScopInfo/simple_loop_1.ll
index 2c3481f..e736f33 100644
--- a/polly/test/ScopInfo/simple_loop_1.ll
+++ b/polly/test/ScopInfo/simple_loop_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; void f(int a[], int N) {
; int i;
diff --git a/polly/test/ScopInfo/simple_loop_2.ll b/polly/test/ScopInfo/simple_loop_2.ll
index 2f58009..ae83dd6 100644
--- a/polly/test/ScopInfo/simple_loop_2.ll
+++ b/polly/test/ScopInfo/simple_loop_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; void f(int a[], int N) {
; int i;
diff --git a/polly/test/ScopInfo/simple_loop_unsigned.ll b/polly/test/ScopInfo/simple_loop_unsigned.ll
index 12903d9..c4a96e4 100644
--- a/polly/test/ScopInfo/simple_loop_unsigned.ll
+++ b/polly/test/ScopInfo/simple_loop_unsigned.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; void f(int a[], unsigned N) {
; unsigned i;
diff --git a/polly/test/ScopInfo/simple_loop_unsigned_2.ll b/polly/test/ScopInfo/simple_loop_unsigned_2.ll
index 1379180..37e907d 100644
--- a/polly/test/ScopInfo/simple_loop_unsigned_2.ll
+++ b/polly/test/ScopInfo/simple_loop_unsigned_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; CHECK: Assumed Context:
; CHECK-NEXT: [N] -> { : }
diff --git a/polly/test/ScopInfo/simple_loop_unsigned_3.ll b/polly/test/ScopInfo/simple_loop_unsigned_3.ll
index 7783c46..7f2cf5c 100644
--- a/polly/test/ScopInfo/simple_loop_unsigned_3.ll
+++ b/polly/test/ScopInfo/simple_loop_unsigned_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; CHECK: Assumed Context:
; CHECK-NEXT: [N] -> { : }
diff --git a/polly/test/ScopInfo/simple_nonaffine_loop_not.ll b/polly/test/ScopInfo/simple_nonaffine_loop_not.ll
index 42eff85..4df0d34 100644
--- a/polly/test/ScopInfo/simple_nonaffine_loop_not.ll
+++ b/polly/test/ScopInfo/simple_nonaffine_loop_not.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | not FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | not FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@.str = private unnamed_addr constant [17 x i8] c"Random Value: %d\00", align 1
diff --git a/polly/test/ScopInfo/smax.ll b/polly/test/ScopInfo/smax.ll
index b938e4e..8968e13 100644
--- a/polly/test/ScopInfo/smax.ll
+++ b/polly/test/ScopInfo/smax.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:32-n32-S64"
define void @foo(ptr noalias %data, ptr noalias %ptr, i32 %x_pos, i32 %w) {
diff --git a/polly/test/ScopInfo/statistics.ll b/polly/test/ScopInfo/statistics.ll
index 3797b7d..0a294f2 100644
--- a/polly/test/ScopInfo/statistics.ll
+++ b/polly/test/ScopInfo/statistics.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-scops -stats -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -stats -disable-output < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
; CHECK-DAG: 4 polly-scops - Maximal number of loops in scops
diff --git a/polly/test/ScopInfo/stmt_split_exit_of_region_stmt.ll b/polly/test/ScopInfo/stmt_split_exit_of_region_stmt.ll
index d86d241..a46acb0 100644
--- a/polly/test/ScopInfo/stmt_split_exit_of_region_stmt.ll
+++ b/polly/test/ScopInfo/stmt_split_exit_of_region_stmt.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Statements {
; CHECK-NEXT: Stmt_Region__TO__Stmt
diff --git a/polly/test/ScopInfo/stmt_split_no_after_split.ll b/polly/test/ScopInfo/stmt_split_no_after_split.ll
index f8339bd..3a5ebf0 100644
--- a/polly/test/ScopInfo/stmt_split_no_after_split.ll
+++ b/polly/test/ScopInfo/stmt_split_no_after_split.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Statements {
; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_split_no_dependence.ll b/polly/test/ScopInfo/stmt_split_no_dependence.ll
index 7ad48f4..9edd0f0 100644
--- a/polly/test/ScopInfo/stmt_split_no_dependence.ll
+++ b/polly/test/ScopInfo/stmt_split_no_dependence.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; void func(int *A, int *B){
; for (int i = 0; i < 1024; i+=1) {
diff --git a/polly/test/ScopInfo/stmt_split_on_store.ll b/polly/test/ScopInfo/stmt_split_on_store.ll
index 6af3dc8..d645bec 100644
--- a/polly/test/ScopInfo/stmt_split_on_store.ll
+++ b/polly/test/ScopInfo/stmt_split_on_store.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=store -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=store -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; void func(int *A, int *B){
; for (int i = 0; i < 1024; i+=1) {
diff --git a/polly/test/ScopInfo/stmt_split_on_synthesizable.ll b/polly/test/ScopInfo/stmt_split_on_synthesizable.ll
index 92855cf..1a1ccff 100644
--- a/polly/test/ScopInfo/stmt_split_on_synthesizable.ll
+++ b/polly/test/ScopInfo/stmt_split_on_synthesizable.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Statements {
; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_split_phi_in_beginning_bb.ll b/polly/test/ScopInfo/stmt_split_phi_in_beginning_bb.ll
index ee6afa4..594b362 100644
--- a/polly/test/ScopInfo/stmt_split_phi_in_beginning_bb.ll
+++ b/polly/test/ScopInfo/stmt_split_phi_in_beginning_bb.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Statements {
; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_split_phi_in_stmt.ll b/polly/test/ScopInfo/stmt_split_phi_in_stmt.ll
index 0a5f41d..6c9f1c2 100644
--- a/polly/test/ScopInfo/stmt_split_phi_in_stmt.ll
+++ b/polly/test/ScopInfo/stmt_split_phi_in_stmt.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Statements {
; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_split_scalar_dependence.ll b/polly/test/ScopInfo/stmt_split_scalar_dependence.ll
index 5b02d1b..07abe46 100644
--- a/polly/test/ScopInfo/stmt_split_scalar_dependence.ll
+++ b/polly/test/ScopInfo/stmt_split_scalar_dependence.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Statements {
; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_split_within_loop.ll b/polly/test/ScopInfo/stmt_split_within_loop.ll
index 3ed9bbb..9a42ae3 100644
--- a/polly/test/ScopInfo/stmt_split_within_loop.ll
+++ b/polly/test/ScopInfo/stmt_split_within_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Statements {
; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_with_read_but_without_sideffect.ll b/polly/test/ScopInfo/stmt_with_read_but_without_sideffect.ll
index 73fc543..ba4801d 100644
--- a/polly/test/ScopInfo/stmt_with_read_but_without_sideffect.ll
+++ b/polly/test/ScopInfo/stmt_with_read_but_without_sideffect.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s 2>&1 | FileCheck %s
;
; The statement Stmt_for_if_else_1 should be removed because it has no
; side effects. But it has a use of MemRef_tmp21 that must also be
diff --git a/polly/test/ScopInfo/switch-1.ll b/polly/test/ScopInfo/switch-1.ll
index 0ea40a7..0c36101 100644
--- a/polly/test/ScopInfo/switch-1.ll
+++ b/polly/test/ScopInfo/switch-1.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
;
; void f(int *A, int N) {
; for (int i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/switch-2.ll b/polly/test/ScopInfo/switch-2.ll
index 7956058..f0056da 100644
--- a/polly/test/ScopInfo/switch-2.ll
+++ b/polly/test/ScopInfo/switch-2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
;
; void f(int *A, int N) {
; for (int i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/switch-3.ll b/polly/test/ScopInfo/switch-3.ll
index aa7ada4..a1810bf 100644
--- a/polly/test/ScopInfo/switch-3.ll
+++ b/polly/test/ScopInfo/switch-3.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
;
; void f(int *A, int N) {
; for (int i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/switch-4.ll b/polly/test/ScopInfo/switch-4.ll
index 6aeb719..00665fd 100644
--- a/polly/test/ScopInfo/switch-4.ll
+++ b/polly/test/ScopInfo/switch-4.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
;
; void f(int *A, int N) {
; for (int i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/switch-5.ll b/polly/test/ScopInfo/switch-5.ll
index 24cc92a..2de3695 100644
--- a/polly/test/ScopInfo/switch-5.ll
+++ b/polly/test/ScopInfo/switch-5.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
;
; The SCoP contains a loop with multiple exit blocks (BBs after leaving
; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/switch-6.ll b/polly/test/ScopInfo/switch-6.ll
index efb3df5..b859840 100644
--- a/polly/test/ScopInfo/switch-6.ll
+++ b/polly/test/ScopInfo/switch-6.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
;
; void f(int *A, int N) {
; for (int i = 0; i < N; i++) {
diff --git a/polly/test/ScopInfo/switch-7.ll b/polly/test/ScopInfo/switch-7.ll
index 2f0d034..f73d97f 100644
--- a/polly/test/ScopInfo/switch-7.ll
+++ b/polly/test/ScopInfo/switch-7.ll
@@ -1,6 +1,5 @@
-
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
;
; void f(int *A, int c, int N) {
; switch (c) {
diff --git a/polly/test/ScopInfo/tempscop-printing.ll b/polly/test/ScopInfo/tempscop-printing.ll
index 80c675d..4f02176 100644
--- a/polly/test/ScopInfo/tempscop-printing.ll
+++ b/polly/test/ScopInfo/tempscop-printing.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-invariant-load-hoisting=true -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; void f(long A[], int N, int *init_ptr) {
; long i, j;
diff --git a/polly/test/ScopInfo/test-wrapping-in-condition.ll b/polly/test/ScopInfo/test-wrapping-in-condition.ll
index 3ff978f..7463504 100644
--- a/polly/test/ScopInfo/test-wrapping-in-condition.ll
+++ b/polly/test/ScopInfo/test-wrapping-in-condition.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-function-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Invalid Context:
; CHECK: [N] -> { : N >= 129 }
diff --git a/polly/test/ScopInfo/truncate-1.ll b/polly/test/ScopInfo/truncate-1.ll
index 5c5fac1..44222c8 100644
--- a/polly/test/ScopInfo/truncate-1.ll
+++ b/polly/test/ScopInfo/truncate-1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(char *A, short N) {
; for (char i = 0; i < (char)N; i++)
diff --git a/polly/test/ScopInfo/truncate-2.ll b/polly/test/ScopInfo/truncate-2.ll
index e6c5f2c..c78a533 100644
--- a/polly/test/ScopInfo/truncate-2.ll
+++ b/polly/test/ScopInfo/truncate-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(char *A, short N) {
; for (short i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/truncate-3.ll b/polly/test/ScopInfo/truncate-3.ll
index dd0fe489..5a80a87 100644
--- a/polly/test/ScopInfo/truncate-3.ll
+++ b/polly/test/ScopInfo/truncate-3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-scops -pass-remarks-analysis="polly-scops" \
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -pass-remarks-analysis="polly-scops" \
; RUN: -disable-output < %s 2>&1 | FileCheck %s
; CHECK: Signed-unsigned restriction: [p] -> { : p <= -129 or p >= 128 }
diff --git a/polly/test/ScopInfo/two-loops-one-infinite.ll b/polly/test/ScopInfo/two-loops-one-infinite.ll
index 71f7238..e2723a8 100644
--- a/polly/test/ScopInfo/two-loops-one-infinite.ll
+++ b/polly/test/ScopInfo/two-loops-one-infinite.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Verify we do not create a SCoP in the presence of infinite loops.
;
diff --git a/polly/test/ScopInfo/two-loops-right-after-each-other.ll b/polly/test/ScopInfo/two-loops-right-after-each-other.ll
index dd457c3..51f3c2d 100644
--- a/polly/test/ScopInfo/two-loops-right-after-each-other.ll
+++ b/polly/test/ScopInfo/two-loops-right-after-each-other.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; CHECK: Statements {
; CHECK-NEXT: Stmt_loop_1
diff --git a/polly/test/ScopInfo/undef_in_cond.ll b/polly/test/ScopInfo/undef_in_cond.ll
index 5282a85..ef11761 100644
--- a/polly/test/ScopInfo/undef_in_cond.ll
+++ b/polly/test/ScopInfo/undef_in_cond.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define fastcc void @fix_operands() nounwind {
diff --git a/polly/test/ScopInfo/unnamed_nonaffine.ll b/polly/test/ScopInfo/unnamed_nonaffine.ll
index bf32cc7..5b9f980 100644
--- a/polly/test/ScopInfo/unnamed_nonaffine.ll
+++ b/polly/test/ScopInfo/unnamed_nonaffine.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine -polly-use-llvm-names=true -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-allow-nonaffine -polly-use-llvm-names=false -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=UNNAMED
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-use-llvm-names=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-use-llvm-names=false '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=UNNAMED
;
; void f(int *A, int b) {
; int x;
diff --git a/polly/test/ScopInfo/unnamed_stmts.ll b/polly/test/ScopInfo/unnamed_stmts.ll
index 686c0f8..5a18945 100644
--- a/polly/test/ScopInfo/unnamed_stmts.ll
+++ b/polly/test/ScopInfo/unnamed_stmts.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; This test case verifies that we generate numbered statement names in case
; no LLVM-IR names are used in the test case. We also verify that we
diff --git a/polly/test/ScopInfo/unpredictable_nonscop_loop.ll b/polly/test/ScopInfo/unpredictable_nonscop_loop.ll
index 0656b77..daa1f8c 100644
--- a/polly/test/ScopInfo/unpredictable_nonscop_loop.ll
+++ b/polly/test/ScopInfo/unpredictable_nonscop_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
; Derived from test-suite/MultiSource/Applications/sgefa/blas.c
;
; The exit value of %i.0320 in land.rhs is not computable.
diff --git a/polly/test/ScopInfo/unprofitable_scalar-accs.ll b/polly/test/ScopInfo/unprofitable_scalar-accs.ll
index 9703587..ca8daa4 100644
--- a/polly/test/ScopInfo/unprofitable_scalar-accs.ll
+++ b/polly/test/ScopInfo/unprofitable_scalar-accs.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=false -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=true -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=HEURISTIC
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=false '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=true '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=HEURISTIC
; Check the effect of -polly-unprofitable-scalar-accs
diff --git a/polly/test/ScopInfo/unsigned-condition.ll b/polly/test/ScopInfo/unsigned-condition.ll
index 35673d1..0529ded 100644
--- a/polly/test/ScopInfo/unsigned-condition.ll
+++ b/polly/test/ScopInfo/unsigned-condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; void f(int a[], int N, unsigned P) {
; int i;
diff --git a/polly/test/ScopInfo/unsigned-division-1.ll b/polly/test/ScopInfo/unsigned-division-1.ll
index 8c65062..1c06b55 100644
--- a/polly/test/ScopInfo/unsigned-division-1.ll
+++ b/polly/test/ScopInfo/unsigned-division-1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int *A, unsigned N) {
; for (unsigned i = 0; i < N / 2; i++)
diff --git a/polly/test/ScopInfo/unsigned-division-2.ll b/polly/test/ScopInfo/unsigned-division-2.ll
index bf4ebce..153639c 100644
--- a/polly/test/ScopInfo/unsigned-division-2.ll
+++ b/polly/test/ScopInfo/unsigned-division-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int *A, unsigned N) {
; for (unsigned i = 0; i < N / 2 + 3; i++)
diff --git a/polly/test/ScopInfo/unsigned-division-3.ll b/polly/test/ScopInfo/unsigned-division-3.ll
index 47ba1f2..34561fc 100644
--- a/polly/test/ScopInfo/unsigned-division-3.ll
+++ b/polly/test/ScopInfo/unsigned-division-3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int *A, unsigned char N) {
; for (unsigned i = 0; i <= N / -128; i++)
diff --git a/polly/test/ScopInfo/unsigned-division-4.ll b/polly/test/ScopInfo/unsigned-division-4.ll
index edcd8a1..be539b4 100644
--- a/polly/test/ScopInfo/unsigned-division-4.ll
+++ b/polly/test/ScopInfo/unsigned-division-4.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int *A, unsigned char N) {
; for (unsigned i = 0; i < (N / -128) + 3; i++)
diff --git a/polly/test/ScopInfo/unsigned-division-5.ll b/polly/test/ScopInfo/unsigned-division-5.ll
index f9a3d39..61716ec 100644
--- a/polly/test/ScopInfo/unsigned-division-5.ll
+++ b/polly/test/ScopInfo/unsigned-division-5.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-invariant-load-hoisting=true -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int *A, unsigned N) {
; for (unsigned i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/unsigned_wrap_uge.ll b/polly/test/ScopInfo/unsigned_wrap_uge.ll
index 89c50ee..d25a957 100644
--- a/polly/test/ScopInfo/unsigned_wrap_uge.ll
+++ b/polly/test/ScopInfo/unsigned_wrap_uge.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Unsigned wrap-around check.
;
diff --git a/polly/test/ScopInfo/unsigned_wrap_ugt.ll b/polly/test/ScopInfo/unsigned_wrap_ugt.ll
index 3249123..0310fdd 100644
--- a/polly/test/ScopInfo/unsigned_wrap_ugt.ll
+++ b/polly/test/ScopInfo/unsigned_wrap_ugt.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Unsigned wrap-around check.
;
diff --git a/polly/test/ScopInfo/unsigned_wrap_ule.ll b/polly/test/ScopInfo/unsigned_wrap_ule.ll
index 3c6ea18..47bfc60 100644
--- a/polly/test/ScopInfo/unsigned_wrap_ule.ll
+++ b/polly/test/ScopInfo/unsigned_wrap_ule.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Unsigned wrap-around check.
;
diff --git a/polly/test/ScopInfo/unsigned_wrap_ult.ll b/polly/test/ScopInfo/unsigned_wrap_ult.ll
index 5d859f8..1b73c0d 100644
--- a/polly/test/ScopInfo/unsigned_wrap_ult.ll
+++ b/polly/test/ScopInfo/unsigned_wrap_ult.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Unsigned wrap-around check.
;
diff --git a/polly/test/ScopInfo/user_context.ll b/polly/test/ScopInfo/user_context.ll
index 46232cd..7408812 100644
--- a/polly/test/ScopInfo/user_context.ll
+++ b/polly/test/ScopInfo/user_context.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-context='[N] -> {: N = 1024}' -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=CTX
-; RUN: opt %loadPolly -polly-context='[N,M] -> {: 1 = 0}' -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-context='[] -> {: 1 = 0}' -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-context='[N] -> {: N = 1024}' '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=CTX
+; RUN: opt %loadNPMPolly -polly-context='[N,M] -> {: 1 = 0}' '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-context='[] -> {: 1 = 0}' '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; void f(int a[], int N) {
; int i;
diff --git a/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed-conditional.ll b/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed-conditional.ll
index 4bd02c9..bd13ba8 100644
--- a/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed-conditional.ll
+++ b/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed-conditional.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -pass-remarks-analysis="polly-scops" -polly-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REMARK
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; REMARK: remark: <unknown>:0:0: Use user assumption: [n, b] -> { : n <= 100 or (b = 0 and n >= 101) }
;
diff --git a/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed.ll b/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed.ll
index 262bd13..45f5917 100644
--- a/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed.ll
+++ b/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Context:
; CHECK-NEXT: [n] -> { : -9223372036854775808 <= n <= 100 }
diff --git a/polly/test/ScopInfo/user_provided_assumptions-in-bb-unsigned.ll b/polly/test/ScopInfo/user_provided_assumptions-in-bb-unsigned.ll
index 4a10fcf..fb71c75 100644
--- a/polly/test/ScopInfo/user_provided_assumptions-in-bb-unsigned.ll
+++ b/polly/test/ScopInfo/user_provided_assumptions-in-bb-unsigned.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -pass-remarks-analysis="polly-scops" -polly-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REMARK
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; REMARK: remark: <unknown>:0:0: SCoP begins here.
; REMARK-NEXT: remark: <unknown>:0:0: Use user assumption: [n] -> { : n <= 100 }
diff --git a/polly/test/ScopInfo/user_provided_assumptions.ll b/polly/test/ScopInfo/user_provided_assumptions.ll
index 6640e4a6..49b23b1 100644
--- a/polly/test/ScopInfo/user_provided_assumptions.ll
+++ b/polly/test/ScopInfo/user_provided_assumptions.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -pass-remarks-analysis="polly-scops" -polly-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
;
; CHECK: remark: <unknown>:0:0: SCoP begins here.
; CHECK-NEXT: remark: <unknown>:0:0: Use user assumption: [M, N] -> { : N <= 2147483647 - M }
diff --git a/polly/test/ScopInfo/user_provided_assumptions_2.ll b/polly/test/ScopInfo/user_provided_assumptions_2.ll
index 994cd6f1..f8643b6 100644
--- a/polly/test/ScopInfo/user_provided_assumptions_2.ll
+++ b/polly/test/ScopInfo/user_provided_assumptions_2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -pass-remarks-analysis="polly-scops" -polly-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
;
; CHECK: remark: <unknown>:0:0: SCoP begins here.
; CHECK-NEXT: remark: <unknown>:0:0: Use user assumption: { : }
diff --git a/polly/test/ScopInfo/user_provided_assumptions_3.ll b/polly/test/ScopInfo/user_provided_assumptions_3.ll
index 2fcde8b..70f8f359 100644
--- a/polly/test/ScopInfo/user_provided_assumptions_3.ll
+++ b/polly/test/ScopInfo/user_provided_assumptions_3.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -pass-remarks-analysis="polly-scops" -polly-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
;
; CHECK: remark: <unknown>:0:0: SCoP begins here.
; CHECK-NEXT: remark: <unknown>:0:0: Use user assumption: [N] -> { : N >= 2 }
diff --git a/polly/test/ScopInfo/user_provided_non_dominating_assumptions.ll b/polly/test/ScopInfo/user_provided_non_dominating_assumptions.ll
index 1eb3c15..3e7883d 100644
--- a/polly/test/ScopInfo/user_provided_non_dominating_assumptions.ll
+++ b/polly/test/ScopInfo/user_provided_non_dominating_assumptions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -pass-remarks-analysis="polly-scops" -polly-scops \
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \
; RUN: -polly-precise-inbounds -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: remark: <unknown>:0:0: SCoP begins here.
@@ -18,7 +18,7 @@
;
-; RUN: opt %loadPolly -pass-remarks-analysis="polly-scops" -polly-scops \
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \
; RUN: -polly-precise-inbounds -disable-output < %s 2>&1 -pass-remarks-output=%t.yaml
; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s
; YAML: --- !Analysis
diff --git a/polly/test/ScopInfo/variant_base_pointer.ll b/polly/test/ScopInfo/variant_base_pointer.ll
index 321657c..32cb114 100644
--- a/polly/test/ScopInfo/variant_base_pointer.ll
+++ b/polly/test/ScopInfo/variant_base_pointer.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true -polly-codegen -disable-output < %s
+; RUN: opt %loadNPMPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true -passes=polly-codegen -disable-output < %s
;
; %tmp is added to the list of required hoists by -polly-scops and just
; assumed to be hoisted. Only -polly-scops recognizes it to be unhoistable
diff --git a/polly/test/ScopInfo/variant_load_empty_domain.ll b/polly/test/ScopInfo/variant_load_empty_domain.ll
index 0e685c3..6a28bd0 100644
--- a/polly/test/ScopInfo/variant_load_empty_domain.ll
+++ b/polly/test/ScopInfo/variant_load_empty_domain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Invariant Accesses: {
; CHECK-NEXT: }
diff --git a/polly/test/ScopInfo/wraping_signed_expr_0.ll b/polly/test/ScopInfo/wraping_signed_expr_0.ll
index 7ad0f64..f5f06bf 100644
--- a/polly/test/ScopInfo/wraping_signed_expr_0.ll
+++ b/polly/test/ScopInfo/wraping_signed_expr_0.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int *A, char N, char p) {
; for (char i = 0; i < N; i++) {
diff --git a/polly/test/ScopInfo/wraping_signed_expr_1.ll b/polly/test/ScopInfo/wraping_signed_expr_1.ll
index 0a62b9c..e04257a 100644
--- a/polly/test/ScopInfo/wraping_signed_expr_1.ll
+++ b/polly/test/ScopInfo/wraping_signed_expr_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(long *A, long N, long p) {
; for (long i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/wraping_signed_expr_2.ll b/polly/test/ScopInfo/wraping_signed_expr_2.ll
index f3b4665..2511c0d 100644
--- a/polly/test/ScopInfo/wraping_signed_expr_2.ll
+++ b/polly/test/ScopInfo/wraping_signed_expr_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int *A, int N, int p) {
; for (int i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/wraping_signed_expr_3.ll b/polly/test/ScopInfo/wraping_signed_expr_3.ll
index 7a5cbba..2106bdf 100644
--- a/polly/test/ScopInfo/wraping_signed_expr_3.ll
+++ b/polly/test/ScopInfo/wraping_signed_expr_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int *A, int N, int p) {
; for (int i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/wraping_signed_expr_4.ll b/polly/test/ScopInfo/wraping_signed_expr_4.ll
index ec65f70..3ea17f6 100644
--- a/polly/test/ScopInfo/wraping_signed_expr_4.ll
+++ b/polly/test/ScopInfo/wraping_signed_expr_4.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(char *A, char N, char p) {
; for (char i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/wraping_signed_expr_5.ll b/polly/test/ScopInfo/wraping_signed_expr_5.ll
index 5f3b09b..90706a3 100644
--- a/polly/test/ScopInfo/wraping_signed_expr_5.ll
+++ b/polly/test/ScopInfo/wraping_signed_expr_5.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; We should not generate runtime check for ((int)r1 + (int)r2) as it is known not
; to overflow. However (p + q) can, thus checks are needed.
diff --git a/polly/test/ScopInfo/wraping_signed_expr_6.ll b/polly/test/ScopInfo/wraping_signed_expr_6.ll
index 23258bb..9cf67fc 100644
--- a/polly/test/ScopInfo/wraping_signed_expr_6.ll
+++ b/polly/test/ScopInfo/wraping_signed_expr_6.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Invalid Context:
; CHECK: [N] -> { : N >= 129 }
diff --git a/polly/test/ScopInfo/wraping_signed_expr_7.ll b/polly/test/ScopInfo/wraping_signed_expr_7.ll
index 0663d4e..d18d2b2 100644
--- a/polly/test/ScopInfo/wraping_signed_expr_7.ll
+++ b/polly/test/ScopInfo/wraping_signed_expr_7.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Invalid Context:
; CHECK: [N] -> { : N >= 129 }
diff --git a/polly/test/ScopInfo/wraping_signed_expr_slow_1.ll b/polly/test/ScopInfo/wraping_signed_expr_slow_1.ll
index ec36d2c..8462686 100644
--- a/polly/test/ScopInfo/wraping_signed_expr_slow_1.ll
+++ b/polly/test/ScopInfo/wraping_signed_expr_slow_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; This checks that the no-wraps checks will be computed fast as some example
; already showed huge slowdowns even though the inbounds and nsw flags were
diff --git a/polly/test/ScopInfo/wraping_signed_expr_slow_2.ll b/polly/test/ScopInfo/wraping_signed_expr_slow_2.ll
index 6db33ab..b4dd567 100644
--- a/polly/test/ScopInfo/wraping_signed_expr_slow_2.ll
+++ b/polly/test/ScopInfo/wraping_signed_expr_slow_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; This checks that the no-wraps checks will be computed fast as some example
; already showed huge slowdowns even though the inbounds and nsw flags were
diff --git a/polly/test/ScopInfo/zero_ext_of_truncate.ll b/polly/test/ScopInfo/zero_ext_of_truncate.ll
index fc55df5..bd3749b 100644
--- a/polly/test/ScopInfo/zero_ext_of_truncate.ll
+++ b/polly/test/ScopInfo/zero_ext_of_truncate.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-invariant-load-hoisting=true -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(unsigned *restrict I, unsigned *restrict A, unsigned N, unsigned M) {
; for (unsigned i = 0; i < N; i++) {
diff --git a/polly/test/ScopInfo/zero_ext_of_truncate_2.ll b/polly/test/ScopInfo/zero_ext_of_truncate_2.ll
index 13e9c03..b306045 100644
--- a/polly/test/ScopInfo/zero_ext_of_truncate_2.ll
+++ b/polly/test/ScopInfo/zero_ext_of_truncate_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-invariant-load-hoisting=true -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(unsigned long *restrict I, unsigned *restrict A, unsigned N) {
; for (unsigned i = 0; i < N; i++) {
diff --git a/polly/test/ScopInfo/zero_ext_space_mismatch.ll b/polly/test/ScopInfo/zero_ext_space_mismatch.ll
index 835a866..3c02ae2 100644
--- a/polly/test/ScopInfo/zero_ext_space_mismatch.ll
+++ b/polly/test/ScopInfo/zero_ext_space_mismatch.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Assumed Context:
; CHECK-NEXT: [dim] -> { : dim > 0 }
diff --git a/polly/test/ScopInliner/invariant-load-func.ll b/polly/test/ScopInliner/invariant-load-func.ll
index 38e4a15..ffd2ec9 100644
--- a/polly/test/ScopInliner/invariant-load-func.ll
+++ b/polly/test/ScopInliner/invariant-load-func.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-detect-full-functions -polly-scop-inliner \
-; RUN: -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-detect-full-functions -polly-scop-inliner \
+; RUN: -polly-invariant-load-hoisting '-passes=print<polly-function-scops>' -disable-output < %s | FileCheck %s
; Check that we inline a function that requires invariant load hoisting
; correctly.
diff --git a/polly/test/Simplify/coalesce_3partials.ll b/polly/test/Simplify/coalesce_3partials.ll
index 0c1556f..4112787 100644
--- a/polly/test/Simplify/coalesce_3partials.ll
+++ b/polly/test/Simplify/coalesce_3partials.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s
;
; Combine 3 partial accesses into one.
;
diff --git a/polly/test/Simplify/coalesce_disjointelements.ll b/polly/test/Simplify/coalesce_disjointelements.ll
index 2f4cf4e..b140f28 100644
--- a/polly/test/Simplify/coalesce_disjointelements.ll
+++ b/polly/test/Simplify/coalesce_disjointelements.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s
;
; Combine four partial stores into two.
; The stores write to the same array, but never the same element.
diff --git a/polly/test/Simplify/coalesce_overlapping.ll b/polly/test/Simplify/coalesce_overlapping.ll
index 78ed21e..ee716fc 100644
--- a/polly/test/Simplify/coalesce_overlapping.ll
+++ b/polly/test/Simplify/coalesce_overlapping.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s
;
; Combine two partial stores (with overlapping domains) into one.
;
diff --git a/polly/test/Simplify/coalesce_partial.ll b/polly/test/Simplify/coalesce_partial.ll
index c42aaa1..aea691f 100644
--- a/polly/test/Simplify/coalesce_partial.ll
+++ b/polly/test/Simplify/coalesce_partial.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s
;
; Combine two partial stores (with disjoint domains) into one.
;
diff --git a/polly/test/Simplify/dead_access_load.ll b/polly/test/Simplify/dead_access_load.ll
index 1804613..66f9479 100644
--- a/polly/test/Simplify/dead_access_load.ll
+++ b/polly/test/Simplify/dead_access_load.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines
;
; Remove a dead load-instruction
diff --git a/polly/test/Simplify/dead_access_phi.ll b/polly/test/Simplify/dead_access_phi.ll
index d263b89..fb40e4c 100644
--- a/polly/test/Simplify/dead_access_phi.ll
+++ b/polly/test/Simplify/dead_access_phi.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines
;
; Remove a dead PHI write/read pair
diff --git a/polly/test/Simplify/dead_access_value.ll b/polly/test/Simplify/dead_access_value.ll
index 6e3c211..a8ff7f2 100644
--- a/polly/test/Simplify/dead_access_value.ll
+++ b/polly/test/Simplify/dead_access_value.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines
;
; Remove a dead value write/read pair
diff --git a/polly/test/Simplify/dead_instruction.ll b/polly/test/Simplify/dead_instruction.ll
index 4e693b0..81e55e1 100644
--- a/polly/test/Simplify/dead_instruction.ll
+++ b/polly/test/Simplify/dead_instruction.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines
;
; Remove a dead instruction
diff --git a/polly/test/Simplify/emptyaccessdomain.ll b/polly/test/Simplify/emptyaccessdomain.ll
index 54ac14a..9b06cec 100644
--- a/polly/test/Simplify/emptyaccessdomain.ll
+++ b/polly/test/Simplify/emptyaccessdomain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines
;
; for (int j = 0; j < n; j += 1) {
; A[0] = 42.0;
diff --git a/polly/test/Simplify/exit_phi_accesses-2.ll b/polly/test/Simplify/exit_phi_accesses-2.ll
index 01748aa..379c7e0 100644
--- a/polly/test/Simplify/exit_phi_accesses-2.ll
+++ b/polly/test/Simplify/exit_phi_accesses-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-scops -polly-print-simplify -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>,scop(print<polly-simplify>)' -disable-output < %s | FileCheck %s
;
; The use of %sum.next by %phi counts as an escaping use.
; Don't remove the scalar write of %sum.next.
diff --git a/polly/test/Simplify/func-b320a7.ll b/polly/test/Simplify/func-b320a7.ll
index c8a823a..5aa2cab 100644
--- a/polly/test/Simplify/func-b320a7.ll
+++ b/polly/test/Simplify/func-b320a7.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-simplify -polly-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-simplify>,polly-optree' -disable-output < %s | FileCheck %s -match-full-lines
; llvm.org/PR47098
; Use-after-free by reference to Stmt remaining in InstStmtMap after removing it has been removed by Scop::simplifyScop.
diff --git a/polly/test/Simplify/gemm.ll b/polly/test/Simplify/gemm.ll
index 23f8de5..5120de2 100644
--- a/polly/test/Simplify/gemm.ll
+++ b/polly/test/Simplify/gemm.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s
;
; void gemm(float A[][1024], float B[][1024], float C[][1024]) {
; for (long i = 0; i < 1024; i++)
diff --git a/polly/test/Simplify/nocoalesce_differentvalues.ll b/polly/test/Simplify/nocoalesce_differentvalues.ll
index 68991d2..33d04b2 100644
--- a/polly/test/Simplify/nocoalesce_differentvalues.ll
+++ b/polly/test/Simplify/nocoalesce_differentvalues.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s
;
; Do not combine stores that write different values.
;
diff --git a/polly/test/Simplify/nocoalesce_elementmismatch.ll b/polly/test/Simplify/nocoalesce_elementmismatch.ll
index 2bab360..608b055 100644
--- a/polly/test/Simplify/nocoalesce_elementmismatch.ll
+++ b/polly/test/Simplify/nocoalesce_elementmismatch.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s
;
; Do not combine stores that do not write to different elements in the
; same instance.
diff --git a/polly/test/Simplify/nocoalesce_readbetween.ll b/polly/test/Simplify/nocoalesce_readbetween.ll
index ada79dc..e112b03 100644
--- a/polly/test/Simplify/nocoalesce_readbetween.ll
+++ b/polly/test/Simplify/nocoalesce_readbetween.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s
;
; Do not combine stores if there is a read between them.
; Note: The read between is unused, so will be removed by markAndSweep.
diff --git a/polly/test/Simplify/nocoalesce_writebetween.ll b/polly/test/Simplify/nocoalesce_writebetween.ll
index 48e785e..fd5eee5 100644
--- a/polly/test/Simplify/nocoalesce_writebetween.ll
+++ b/polly/test/Simplify/nocoalesce_writebetween.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s
;
; Do not combine stores if there is a write between them.
;
diff --git a/polly/test/Simplify/notdead_region_exitphi.ll b/polly/test/Simplify/notdead_region_exitphi.ll
index bd29fd5..42fafb4 100644
--- a/polly/test/Simplify/notdead_region_exitphi.ll
+++ b/polly/test/Simplify/notdead_region_exitphi.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines
;
; Do not remove dependencies of a phi node in a region's exit block.
diff --git a/polly/test/Simplify/notdead_region_innerphi.ll b/polly/test/Simplify/notdead_region_innerphi.ll
index a176a28..966448c 100644
--- a/polly/test/Simplify/notdead_region_innerphi.ll
+++ b/polly/test/Simplify/notdead_region_innerphi.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines
;
; Do not remove dependencies of a phi node within a region statement (%phi).
diff --git a/polly/test/Simplify/notredundant_region_loop.ll b/polly/test/Simplify/notredundant_region_loop.ll
index 0ea9be7..88f6c41 100644
--- a/polly/test/Simplify/notredundant_region_loop.ll
+++ b/polly/test/Simplify/notredundant_region_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-allow-nonaffine-loops -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -polly-allow-nonaffine-loops -disable-output < %s | FileCheck %s -match-full-lines
;
; Do not remove the store in region_entry. It can be executed multiple times
; due to being part of a non-affine loop.
diff --git a/polly/test/Simplify/notredundant_region_middle.ll b/polly/test/Simplify/notredundant_region_middle.ll
index 8459874..43c0543 100644
--- a/polly/test/Simplify/notredundant_region_middle.ll
+++ b/polly/test/Simplify/notredundant_region_middle.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines
;
; Do not remove redundant stores in the middle of region statements.
diff --git a/polly/test/Simplify/notredundant_synthesizable_unknownit.ll b/polly/test/Simplify/notredundant_synthesizable_unknownit.ll
index 2affdbb..8a9aec8 100644
--- a/polly/test/Simplify/notredundant_synthesizable_unknownit.ll
+++ b/polly/test/Simplify/notredundant_synthesizable_unknownit.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines
;
; Do not remove the scalar value write of %i.trunc in inner.for.
diff --git a/polly/test/Simplify/out-of-scop-use-in-region-entry-phi-node.ll b/polly/test/Simplify/out-of-scop-use-in-region-entry-phi-node.ll
index 511f35a..7218f32 100644
--- a/polly/test/Simplify/out-of-scop-use-in-region-entry-phi-node.ll
+++ b/polly/test/Simplify/out-of-scop-use-in-region-entry-phi-node.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -polly-print-simplify -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>,scop(print<polly-simplify>)' -disable-output < %s 2>&1 | FileCheck %s
;
; %tmp5 must keep the Value WRITE MemoryAccess, because as an incoming value of
; %tmp4, it is an "external use".
diff --git a/polly/test/Simplify/overwritten.ll b/polly/test/Simplify/overwritten.ll
index a32d6a8..eccdd80 100644
--- a/polly/test/Simplify/overwritten.ll
+++ b/polly/test/Simplify/overwritten.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-simplify -disable-output < %s | FileCheck -match-full-lines %s
; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s
;
; Remove a store that is overwritten by another store in the same statement.
diff --git a/polly/test/Simplify/overwritten_3phi.ll b/polly/test/Simplify/overwritten_3phi.ll
index 24758b9..4cee4f1 100644
--- a/polly/test/Simplify/overwritten_3phi.ll
+++ b/polly/test/Simplify/overwritten_3phi.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s
;
; Remove identical writes
; (two stores in the same statement that write the same value to the same
diff --git a/polly/test/Simplify/overwritten_3store.ll b/polly/test/Simplify/overwritten_3store.ll
index 63eb5b5..c9f06c8 100644
--- a/polly/test/Simplify/overwritten_3store.ll
+++ b/polly/test/Simplify/overwritten_3store.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-simplify -disable-output < %s | FileCheck -match-full-lines %s
; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s
;
; Remove a store that is overwritten by another store in the same statement.
diff --git a/polly/test/Simplify/overwritten_implicit_and_explicit.ll b/polly/test/Simplify/overwritten_implicit_and_explicit.ll
index 56c63b4..b1b7635 100644
--- a/polly/test/Simplify/overwritten_implicit_and_explicit.ll
+++ b/polly/test/Simplify/overwritten_implicit_and_explicit.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s
;
; Remove a store that is overwritten by another store in the same statement.
; Check that this works even if one of the writes is a scalar MemoryKind.
diff --git a/polly/test/Simplify/overwritten_loadbetween.ll b/polly/test/Simplify/overwritten_loadbetween.ll
index b31f45d..cdca2f1 100644
--- a/polly/test/Simplify/overwritten_loadbetween.ll
+++ b/polly/test/Simplify/overwritten_loadbetween.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-print-simplify -disable-output < %s | FileCheck -match-full-lines %s
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s
;
; Do not remove overwrites when the value is read before.
diff --git a/polly/test/Simplify/overwritten_scalar.ll b/polly/test/Simplify/overwritten_scalar.ll
index d55ea77..700adb6 100644
--- a/polly/test/Simplify/overwritten_scalar.ll
+++ b/polly/test/Simplify/overwritten_scalar.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s
;
; Remove identical writes
; (two stores in the same statement that write the same value to the same
diff --git a/polly/test/Simplify/pass_existence.ll b/polly/test/Simplify/pass_existence.ll
index fc5287e..4d1d800 100644
--- a/polly/test/Simplify/pass_existence.ll
+++ b/polly/test/Simplify/pass_existence.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-print-simplify -disable-output < %s | FileCheck %s
; RUN: opt %loadNPMPolly -disable-output "-passes=scop(print<polly-simplify>)" < %s -aa-pipeline=basic-aa < %s | FileCheck %s
;
; Simple test for the existence of the Simplify pass.
diff --git a/polly/test/Simplify/phi_in_regionstmt.ll b/polly/test/Simplify/phi_in_regionstmt.ll
index 32bb754..76efd48 100644
--- a/polly/test/Simplify/phi_in_regionstmt.ll
+++ b/polly/test/Simplify/phi_in_regionstmt.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines
;
; The PHINode %cond91.sink.sink.us.sink.6 is in the middle of a region
diff --git a/polly/test/Simplify/pr33323.ll b/polly/test/Simplify/pr33323.ll
index 751f0bf..22921d5 100644
--- a/polly/test/Simplify/pr33323.ll
+++ b/polly/test/Simplify/pr33323.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s
;
; llvm.org/PR33323
;
diff --git a/polly/test/Simplify/redundant.ll b/polly/test/Simplify/redundant.ll
index e85352b..540e537 100644
--- a/polly/test/Simplify/redundant.ll
+++ b/polly/test/Simplify/redundant.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines
;
; Remove redundant store (a store that writes the same value already
diff --git a/polly/test/Simplify/redundant_differentindex.ll b/polly/test/Simplify/redundant_differentindex.ll
index 23531c2..5ce2583 100644
--- a/polly/test/Simplify/redundant_differentindex.ll
+++ b/polly/test/Simplify/redundant_differentindex.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines
;
; A store that has a different index than the load it is storing is
diff --git a/polly/test/Simplify/redundant_region.ll b/polly/test/Simplify/redundant_region.ll
index dbcb420..927aac6 100644
--- a/polly/test/Simplify/redundant_region.ll
+++ b/polly/test/Simplify/redundant_region.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines
;
; Remove redundant store (a store that writes the same value already
; at the destination) in a region.
diff --git a/polly/test/Simplify/redundant_region_scalar.ll b/polly/test/Simplify/redundant_region_scalar.ll
index 95a581a..72d570d 100644
--- a/polly/test/Simplify/redundant_region_scalar.ll
+++ b/polly/test/Simplify/redundant_region_scalar.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines
;
; Remove redundant store (a store that writes the same value already
; at the destination) in a region.
diff --git a/polly/test/Simplify/redundant_scalarwrite.ll b/polly/test/Simplify/redundant_scalarwrite.ll
index e2f7bbe..84cb971 100644
--- a/polly/test/Simplify/redundant_scalarwrite.ll
+++ b/polly/test/Simplify/redundant_scalarwrite.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines
;
; Remove redundant scalar stores.
;
diff --git a/polly/test/Simplify/redundant_storebetween.ll b/polly/test/Simplify/redundant_storebetween.ll
index f624b6e..6540d77 100644
--- a/polly/test/Simplify/redundant_storebetween.ll
+++ b/polly/test/Simplify/redundant_storebetween.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines
;
; Don't remove store where there is another store to the same target
diff --git a/polly/test/Simplify/scalability1.ll b/polly/test/Simplify/scalability1.ll
index 0ef99ce..c6e36f9 100644
--- a/polly/test/Simplify/scalability1.ll
+++ b/polly/test/Simplify/scalability1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-ignore-inbounds -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-ignore-inbounds '-passes=print<polly-simplify>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Test scalability.
;
diff --git a/polly/test/Simplify/scalability2.ll b/polly/test/Simplify/scalability2.ll
index bac0810..adcf9ee 100644
--- a/polly/test/Simplify/scalability2.ll
+++ b/polly/test/Simplify/scalability2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-ignore-inbounds -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-ignore-inbounds '-passes=print<polly-simplify>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Test scalability.
;
diff --git a/polly/test/Simplify/sweep_mapped_phi.ll b/polly/test/Simplify/sweep_mapped_phi.ll
index add1681..495d77a 100644
--- a/polly/test/Simplify/sweep_mapped_phi.ll
+++ b/polly/test/Simplify/sweep_mapped_phi.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines
;
; Map %phi to A[j], so the scalar write in Stmt_for_bodyA can be removed.
;
diff --git a/polly/test/Simplify/sweep_mapped_value.ll b/polly/test/Simplify/sweep_mapped_value.ll
index 2e2f9c3..c83941a 100644
--- a/polly/test/Simplify/sweep_mapped_value.ll
+++ b/polly/test/Simplify/sweep_mapped_value.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines
;
; Map %val to A[j], so the scalar write on Stmt_for_bodyB can be removed.
;
diff --git a/polly/test/Simplify/ununsed_read_in_region_entry.ll b/polly/test/Simplify/ununsed_read_in_region_entry.ll
index 9b2d452..f2436c2 100644
--- a/polly/test/Simplify/ununsed_read_in_region_entry.ll
+++ b/polly/test/Simplify/ununsed_read_in_region_entry.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-simplify -disable-output< %s | FileCheck %s -match-full-lines
-; RUN: opt %loadPolly -polly-simplify -polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly '-passes=print<polly-simplify>' -disable-output< %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=polly-simplify,polly-codegen' -S < %s | FileCheck %s -check-prefix=CODEGEN
;
; for (int i = 0; i < n; i+=1) {
; (void)A[0];
diff --git a/polly/test/Support/Plugins.ll b/polly/test/Support/Plugins.ll
index cee878f..872a32f 100644
--- a/polly/test/Support/Plugins.ll
+++ b/polly/test/Support/Plugins.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes='polly-prepare,scop(print<polly-ast>)' -S < %s \
+; RUN: opt %loadNPMPolly '-passes=polly-prepare,scop(print<polly-ast>)' -S < %s \
; RUN: | FileCheck %s
; This testcase tests plugin registration. Check-lines below serve to verify
diff --git a/polly/test/Support/isl-args.ll b/polly/test/Support/isl-args.ll
index efa9419..206cb73 100644
--- a/polly/test/Support/isl-args.ll
+++ b/polly/test/Support/isl-args.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadPolly -polly-scops -disable-output -polly-isl-arg=-V < %s | FileCheck %s -match-full-lines --check-prefix=VERSION
-; RUN: opt %loadPolly -polly-scops -disable-output -polly-isl-arg=-h < %s | FileCheck %s -match-full-lines --check-prefix=HELP
-; RUN: not opt %loadPolly -polly-scops -disable-output -polly-isl-arg=-asdf < %s 2>&1| FileCheck %s -match-full-lines --check-prefix=UNKNOWN
-; RUN: opt %loadPolly -polly-scops -disable-output -polly-isl-arg=--schedule-algorithm=feautrier < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-isl-arg=-V < %s | FileCheck %s -match-full-lines --check-prefix=VERSION
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-isl-arg=-h < %s | FileCheck %s -match-full-lines --check-prefix=HELP
+; RUN: not opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-isl-arg=-asdf < %s 2>&1| FileCheck %s -match-full-lines --check-prefix=UNKNOWN
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-isl-arg=--schedule-algorithm=feautrier < %s
; VERSION: isl-{{.*}}-IMath-32
; HELP: Usage: -polly-isl-arg [OPTION...]
diff --git a/polly/test/lit.site.cfg.in b/polly/test/lit.site.cfg.in
index b440612..d8a0b6a 100644
--- a/polly/test/lit.site.cfg.in
+++ b/polly/test/lit.site.cfg.in
@@ -48,7 +48,6 @@ else:
config.substitutions.append(('%loadPolly', commonOpts ))
config.substitutions.append(('%loadNPMPolly', commonOpts ))
-
import lit.llvm
lit.llvm.initialize(lit_config, config)
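For reference, %loadPolly and %loadNPMPolly are ordinary lit substitutions: the site config builds one option string and registers it under both names, as the retained context lines above show. A minimal sketch of that mechanism follows (plain Python in the style of a lit site config; config.link_polly_into_tools and config.polly_obj_root are assumed names used only for illustration, not necessarily what lit.site.cfg.in actually defines):

# Minimal sketch: define the %loadPolly / %loadNPMPolly substitutions.
# When Polly is linked into opt no extra flag is needed; otherwise the
# new-pass-manager plugin is loaded explicitly.
if config.link_polly_into_tools:
    commonOpts = ''
else:
    commonOpts = '-load-pass-plugin=' + config.polly_obj_root + '/lib/LLVMPolly.so'
config.substitutions.append(('%loadPolly', commonOpts))
config.substitutions.append(('%loadNPMPolly', commonOpts))

With that in place, a RUN line such as "opt %loadNPMPolly '-passes=print<polly-simplify>' ..." expands to a plain opt invocation with the option string spliced in where the substitution appears.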
diff --git a/polly/test/polly.ll b/polly/test/polly.ll
index f78ccea..2e455b3 100644
--- a/polly/test/polly.ll
+++ b/polly/test/polly.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-scops -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -S < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define void @foo() nounwind {
start:
diff --git a/utils/bazel/.bazelrc b/utils/bazel/.bazelrc
index 5a6d188..09111bc 100644
--- a/utils/bazel/.bazelrc
+++ b/utils/bazel/.bazelrc
@@ -51,9 +51,6 @@ build --experimental_cc_shared_library
build:zlib_external --repo_env=BAZEL_LLVM_ZLIB_STRATEGY=external
build:zlib_system --repo_env=BAZEL_LLVM_ZLIB_STRATEGY=system
-build:terminfo_external --repo_env=BAZEL_LLVM_TERMINFO_STRATEGY=external
-build:terminfo_system --repo_env=BAZEL_LLVM_TERMINFO_STRATEGY=system
-
###############################################################################
# Options for "generic_clang" builds: these options should generally apply to
# builds using a Clang-based compiler, and default to the `clang` executable on
diff --git a/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel b/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel
index 1c12c81..7413b01 100644
--- a/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel
@@ -167,7 +167,10 @@ cc_library(
]),
hdrs = glob([
"include/bolt/Passes/*.h",
- ]),
+ ]) + [
+ # To avoid circular dependency on "Profile".
+ "include/bolt/Profile/BoltAddressTranslation.h",
+ ],
includes = ["include"],
deps = [
":Core",
diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index c469da7..81e12b7 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -1881,6 +1881,7 @@ cc_library(
":Instrumentation",
":MC",
":MCParser",
+ ":ObjCARC",
":Object",
":ProfileData",
":Remarks",
diff --git a/utils/bazel/llvm-project-overlay/llvm/driver.bzl b/utils/bazel/llvm-project-overlay/llvm/driver.bzl
index 10796d9..a57a14e 100644
--- a/utils/bazel/llvm-project-overlay/llvm/driver.bzl
+++ b/utils/bazel/llvm-project-overlay/llvm/driver.bzl
@@ -39,6 +39,7 @@ _EXTRA_ALIASES = {
"clang": ["clang++", "clang-cl", "clang-cpp"],
"lld": ["ld", "lld-link", "ld.lld", "ld64.lld", "wasm-ld"],
"llvm-ar": ["ranlib", "lib", "dlltool"],
+ "llvm-cxxfilt": ["c++filt"],
"llvm-objcopy": ["bitcode-strip", "install-name-tool", "strip"],
"llvm-objdump": ["otool"],
"llvm-rc": ["windres"],
diff --git a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h
index e9385f4..a4fb47d 100644
--- a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h
+++ b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h
@@ -222,9 +222,6 @@
/* Define to 1 if you have the <sys/types.h> header file. */
#define HAVE_SYS_TYPES_H 1
-/* Define if the setupterm() function is supported this platform. */
-/* LLVM_ENABLE_TERMINFO defined in Bazel */
-
/* Define to 1 if you have the <termios.h> header file. */
#define HAVE_TERMIOS_H 1
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index fc449e9..a7bbe459 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -51,10 +51,7 @@ expand_template(
"#cmakedefine01 MLIR_ENABLE_NVPTXCOMPILER": "#define MLIR_ENABLE_NVPTXCOMPILER 0",
"#cmakedefine01 MLIR_ENABLE_PDL_IN_PATTERNMATCH": "#define MLIR_ENABLE_PDL_IN_PATTERNMATCH 1",
"#cmakedefine01 MLIR_ENABLE_ROCM_CONVERSIONS": "#define MLIR_ENABLE_ROCM_CONVERSIONS 0",
- } | if_cuda_available(
- {"#cmakedefine01 MLIR_ENABLE_CUDA_CONVERSIONS": "#define MLIR_ENABLE_CUDA_CONVERSIONS 1"},
- {"#cmakedefine01 MLIR_ENABLE_CUDA_CONVERSIONS": "#define MLIR_ENABLE_CUDA_CONVERSIONS 0"},
- ),
+ },
template = "include/mlir/Config/mlir-config.h.cmake",
)
@@ -5616,7 +5613,6 @@ cc_library(
":Transforms",
":VectorToLLVM",
":VectorToSCF",
- ":config",
],
)
@@ -6282,7 +6278,6 @@ cc_library(
":NVVMToLLVMIRTranslation",
":TargetLLVM",
":ToLLVMIRTranslation",
- ":config",
"//llvm:NVPTXCodeGen",
"//llvm:Support",
"//llvm:config",
@@ -7597,7 +7592,6 @@ cc_library(
"include/mlir/Transforms/LoopInvariantCodeMotionUtils.h",
"include/mlir/Transforms/OneToNTypeConversion.h",
"include/mlir/Transforms/RegionUtils.h",
- "include/mlir/Transforms/TopologicalSortUtils.h",
],
includes = ["include"],
deps = [
@@ -8367,6 +8361,7 @@ cc_library(
":ArithDialect",
":ConversionPassIncGen",
":EmitCDialect",
+ ":PDLLAST",
":Pass",
":TransformUtils",
],
@@ -8723,6 +8718,7 @@ cc_library(
],
includes = ["include"],
deps = [
+ ":Analysis",
":DLTIDialect",
":IR",
":LLVMConversionIncGen",
@@ -8957,6 +8953,7 @@ cc_library(
hdrs = glob(["include/mlir/Target/LLVMIR/Dialect/OpenACC/*.h"]),
includes = ["include"],
deps = [
+ ":Analysis",
":IR",
":LLVMDialect",
":OpenACCDialect",
@@ -8976,6 +8973,7 @@ cc_library(
hdrs = glob(["include/mlir/Target/LLVMIR/Dialect/OpenMP/*.h"]),
includes = ["include"],
deps = [
+ ":Analysis",
":IR",
":LLVMDialect",
":OpenMPCommon",
@@ -9360,7 +9358,6 @@ cc_library(
":X86VectorTransforms",
":XeGPUDialect",
":XeGPUTransforms",
- ":config",
],
)
diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
index 258cc88..fdf89d0 100644
--- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
@@ -36,7 +36,7 @@ expand_template(
"\"@MLIR_BINARY_DIR@\"": "os.environ[\"TEST_UNDECLARED_OUTPUTS_DIR\"]",
# All disabled, but required to substituted because they are not in quotes.
"@LLVM_BUILD_EXAMPLES@": "0",
- "@MLIR_ENABLE_CUDA_CONVERSIONS@": "0",
+ "@LLVM_HAS_NVPTX_TARGET@": "0",
"@MLIR_ENABLE_CUDA_RUNNER@": "0",
"@MLIR_ENABLE_ROCM_CONVERSIONS@": "0",
"@MLIR_ENABLE_ROCM_RUNNER@": "0",
@@ -608,6 +608,7 @@ cc_library(
":TestDialect",
"//mlir:FuncDialect",
"//mlir:FuncToLLVM",
+ "//mlir:IR",
"//mlir:LLVMCommonConversion",
"//mlir:LLVMDialect",
"//mlir:Pass",
@@ -951,10 +952,10 @@ cc_library(
"//mlir:ArmSMEToSCF",
"//mlir:ArmSMETransforms",
"//mlir:ArmSVETransforms",
- "//mlir:FuncDialect",
+ "//mlir:FuncDialect",
"//mlir:IR",
"//mlir:Pass",
- "//mlir:SCFToControlFlow",
+ "//mlir:SCFToControlFlow",
"//mlir:Transforms",
"//mlir:VectorToArmSME",
"//mlir:VectorToSCF",
diff --git a/utils/bazel/llvm_configs/config.h.cmake b/utils/bazel/llvm_configs/config.h.cmake
index 977c182..ff30741 100644
--- a/utils/bazel/llvm_configs/config.h.cmake
+++ b/utils/bazel/llvm_configs/config.h.cmake
@@ -209,9 +209,6 @@
/* Define to 1 if you have the <sys/types.h> header file. */
#cmakedefine HAVE_SYS_TYPES_H ${HAVE_SYS_TYPES_H}
-/* Define if the setupterm() function is supported this platform. */
-#cmakedefine LLVM_ENABLE_TERMINFO ${LLVM_ENABLE_TERMINFO}
-
/* Define to 1 if you have the <termios.h> header file. */
#cmakedefine HAVE_TERMIOS_H ${HAVE_TERMIOS_H}