aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--bolt/lib/Profile/DataAggregator.cpp4
-rw-r--r--bolt/lib/Profile/YAMLProfileWriter.cpp2
-rw-r--r--clang/lib/Basic/Targets/AVR.cpp34
-rw-r--r--clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp2
-rw-r--r--clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp57
-rw-r--r--clang/lib/Driver/Driver.cpp12
-rw-r--r--clang/lib/Headers/hlsl/hlsl_compat_overloads.h2
-rw-r--r--clang/test/CodeGen/builtins-nvptx-native-half-type-native.c2
-rw-r--r--clang/test/CodeGen/builtins-nvptx-native-half-type.c2
-rw-r--r--flang/lib/Optimizer/Builder/TemporaryStorage.cpp8
-rw-r--r--flang/test/HLFIR/order_assignments/forall-pointer-assignment-codegen.fir6
-rw-r--r--flang/test/Lower/forall-pointer-assignment.f90 (renamed from flang/test/Lower/forall-polymorphic.f90)46
-rw-r--r--libcxx/docs/FeatureTestMacroTable.rst4
-rw-r--r--libcxx/include/__flat_set/flat_multiset.h272
-rw-r--r--libcxx/include/version4
-rw-r--r--libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert.temporary.pass.cpp5
-rw-r--r--libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert_range.pass.cpp19
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/empty.pass.cpp14
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/max_size.pass.cpp7
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/size.pass.cpp18
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/alloc.pass.cpp37
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/assign_initializer_list.pass.cpp14
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/compare.pass.cpp75
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/containers.pass.cpp71
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy.pass.cpp22
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_alloc.pass.cpp24
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_assign.pass.cpp24
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/default.pass.cpp43
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/dtor_noexcept.pass.cpp31
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/initializer_list.pass.cpp50
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/iter_iter.pass.cpp36
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move.pass.cpp24
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_alloc.pass.cpp22
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_assign.pass.cpp48
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/range.pass.cpp38
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_container.pass.cpp54
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_initializer_list.pass.cpp43
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_iter_iter.pass.cpp41
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.erasure/erase_if.pass.cpp26
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator.pass.cpp14
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator_comparison.pass.cpp14
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/reverse_iterator.pass.cpp92
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/clear.pass.cpp14
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace.pass.cpp21
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace_hint.pass.cpp19
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter.pass.cpp14
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter_iter.pass.cpp14
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key.pass.cpp14
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key_transparent.pass.cpp29
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/extract.pass.cpp14
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_cv.pass.cpp14
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_initializer_list.pass.cpp14
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_cv.pass.cpp14
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_iter.pass.cpp14
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_rv.pass.cpp18
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_range.pass.cpp14
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_rv.pass.cpp18
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_initializer_list.pass.cpp14
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_iter_iter.pass.cpp14
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/replace.pass.cpp14
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_free.pass.cpp14
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_member.pass.cpp14
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.observers/comp.pass.cpp9
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains.pass.cpp14
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains_transparent.pass.cpp14
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count.pass.cpp14
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count_transparent.pass.cpp14
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range.pass.cpp14
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range_transparent.pass.cpp14
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find.pass.cpp14
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find_transparent.pass.cpp14
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound.pass.cpp14
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound_transparent.pass.cpp14
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound.pass.cpp14
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound_transparent.pass.cpp14
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/helpers.h2
-rw-r--r--libcxx/test/std/containers/container.adaptors/flat.multiset/op_compare.pass.cpp16
-rw-r--r--libcxx/test/std/language.support/support.limits/support.limits.general/flat_map.version.compile.pass.cpp27
-rw-r--r--libcxx/test/std/language.support/support.limits/support.limits.general/flat_set.version.compile.pass.cpp27
-rw-r--r--libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp54
-rw-r--r--libcxx/utils/generate_feature_test_macro_components.py10
-rw-r--r--lld/MachO/Arch/X86_64.cpp2
-rw-r--r--lld/MachO/InputSection.cpp3
-rw-r--r--lld/test/MachO/invalid/bad-offsets.s45
-rw-r--r--llvm/docs/CommandGuide/llvm-config.rst5
-rw-r--r--llvm/docs/LangRef.rst22
-rw-r--r--llvm/docs/ReleaseNotes.md4
-rw-r--r--llvm/include/llvm/ADT/AddressRanges.h2
-rw-r--r--llvm/include/llvm/ADT/StringMap.h2
-rw-r--r--llvm/include/llvm/ADT/StringMapEntry.h10
-rw-r--r--llvm/include/llvm/ADT/StringSet.h4
-rw-r--r--llvm/include/llvm/ADT/StringSwitch.h1
-rw-r--r--llvm/include/llvm/Analysis/IR2Vec.h2
-rw-r--r--llvm/include/llvm/CodeGen/MIR2Vec.h26
-rw-r--r--llvm/include/llvm/CodeGenTypes/LowLevelType.h10
-rw-r--r--llvm/include/llvm/DWARFLinker/StringPool.h2
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/Shared/SymbolFilter.h173
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h511
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h474
-rw-r--r--llvm/include/llvm/IR/ConstantFold.h71
-rw-r--r--llvm/include/llvm/IR/DataLayout.h2
-rw-r--r--llvm/include/llvm/IR/IntrinsicsNVVM.td11
-rw-r--r--llvm/include/llvm/IR/ModuleSummaryIndexYAML.h16
-rw-r--r--llvm/include/llvm/Object/SFrameParser.h2
-rw-r--r--llvm/include/llvm/ProfileData/MemProfYAML.h2
-rw-r--r--llvm/include/llvm/Support/ELFAttributeParser.h2
-rw-r--r--llvm/include/llvm/Support/GraphWriter.h4
-rw-r--r--llvm/include/llvm/Support/JSON.h2
-rw-r--r--llvm/include/llvm/Support/SourceMgr.h6
-rw-r--r--llvm/include/llvm/Support/VirtualFileSystem.h5
-rw-r--r--llvm/include/llvm/Support/VirtualOutputBackend.h4
-rw-r--r--llvm/include/llvm/Support/VirtualOutputBackends.h4
-rw-r--r--llvm/include/llvm/Support/VirtualOutputError.h4
-rw-r--r--llvm/include/llvm/Support/VirtualOutputFile.h8
-rw-r--r--llvm/include/llvm/Support/YAMLTraits.h4
-rw-r--r--llvm/include/llvm/Transforms/IPO/InferFunctionAttrs.h2
-rw-r--r--llvm/include/llvm/Transforms/Instrumentation/SanitizerCoverage.h2
-rw-r--r--llvm/lib/BinaryFormat/MsgPackDocumentYAML.cpp4
-rw-r--r--llvm/lib/CGData/OutlinedHashTreeRecord.cpp4
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp20
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h2
-rw-r--r--llvm/lib/CodeGenTypes/LowLevelType.cpp6
-rw-r--r--llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp3
-rw-r--r--llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt4
-rw-r--r--llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp370
-rw-r--r--llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp1161
-rw-r--r--llvm/lib/IR/AutoUpgrade.cpp9
-rw-r--r--llvm/lib/MC/MCParser/MasmParser.cpp6
-rw-r--r--llvm/lib/Support/APFloat.cpp6
-rw-r--r--llvm/lib/Support/Windows/Signals.inc7
-rw-r--r--llvm/lib/Support/raw_ostream.cpp11
-rw-r--r--llvm/lib/Support/raw_socket_stream.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp28
-rw-r--r--llvm/lib/Target/CSKY/CSKYISelLowering.cpp2
-rw-r--r--llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp2
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXIntrinsics.td13
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp2
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h3
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp156
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h3
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp18
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.h24
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp7
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp48
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp16
-rw-r--r--llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll17
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll199
-rw-r--r--llvm/test/CodeGen/AMDGPU/wait-xcnt.mir176
-rw-r--r--llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll63
-rw-r--r--llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll63
-rw-r--r--llvm/test/CodeGen/LoongArch/sink-fold-addi.ll758
-rw-r--r--llvm/test/CodeGen/NVPTX/f16-ex2.ll40
-rw-r--r--llvm/test/CodeGen/NVPTX/f32-ex2.ll7
-rw-r--r--llvm/test/CodeGen/RISCV/rv64-stackmap.ll108
-rw-r--r--llvm/test/CodeGen/X86/bittest-big-integer.ll6886
-rwxr-xr-xllvm/test/DebugInfo/PDB/Native/pdb-native-index-overflow.test13
-rw-r--r--llvm/test/Transforms/InstCombine/vec_extract_var_elt-inseltpoison.ll26
-rw-r--r--llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll73
-rw-r--r--llvm/test/Transforms/SLPVectorizer/AArch64/div-like-mixed-with-undefs.ll34
-rw-r--r--llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/check_empty.ll29
-rw-r--r--llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/check_empty.ll.expected57
-rw-r--r--llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll.expected36
-rw-r--r--llvm/test/tools/UpdateTestChecks/update_test_checks/check_empty.test3
-rw-r--r--llvm/test/tools/llvm-config/paths.test16
-rw-r--r--llvm/tools/llvm-config/llvm-config.cpp87
-rw-r--r--llvm/tools/llvm-exegesis/lib/BenchmarkResult.cpp2
-rw-r--r--llvm/unittests/ADT/ConcurrentHashtableTest.cpp2
-rw-r--r--llvm/unittests/ADT/DirectedGraphTest.cpp2
-rw-r--r--llvm/unittests/ADT/IListTest.cpp2
-rw-r--r--llvm/unittests/ADT/SmallVectorTest.cpp2
-rw-r--r--llvm/unittests/ADT/StringMapTest.cpp2
-rw-r--r--llvm/unittests/ADT/TypeSwitchTest.cpp2
-rw-r--r--llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt1
-rw-r--r--llvm/unittests/ExecutionEngine/Orc/Inputs/A/A_linux.yaml460
-rw-r--r--llvm/unittests/ExecutionEngine/Orc/Inputs/A/A_macho.yaml723
-rw-r--r--llvm/unittests/ExecutionEngine/Orc/Inputs/B/B_linux.yaml460
-rw-r--r--llvm/unittests/ExecutionEngine/Orc/Inputs/B/B_macho.yaml723
-rw-r--r--llvm/unittests/ExecutionEngine/Orc/Inputs/C/C_linux.yaml450
-rw-r--r--llvm/unittests/ExecutionEngine/Orc/Inputs/C/C_macho.yaml870
-rw-r--r--llvm/unittests/ExecutionEngine/Orc/Inputs/Z/Z_linux.yaml460
-rw-r--r--llvm/unittests/ExecutionEngine/Orc/Inputs/Z/Z_macho.yaml723
-rw-r--r--llvm/unittests/ExecutionEngine/Orc/LibraryResolverTest.cpp764
-rw-r--r--llvm/unittests/Support/AlignOfTest.cpp16
-rw-r--r--llvm/unittests/Support/AllocatorTest.cpp2
-rw-r--r--llvm/unittests/Support/BinaryStreamTest.cpp2
-rw-r--r--llvm/unittests/Support/Casting.cpp10
-rw-r--r--llvm/unittests/Support/InstructionCostTest.cpp2
-rw-r--r--llvm/unittests/Support/OptimizedStructLayoutTest.cpp2
-rw-r--r--llvm/unittests/Support/YAMLIOTest.cpp4
-rw-r--r--llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp15
-rw-r--r--llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp2
-rw-r--r--llvm/utils/TableGen/Basic/TargetFeaturesEmitter.h2
-rw-r--r--llvm/utils/TableGen/Common/CodeGenTarget.cpp2
-rw-r--r--llvm/utils/TableGen/Common/DAGISelMatcher.h2
-rw-r--r--llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp8
-rw-r--r--llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h4
-rw-r--r--llvm/utils/TableGen/FastISelEmitter.cpp2
-rw-r--r--llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp2
-rw-r--r--llvm/utils/TableGen/X86DisassemblerTables.cpp2
-rw-r--r--llvm/utils/TableGen/X86ModRMFilters.h2
-rw-r--r--llvm/utils/UpdateTestChecks/common.py23
-rw-r--r--mlir/include/mlir/Support/Timing.h2
-rw-r--r--mlir/lib/Support/Timing.cpp3
-rw-r--r--runtimes/cmake/Modules/HandleLibC.cmake1
204 files changed, 18248 insertions, 1578 deletions
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index dc3d918..e44d956 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -2215,7 +2215,7 @@ DataAggregator::writeAggregatedFile(StringRef OutputFilename) const {
OutFile << "boltedcollection\n";
if (opts::BasicAggregation) {
OutFile << "no_lbr";
- for (const StringMapEntry<std::nullopt_t> &Entry : EventNames)
+ for (const StringMapEntry<EmptyStringSetTag> &Entry : EventNames)
OutFile << " " << Entry.getKey();
OutFile << "\n";
@@ -2291,7 +2291,7 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC,
ListSeparator LS(",");
raw_string_ostream EventNamesOS(BP.Header.EventNames);
- for (const StringMapEntry<std::nullopt_t> &EventEntry : EventNames)
+ for (const StringMapEntry<EmptyStringSetTag> &EventEntry : EventNames)
EventNamesOS << LS << EventEntry.first().str();
BP.Header.Flags = opts::BasicAggregation ? BinaryFunction::PF_BASIC
diff --git a/bolt/lib/Profile/YAMLProfileWriter.cpp b/bolt/lib/Profile/YAMLProfileWriter.cpp
index 1632aa1..5c631f9 100644
--- a/bolt/lib/Profile/YAMLProfileWriter.cpp
+++ b/bolt/lib/Profile/YAMLProfileWriter.cpp
@@ -382,7 +382,7 @@ std::error_code YAMLProfileWriter::writeProfile(const RewriteInstance &RI) {
StringSet<> EventNames = RI.getProfileReader()->getEventNames();
if (!EventNames.empty()) {
std::string Sep;
- for (const StringMapEntry<std::nullopt_t> &EventEntry : EventNames) {
+ for (const StringMapEntry<EmptyStringSetTag> &EventEntry : EventNames) {
BP.Header.EventNames += Sep + EventEntry.first().str();
Sep = ",";
}
diff --git a/clang/lib/Basic/Targets/AVR.cpp b/clang/lib/Basic/Targets/AVR.cpp
index 2673669..90b4ac1 100644
--- a/clang/lib/Basic/Targets/AVR.cpp
+++ b/clang/lib/Basic/Targets/AVR.cpp
@@ -30,13 +30,13 @@ struct LLVM_LIBRARY_VISIBILITY MCUInfo {
// NOTE: This list has been synchronized with gcc-avr 5.4.0 and avr-libc 2.0.0.
static MCUInfo AVRMcus[] = {
- {"avr1", NULL, "1", 0},
+ {"avr1", nullptr, "1", 0},
{"at90s1200", "__AVR_AT90S1200__", "1", 0},
{"attiny11", "__AVR_ATtiny11__", "1", 0},
{"attiny12", "__AVR_ATtiny12__", "1", 0},
{"attiny15", "__AVR_ATtiny15__", "1", 0},
{"attiny28", "__AVR_ATtiny28__", "1", 0},
- {"avr2", NULL, "2", 1},
+ {"avr2", nullptr, "2", 1},
{"at90s2313", "__AVR_AT90S2313__", "2", 1},
{"at90s2323", "__AVR_AT90S2323__", "2", 1},
{"at90s2333", "__AVR_AT90S2333__", "2", 1},
@@ -50,7 +50,7 @@ static MCUInfo AVRMcus[] = {
{"at90s8515", "__AVR_AT90S8515__", "2", 1},
{"at90c8534", "__AVR_AT90c8534__", "2", 1},
{"at90s8535", "__AVR_AT90S8535__", "2", 1},
- {"avr25", NULL, "25", 1},
+ {"avr25", nullptr, "25", 1},
{"ata5272", "__AVR_ATA5272__", "25", 1},
{"ata6616c", "__AVR_ATA6616c__", "25", 1},
{"attiny13", "__AVR_ATtiny13__", "25", 1},
@@ -80,13 +80,13 @@ static MCUInfo AVRMcus[] = {
{"attiny48", "__AVR_ATtiny48__", "25", 1},
{"attiny88", "__AVR_ATtiny88__", "25", 1},
{"attiny828", "__AVR_ATtiny828__", "25", 1},
- {"avr3", NULL, "3", 1},
+ {"avr3", nullptr, "3", 1},
{"at43usb355", "__AVR_AT43USB355__", "3", 1},
{"at76c711", "__AVR_AT76C711__", "3", 1},
- {"avr31", NULL, "31", 1},
+ {"avr31", nullptr, "31", 1},
{"atmega103", "__AVR_ATmega103__", "31", 1},
{"at43usb320", "__AVR_AT43USB320__", "31", 1},
- {"avr35", NULL, "35", 1},
+ {"avr35", nullptr, "35", 1},
{"attiny167", "__AVR_ATtiny167__", "35", 1},
{"at90usb82", "__AVR_AT90USB82__", "35", 1},
{"at90usb162", "__AVR_AT90USB162__", "35", 1},
@@ -97,7 +97,7 @@ static MCUInfo AVRMcus[] = {
{"atmega16u2", "__AVR_ATmega16U2__", "35", 1},
{"atmega32u2", "__AVR_ATmega32U2__", "35", 1},
{"attiny1634", "__AVR_ATtiny1634__", "35", 1},
- {"avr4", NULL, "4", 1},
+ {"avr4", nullptr, "4", 1},
{"atmega8", "__AVR_ATmega8__", "4", 1},
{"ata6289", "__AVR_ATA6289__", "4", 1},
{"atmega8a", "__AVR_ATmega8A__", "4", 1},
@@ -123,7 +123,7 @@ static MCUInfo AVRMcus[] = {
{"at90pwm3", "__AVR_AT90PWM3__", "4", 1},
{"at90pwm3b", "__AVR_AT90PWM3B__", "4", 1},
{"at90pwm81", "__AVR_AT90PWM81__", "4", 1},
- {"avr5", NULL, "5", 1},
+ {"avr5", nullptr, "5", 1},
{"ata5702m322", "__AVR_ATA5702M322__", "5", 1},
{"ata5782", "__AVR_ATA5782__", "5", 1},
{"ata5790", "__AVR_ATA5790__", "5", 1},
@@ -230,7 +230,7 @@ static MCUInfo AVRMcus[] = {
{"at90scr100", "__AVR_AT90SCR100__", "5", 1},
{"at94k", "__AVR_AT94K__", "5", 1},
{"m3000", "__AVR_AT000__", "5", 1},
- {"avr51", NULL, "51", 2},
+ {"avr51", nullptr, "51", 2},
{"atmega128", "__AVR_ATmega128__", "51", 2},
{"atmega128a", "__AVR_ATmega128A__", "51", 2},
{"atmega1280", "__AVR_ATmega1280__", "51", 2},
@@ -243,12 +243,12 @@ static MCUInfo AVRMcus[] = {
{"at90can128", "__AVR_AT90CAN128__", "51", 2},
{"at90usb1286", "__AVR_AT90USB1286__", "51", 2},
{"at90usb1287", "__AVR_AT90USB1287__", "51", 2},
- {"avr6", NULL, "6", 4},
+ {"avr6", nullptr, "6", 4},
{"atmega2560", "__AVR_ATmega2560__", "6", 4},
{"atmega2561", "__AVR_ATmega2561__", "6", 4},
{"atmega256rfr2", "__AVR_ATmega256RFR2__", "6", 4},
{"atmega2564rfr2", "__AVR_ATmega2564RFR2__", "6", 4},
- {"avrxmega2", NULL, "102", 1},
+ {"avrxmega2", nullptr, "102", 1},
{"atxmega16a4", "__AVR_ATxmega16A4__", "102", 1},
{"atxmega16a4u", "__AVR_ATxmega16A4U__", "102", 1},
{"atxmega16c4", "__AVR_ATxmega16C4__", "102", 1},
@@ -262,7 +262,7 @@ static MCUInfo AVRMcus[] = {
{"atxmega32e5", "__AVR_ATxmega32E5__", "102", 1},
{"atxmega16e5", "__AVR_ATxmega16E5__", "102", 1},
{"atxmega8e5", "__AVR_ATxmega8E5__", "102", 1},
- {"avrxmega4", NULL, "104", 1},
+ {"avrxmega4", nullptr, "104", 1},
{"atxmega64a3", "__AVR_ATxmega64A3__", "104", 1},
{"atxmega64a3u", "__AVR_ATxmega64A3U__", "104", 1},
{"atxmega64a4u", "__AVR_ATxmega64A4U__", "104", 1},
@@ -271,10 +271,10 @@ static MCUInfo AVRMcus[] = {
{"atxmega64c3", "__AVR_ATxmega64C3__", "104", 1},
{"atxmega64d3", "__AVR_ATxmega64D3__", "104", 1},
{"atxmega64d4", "__AVR_ATxmega64D4__", "104", 1},
- {"avrxmega5", NULL, "105", 1},
+ {"avrxmega5", nullptr, "105", 1},
{"atxmega64a1", "__AVR_ATxmega64A1__", "105", 1},
{"atxmega64a1u", "__AVR_ATxmega64A1U__", "105", 1},
- {"avrxmega6", NULL, "106", 6},
+ {"avrxmega6", nullptr, "106", 6},
{"atxmega128a3", "__AVR_ATxmega128A3__", "106", 2},
{"atxmega128a3u", "__AVR_ATxmega128A3U__", "106", 2},
{"atxmega128b1", "__AVR_ATxmega128B1__", "106", 2},
@@ -294,11 +294,11 @@ static MCUInfo AVRMcus[] = {
{"atxmega256d3", "__AVR_ATxmega256D3__", "106", 4},
{"atxmega384c3", "__AVR_ATxmega384C3__", "106", 6},
{"atxmega384d3", "__AVR_ATxmega384D3__", "106", 6},
- {"avrxmega7", NULL, "107", 2},
+ {"avrxmega7", nullptr, "107", 2},
{"atxmega128a1", "__AVR_ATxmega128A1__", "107", 2},
{"atxmega128a1u", "__AVR_ATxmega128A1U__", "107", 2},
{"atxmega128a4u", "__AVR_ATxmega128A4U__", "107", 2},
- {"avrtiny", NULL, "100", 0},
+ {"avrtiny", nullptr, "100", 0},
{"attiny4", "__AVR_ATtiny4__", "100", 0},
{"attiny5", "__AVR_ATtiny5__", "100", 0},
{"attiny9", "__AVR_ATtiny9__", "100", 0},
@@ -307,7 +307,7 @@ static MCUInfo AVRMcus[] = {
{"attiny40", "__AVR_ATtiny40__", "100", 0},
{"attiny102", "__AVR_ATtiny102__", "100", 0},
{"attiny104", "__AVR_ATtiny104__", "100", 0},
- {"avrxmega3", NULL, "103", 1},
+ {"avrxmega3", nullptr, "103", 1},
{"attiny202", "__AVR_ATtiny202__", "103", 1},
{"attiny402", "__AVR_ATtiny402__", "103", 1},
{"attiny204", "__AVR_ATtiny204__", "103", 1},
diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp
index 5010137..527dfd2 100644
--- a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp
@@ -126,7 +126,7 @@ class OpenACCClauseCIREmitter final
.CaseLower("default", mlir::acc::DeviceType::Default)
.CaseLower("host", mlir::acc::DeviceType::Host)
.CaseLower("multicore", mlir::acc::DeviceType::Multicore)
- .CasesLower("nvidia", "acc_device_nvidia",
+ .CasesLower({"nvidia", "acc_device_nvidia"},
mlir::acc::DeviceType::Nvidia)
.CaseLower("radeon", mlir::acc::DeviceType::Radeon);
}
diff --git a/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp b/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp
index 6da65b6..8a1cab3 100644
--- a/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp
@@ -375,28 +375,28 @@ static Value *MakeCpAsync(unsigned IntrinsicID, unsigned IntrinsicIDS,
CGF.EmitScalarExpr(E->getArg(1))});
}
-static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID,
- const CallExpr *E, CodeGenFunction &CGF) {
+static bool EnsureNativeHalfSupport(unsigned BuiltinID, const CallExpr *E,
+ CodeGenFunction &CGF) {
auto &C = CGF.CGM.getContext();
- if (!(C.getLangOpts().NativeHalfType ||
- !C.getTargetInfo().useFP16ConversionIntrinsics())) {
+ if (!C.getLangOpts().NativeHalfType &&
+ C.getTargetInfo().useFP16ConversionIntrinsics()) {
CGF.CGM.Error(E->getExprLoc(), C.BuiltinInfo.getQuotedName(BuiltinID) +
" requires native half type support.");
- return nullptr;
+ return false;
}
+ return true;
+}
- if (BuiltinID == NVPTX::BI__nvvm_ldg_h || BuiltinID == NVPTX::BI__nvvm_ldg_h2)
- return MakeLdg(CGF, E);
-
- if (IntrinsicID == Intrinsic::nvvm_ldu_global_f)
- return MakeLdu(IntrinsicID, CGF, E);
+static Value *MakeHalfType(Function *Intrinsic, unsigned BuiltinID,
+ const CallExpr *E, CodeGenFunction &CGF) {
+ if (!EnsureNativeHalfSupport(BuiltinID, E, CGF))
+ return nullptr;
SmallVector<Value *, 16> Args;
- auto *F = CGF.CGM.getIntrinsic(IntrinsicID);
- auto *FTy = F->getFunctionType();
+ auto *FTy = Intrinsic->getFunctionType();
unsigned ICEArguments = 0;
ASTContext::GetBuiltinTypeError Error;
- C.GetBuiltinType(BuiltinID, Error, &ICEArguments);
+ CGF.CGM.getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
assert(Error == ASTContext::GE_None && "Should not codegen an error");
for (unsigned i = 0, e = E->getNumArgs(); i != e; ++i) {
assert((ICEArguments & (1 << i)) == 0);
@@ -407,8 +407,14 @@ static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID,
Args.push_back(ArgValue);
}
- return CGF.Builder.CreateCall(F, Args);
+ return CGF.Builder.CreateCall(Intrinsic, Args);
}
+
+static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID,
+ const CallExpr *E, CodeGenFunction &CGF) {
+ return MakeHalfType(CGF.CGM.getIntrinsic(IntrinsicID), BuiltinID, E, CGF);
+}
+
} // namespace
Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
@@ -913,9 +919,14 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
}
// The following builtins require half type support
case NVPTX::BI__nvvm_ex2_approx_f16:
- return MakeHalfType(Intrinsic::nvvm_ex2_approx_f16, BuiltinID, E, *this);
+ return MakeHalfType(
+ CGM.getIntrinsic(Intrinsic::nvvm_ex2_approx, Builder.getHalfTy()),
+ BuiltinID, E, *this);
case NVPTX::BI__nvvm_ex2_approx_f16x2:
- return MakeHalfType(Intrinsic::nvvm_ex2_approx_f16x2, BuiltinID, E, *this);
+ return MakeHalfType(
+ CGM.getIntrinsic(Intrinsic::nvvm_ex2_approx,
+ FixedVectorType::get(Builder.getHalfTy(), 2)),
+ BuiltinID, E, *this);
case NVPTX::BI__nvvm_ff2f16x2_rn:
return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rn, BuiltinID, E, *this);
case NVPTX::BI__nvvm_ff2f16x2_rn_relu:
@@ -1049,12 +1060,22 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
case NVPTX::BI__nvvm_fabs_d:
return Builder.CreateUnaryIntrinsic(Intrinsic::fabs,
EmitScalarExpr(E->getArg(0)));
+ case NVPTX::BI__nvvm_ex2_approx_d:
+ case NVPTX::BI__nvvm_ex2_approx_f:
+ return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_ex2_approx,
+ EmitScalarExpr(E->getArg(0)));
+ case NVPTX::BI__nvvm_ex2_approx_ftz_f:
+ return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_ex2_approx_ftz,
+ EmitScalarExpr(E->getArg(0)));
case NVPTX::BI__nvvm_ldg_h:
case NVPTX::BI__nvvm_ldg_h2:
- return MakeHalfType(Intrinsic::not_intrinsic, BuiltinID, E, *this);
+ return EnsureNativeHalfSupport(BuiltinID, E, *this) ? MakeLdg(*this, E)
+ : nullptr;
case NVPTX::BI__nvvm_ldu_h:
case NVPTX::BI__nvvm_ldu_h2:
- return MakeHalfType(Intrinsic::nvvm_ldu_global_f, BuiltinID, E, *this);
+ return EnsureNativeHalfSupport(BuiltinID, E, *this)
+ ? MakeLdu(Intrinsic::nvvm_ldu_global_f, *this, E)
+ : nullptr;
case NVPTX::BI__nvvm_cp_async_ca_shared_global_4:
return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_4,
Intrinsic::nvvm_cp_async_ca_shared_global_4_s, *this, E,
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 71c5280..51618d1 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -2540,10 +2540,14 @@ bool Driver::HandleImmediateArgs(Compilation &C) {
}
if (C.getArgs().hasArg(options::OPT_print_runtime_dir)) {
- if (std::optional<std::string> RuntimePath = TC.getRuntimePath())
- llvm::outs() << *RuntimePath << '\n';
- else
- llvm::outs() << TC.getCompilerRTPath() << '\n';
+ for (auto RuntimePath :
+ {TC.getRuntimePath(), std::make_optional(TC.getCompilerRTPath())}) {
+ if (RuntimePath && getVFS().exists(*RuntimePath)) {
+ llvm::outs() << *RuntimePath << '\n';
+ return false;
+ }
+ }
+ llvm::outs() << "(runtime dir is not present)" << '\n';
return false;
}
diff --git a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
index fe4277e..ee243ab 100644
--- a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
+++ b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
#ifndef _HLSL_COMPAT_OVERLOADS_H_
-#define _HLSl_COMPAT_OVERLOADS_H_
+#define _HLSL_COMPAT_OVERLOADS_H_
namespace hlsl {
diff --git a/clang/test/CodeGen/builtins-nvptx-native-half-type-native.c b/clang/test/CodeGen/builtins-nvptx-native-half-type-native.c
index 035c4c6..60a35f4 100644
--- a/clang/test/CodeGen/builtins-nvptx-native-half-type-native.c
+++ b/clang/test/CodeGen/builtins-nvptx-native-half-type-native.c
@@ -8,7 +8,7 @@
typedef __fp16 __fp16v2 __attribute__((ext_vector_type(2)));
// CHECK: call half @llvm.nvvm.ex2.approx.f16(half {{.*}})
-// CHECK: call <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half> {{.*}})
+// CHECK: call <2 x half> @llvm.nvvm.ex2.approx.v2f16(<2 x half> {{.*}})
// CHECK: call half @llvm.nvvm.fma.rn.relu.f16(half {{.*}}, half {{.*}}, half {{.*}})
// CHECK: call half @llvm.nvvm.fma.rn.ftz.relu.f16(half {{.*}}, half {{.*}}, half {{.*}})
// CHECK: call <2 x half> @llvm.nvvm.fma.rn.relu.f16x2(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}})
diff --git a/clang/test/CodeGen/builtins-nvptx-native-half-type.c b/clang/test/CodeGen/builtins-nvptx-native-half-type.c
index 01a004e..1f16c7e 100644
--- a/clang/test/CodeGen/builtins-nvptx-native-half-type.c
+++ b/clang/test/CodeGen/builtins-nvptx-native-half-type.c
@@ -41,7 +41,7 @@ __device__ void nvvm_ex2_sm75() {
#if __CUDA_ARCH__ >= 750
// CHECK_PTX70_SM75: call half @llvm.nvvm.ex2.approx.f16
__nvvm_ex2_approx_f16(0.1f16);
- // CHECK_PTX70_SM75: call <2 x half> @llvm.nvvm.ex2.approx.f16x2
+ // CHECK_PTX70_SM75: call <2 x half> @llvm.nvvm.ex2.approx.v2f16
__nvvm_ex2_approx_f16x2({0.1f16, 0.7f16});
#endif
// CHECK: ret void
diff --git a/flang/lib/Optimizer/Builder/TemporaryStorage.cpp b/flang/lib/Optimizer/Builder/TemporaryStorage.cpp
index 7e329e3..5db40af 100644
--- a/flang/lib/Optimizer/Builder/TemporaryStorage.cpp
+++ b/flang/lib/Optimizer/Builder/TemporaryStorage.cpp
@@ -258,13 +258,9 @@ void fir::factory::AnyVariableStack::pushValue(mlir::Location loc,
fir::FirOpBuilder &builder,
mlir::Value variable) {
hlfir::Entity entity{variable};
- mlir::Type storageElementType =
- hlfir::getFortranElementType(retValueBox.getType());
- auto [box, maybeCleanUp] =
- hlfir::convertToBox(loc, builder, entity, storageElementType);
+ mlir::Value box =
+ hlfir::genVariableBox(loc, builder, entity, entity.getBoxType());
fir::runtime::genPushDescriptor(loc, builder, opaquePtr, fir::getBase(box));
- if (maybeCleanUp)
- (*maybeCleanUp)();
}
void fir::factory::AnyVariableStack::resetFetchPosition(
diff --git a/flang/test/HLFIR/order_assignments/forall-pointer-assignment-codegen.fir b/flang/test/HLFIR/order_assignments/forall-pointer-assignment-codegen.fir
index 1d19876..855b62c 100644
--- a/flang/test/HLFIR/order_assignments/forall-pointer-assignment-codegen.fir
+++ b/flang/test/HLFIR/order_assignments/forall-pointer-assignment-codegen.fir
@@ -91,10 +91,8 @@ func.func @test_need_to_save_rhs(%n: i64, %arg1: !fir.box<!fir.array<?x!ptr_wrap
// CHECK: %[[VAL_21:.*]] = hlfir.designate %[[VAL_6]]#0 (%[[VAL_20]]) : (!fir.box<!fir.array<?x!fir.type<ptr_wrapper{p:!fir.box<!fir.ptr<!fir.type<t{i:i64}>>>}>>>, i64) -> !fir.ref<!fir.type<ptr_wrapper{p:!fir.box<!fir.ptr<!fir.type<t{i:i64}>>>}>>
// CHECK: %[[VAL_22:.*]] = hlfir.designate %[[VAL_21]]{"p"} {fortran_attrs = #fir.var_attrs<pointer>} : (!fir.ref<!fir.type<ptr_wrapper{p:!fir.box<!fir.ptr<!fir.type<t{i:i64}>>>}>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.type<t{i:i64}>>>>
// CHECK: %[[VAL_23:.*]] = fir.load %[[VAL_22]] : !fir.ref<!fir.box<!fir.ptr<!fir.type<t{i:i64}>>>>
-// CHECK: %[[VAL_24:.*]] = fir.box_addr %[[VAL_23]] : (!fir.box<!fir.ptr<!fir.type<t{i:i64}>>>) -> !fir.ptr<!fir.type<t{i:i64}>>
-// CHECK: %[[VAL_25:.*]] = fir.embox %[[VAL_24]] : (!fir.ptr<!fir.type<t{i:i64}>>) -> !fir.box<!fir.type<t{i:i64}>>
-// CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_25]] : (!fir.box<!fir.type<t{i:i64}>>) -> !fir.box<none>
-// CHECK: fir.call @_FortranAPushDescriptor(%[[VAL_16]], %[[VAL_26]]) : (!fir.llvm_ptr<i8>, !fir.box<none>) -> ()
+// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_23]] : (!fir.box<!fir.ptr<!fir.type<t{i:i64}>>>) -> !fir.box<none>
+// CHECK: fir.call @_FortranAPushDescriptor(%[[VAL_16]], %[[VAL_24]]) : (!fir.llvm_ptr<i8>, !fir.box<none>) -> ()
// CHECK: }
// CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_4]] : (i64) -> index
// CHECK: %[[VAL_28:.*]] = fir.convert %[[VAL_0]] : (i64) -> index
diff --git a/flang/test/Lower/forall-polymorphic.f90 b/flang/test/Lower/forall-pointer-assignment.f90
index 656b6ec..ec142e3 100644
--- a/flang/test/Lower/forall-polymorphic.f90
+++ b/flang/test/Lower/forall-pointer-assignment.f90
@@ -1,4 +1,4 @@
-! Test lower of FORALL polymorphic pointer assignment
+! Test lowering of FORALL pointer assignment
! RUN: bbc -emit-fir %s -o - | FileCheck %s
@@ -128,3 +128,47 @@ subroutine forallPolymorphic3()
! CHECK: }
end subroutine forallPolymorphic3
+
+
+!! Test that the LHS of a pointer assignment gets the isPointer flag from the
+!! RHS that is a reference to a function that returns a pointer.
+! CHECK-LABEL: c.func @_QPforallpointerassignment1
+ subroutine forallPointerAssignment1()
+ type base
+ real, pointer :: data => null()
+ end type
+
+ interface
+ pure function makeData (i)
+ real, pointer :: makeData
+ integer*4, intent(in) :: i
+ end function
+ end interface
+
+ type(base) :: co1(10)
+
+ forall (i=1:10)
+ co1(i)%data => makeData (i)
+ end forall
+
+! CHECK: %[[V_3:[0-9]+]] = fir.alloca i64
+! CHECK: %[[V_3:[0-9]+]] = fir.alloca i32 {bindc_name = "i"}
+! CHECK: %[[V_4:[0-9]+]] = fir.alloca !fir.box<!fir.ptr<f32>> {bindc_name = ".result"}
+! CHECK: %[[V_25:[0-9]+]] = fir.convert %c1_i32 : (i32) -> index
+! CHECK: %[[V_26:[0-9]+]] = fir.convert %c10_i32 : (i32) -> index
+! CHECK: %[[V_27:[0-9]+]] = fir.address_of(@{{_QQcl.*}}) : !fir.ref<!fir.char<1,{{.*}}>>
+! CHECK: %[[V_28:[0-9]+]] = fir.convert %[[V_27]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
+! CHECK: %[[V_29:[0-9]+]] = fir.call @_FortranACreateDescriptorStack(%[[V_28]], %c{{.*}}) : (!fir.ref<i8>, i32) -> !fir.llvm_ptr<i8>
+! CHECK: fir.do_loop %arg0 = %[[V_25]] to %[[V_26]] step %c1
+! CHECK: {
+! CHECK: %[[V_32:[0-9]+]] = fir.convert %arg0 : (index) -> i32
+! CHECK: fir.store %[[V_32]] to %[[V_3]] : !fir.ref<i32>
+! CHECK: %[[V_33:[0-9]+]] = fir.call @_QPmakedata(%[[V_3]]) proc_attrs<pure> fastmath<contract> : (!fir.ref<i32>) -> !fir.box<!fir.ptr<f32>>
+! CHECK: fir.save_result %[[V_33]] to %[[V_4]] : !fir.box<!fir.ptr<f32>>, !fir.ref<!fir.box<!fir.ptr<f32>>>
+! CHECK: %[[V_34:[0-9]+]] = fir.declare %[[V_4]] {uniq_name = ".tmp.func_result"} : (!fir.ref<!fir.box<!fir.ptr<f32>>>) -> !fir.ref<!fir.box<!fir.ptr<f32>>>
+! CHECK: %[[V_35:[0-9]+]] = fir.load %[[V_34]] : !fir.ref<!fir.box<!fir.ptr<f32>>>
+! CHECK: %[[V_36:[0-9]+]] = fir.convert %[[V_35]] : (!fir.box<!fir.ptr<f32>>) -> !fir.box<none>
+! CHECK: fir.call @_FortranAPushDescriptor(%[[V_29]], %[[V_36]]) : (!fir.llvm_ptr<i8>, !fir.box<none>) -> ()
+! CHECK: }
+
+ end subroutine forallPointerAssignment1
diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst
index 8fba6db..dd9bf8a 100644
--- a/libcxx/docs/FeatureTestMacroTable.rst
+++ b/libcxx/docs/FeatureTestMacroTable.rst
@@ -426,6 +426,10 @@ Status
---------------------------------------------------------- -----------------
``__cpp_lib_constexpr_algorithms`` ``202306L``
---------------------------------------------------------- -----------------
+ ``__cpp_lib_constexpr_flat_map`` ``202502L``
+ ---------------------------------------------------------- -----------------
+ ``__cpp_lib_constexpr_flat_set`` ``202502L``
+ ---------------------------------------------------------- -----------------
``__cpp_lib_constexpr_forward_list`` ``202502L``
---------------------------------------------------------- -----------------
``__cpp_lib_constexpr_list`` ``202502L``
diff --git a/libcxx/include/__flat_set/flat_multiset.h b/libcxx/include/__flat_set/flat_multiset.h
index 7be0b2d..0f6bae5 100644
--- a/libcxx/include/__flat_set/flat_multiset.h
+++ b/libcxx/include/__flat_set/flat_multiset.h
@@ -95,16 +95,16 @@ public:
public:
// [flat.multiset.cons], constructors
- _LIBCPP_HIDE_FROM_ABI flat_multiset() noexcept(is_nothrow_default_constructible_v<_KeyContainer> &&
- is_nothrow_default_constructible_v<_Compare>)
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset() noexcept(
+ is_nothrow_default_constructible_v<_KeyContainer> && is_nothrow_default_constructible_v<_Compare>)
: __keys_(), __compare_() {}
- _LIBCPP_HIDE_FROM_ABI flat_multiset(const flat_multiset&) = default;
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(const flat_multiset&) = default;
// The copy/move constructors are not specified in the spec, which means they should be defaulted.
// However, the move constructor can potentially leave a moved-from object in an inconsistent
// state if an exception is thrown.
- _LIBCPP_HIDE_FROM_ABI flat_multiset(flat_multiset&& __other) noexcept(
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(flat_multiset&& __other) noexcept(
is_nothrow_move_constructible_v<_KeyContainer> && is_nothrow_move_constructible_v<_Compare>)
# if _LIBCPP_HAS_EXCEPTIONS
try
@@ -121,14 +121,16 @@ public:
# endif // _LIBCPP_HAS_EXCEPTIONS
}
- _LIBCPP_HIDE_FROM_ABI explicit flat_multiset(const key_compare& __comp) : __keys_(), __compare_(__comp) {}
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 explicit flat_multiset(const key_compare& __comp)
+ : __keys_(), __compare_(__comp) {}
- _LIBCPP_HIDE_FROM_ABI explicit flat_multiset(container_type __keys, const key_compare& __comp = key_compare())
+ _LIBCPP_HIDE_FROM_ABI
+ _LIBCPP_CONSTEXPR_SINCE_CXX26 explicit flat_multiset(container_type __keys, const key_compare& __comp = key_compare())
: __keys_(std::move(__keys)), __compare_(__comp) {
ranges::sort(__keys_, __compare_);
}
- _LIBCPP_HIDE_FROM_ABI
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
flat_multiset(sorted_equivalent_t, container_type __keys, const key_compare& __comp = key_compare())
: __keys_(std::move(__keys)), __compare_(__comp) {
_LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, __compare_), "Key container is not sorted");
@@ -136,7 +138,7 @@ public:
template <class _InputIterator>
requires __has_input_iterator_category<_InputIterator>::value
- _LIBCPP_HIDE_FROM_ABI
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
flat_multiset(_InputIterator __first, _InputIterator __last, const key_compare& __comp = key_compare())
: __keys_(), __compare_(__comp) {
insert(__first, __last);
@@ -144,48 +146,53 @@ public:
template <class _InputIterator>
requires __has_input_iterator_category<_InputIterator>::value
- _LIBCPP_HIDE_FROM_ABI flat_multiset(
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(
sorted_equivalent_t, _InputIterator __first, _InputIterator __last, const key_compare& __comp = key_compare())
: __keys_(__first, __last), __compare_(__comp) {
_LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, __compare_), "Key container is not sorted");
}
template <_ContainerCompatibleRange<value_type> _Range>
- _LIBCPP_HIDE_FROM_ABI flat_multiset(from_range_t __fr, _Range&& __rg)
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(from_range_t __fr, _Range&& __rg)
: flat_multiset(__fr, std::forward<_Range>(__rg), key_compare()) {}
template <_ContainerCompatibleRange<value_type> _Range>
- _LIBCPP_HIDE_FROM_ABI flat_multiset(from_range_t, _Range&& __rg, const key_compare& __comp) : flat_multiset(__comp) {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
+ flat_multiset(from_range_t, _Range&& __rg, const key_compare& __comp)
+ : flat_multiset(__comp) {
insert_range(std::forward<_Range>(__rg));
}
- _LIBCPP_HIDE_FROM_ABI flat_multiset(initializer_list<value_type> __il, const key_compare& __comp = key_compare())
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
+ flat_multiset(initializer_list<value_type> __il, const key_compare& __comp = key_compare())
: flat_multiset(__il.begin(), __il.end(), __comp) {}
- _LIBCPP_HIDE_FROM_ABI
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
flat_multiset(sorted_equivalent_t, initializer_list<value_type> __il, const key_compare& __comp = key_compare())
: flat_multiset(sorted_equivalent, __il.begin(), __il.end(), __comp) {}
template <class _Allocator>
requires uses_allocator<container_type, _Allocator>::value
- _LIBCPP_HIDE_FROM_ABI explicit flat_multiset(const _Allocator& __alloc)
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 explicit flat_multiset(const _Allocator& __alloc)
: __keys_(std::make_obj_using_allocator<container_type>(__alloc)), __compare_() {}
template <class _Allocator>
requires uses_allocator<container_type, _Allocator>::value
- _LIBCPP_HIDE_FROM_ABI flat_multiset(const key_compare& __comp, const _Allocator& __alloc)
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
+ flat_multiset(const key_compare& __comp, const _Allocator& __alloc)
: __keys_(std::make_obj_using_allocator<container_type>(__alloc)), __compare_(__comp) {}
template <class _Allocator>
requires uses_allocator<container_type, _Allocator>::value
- _LIBCPP_HIDE_FROM_ABI flat_multiset(const container_type& __keys, const _Allocator& __alloc)
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
+ flat_multiset(const container_type& __keys, const _Allocator& __alloc)
: __keys_(std::make_obj_using_allocator<container_type>(__alloc, __keys)), __compare_() {
ranges::sort(__keys_, __compare_);
}
template <class _Allocator>
requires uses_allocator<container_type, _Allocator>::value
- _LIBCPP_HIDE_FROM_ABI
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
flat_multiset(const container_type& __keys, const key_compare& __comp, const _Allocator& __alloc)
: __keys_(std::make_obj_using_allocator<container_type>(__alloc, __keys)), __compare_(__comp) {
ranges::sort(__keys_, __compare_);
@@ -193,14 +200,15 @@ public:
template <class _Allocator>
requires uses_allocator<container_type, _Allocator>::value
- _LIBCPP_HIDE_FROM_ABI flat_multiset(sorted_equivalent_t, const container_type& __keys, const _Allocator& __alloc)
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
+ flat_multiset(sorted_equivalent_t, const container_type& __keys, const _Allocator& __alloc)
: __keys_(std::make_obj_using_allocator<container_type>(__alloc, __keys)), __compare_() {
_LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, __compare_), "Key container is not sorted");
}
template <class _Allocator>
requires uses_allocator<container_type, _Allocator>::value
- _LIBCPP_HIDE_FROM_ABI
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
flat_multiset(sorted_equivalent_t, const container_type& __keys, const key_compare& __comp, const _Allocator& __alloc)
: __keys_(std::make_obj_using_allocator<container_type>(__alloc, __keys)), __compare_(__comp) {
_LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, __compare_), "Key container is not sorted");
@@ -208,13 +216,14 @@ public:
template <class _Allocator>
requires uses_allocator<container_type, _Allocator>::value
- _LIBCPP_HIDE_FROM_ABI flat_multiset(const flat_multiset& __other, const _Allocator& __alloc)
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
+ flat_multiset(const flat_multiset& __other, const _Allocator& __alloc)
: __keys_(std::make_obj_using_allocator<container_type>(__alloc, __other.__keys_)),
__compare_(__other.__compare_) {}
template <class _Allocator>
requires uses_allocator<container_type, _Allocator>::value
- _LIBCPP_HIDE_FROM_ABI flat_multiset(flat_multiset&& __other, const _Allocator& __alloc)
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(flat_multiset&& __other, const _Allocator& __alloc)
# if _LIBCPP_HAS_EXCEPTIONS
try
# endif // _LIBCPP_HAS_EXCEPTIONS
@@ -230,14 +239,15 @@ public:
template <class _InputIterator, class _Allocator>
requires(__has_input_iterator_category<_InputIterator>::value && uses_allocator<container_type, _Allocator>::value)
- _LIBCPP_HIDE_FROM_ABI flat_multiset(_InputIterator __first, _InputIterator __last, const _Allocator& __alloc)
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
+ flat_multiset(_InputIterator __first, _InputIterator __last, const _Allocator& __alloc)
: __keys_(std::make_obj_using_allocator<container_type>(__alloc)), __compare_() {
insert(__first, __last);
}
template <class _InputIterator, class _Allocator>
requires(__has_input_iterator_category<_InputIterator>::value && uses_allocator<container_type, _Allocator>::value)
- _LIBCPP_HIDE_FROM_ABI
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
flat_multiset(_InputIterator __first, _InputIterator __last, const key_compare& __comp, const _Allocator& __alloc)
: __keys_(std::make_obj_using_allocator<container_type>(__alloc)), __compare_(__comp) {
insert(__first, __last);
@@ -245,7 +255,7 @@ public:
template <class _InputIterator, class _Allocator>
requires(__has_input_iterator_category<_InputIterator>::value && uses_allocator<container_type, _Allocator>::value)
- _LIBCPP_HIDE_FROM_ABI
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
flat_multiset(sorted_equivalent_t, _InputIterator __first, _InputIterator __last, const _Allocator& __alloc)
: __keys_(std::make_obj_using_allocator<container_type>(__alloc, __first, __last)), __compare_() {
_LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, __compare_), "Key container is not sorted");
@@ -253,53 +263,57 @@ public:
template <class _InputIterator, class _Allocator>
requires(__has_input_iterator_category<_InputIterator>::value && uses_allocator<container_type, _Allocator>::value)
- _LIBCPP_HIDE_FROM_ABI
- flat_multiset(sorted_equivalent_t,
- _InputIterator __first,
- _InputIterator __last,
- const key_compare& __comp,
- const _Allocator& __alloc)
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(
+ sorted_equivalent_t,
+ _InputIterator __first,
+ _InputIterator __last,
+ const key_compare& __comp,
+ const _Allocator& __alloc)
: __keys_(std::make_obj_using_allocator<container_type>(__alloc, __first, __last)), __compare_(__comp) {
_LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, __compare_), "Key container is not sorted");
}
template <_ContainerCompatibleRange<value_type> _Range, class _Allocator>
requires uses_allocator<container_type, _Allocator>::value
- _LIBCPP_HIDE_FROM_ABI flat_multiset(from_range_t, _Range&& __rg, const _Allocator& __alloc)
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
+ flat_multiset(from_range_t, _Range&& __rg, const _Allocator& __alloc)
: __keys_(std::make_obj_using_allocator<container_type>(__alloc)), __compare_() {
insert_range(std::forward<_Range>(__rg));
}
template <_ContainerCompatibleRange<value_type> _Range, class _Allocator>
requires uses_allocator<container_type, _Allocator>::value
- _LIBCPP_HIDE_FROM_ABI flat_multiset(from_range_t, _Range&& __rg, const key_compare& __comp, const _Allocator& __alloc)
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
+ flat_multiset(from_range_t, _Range&& __rg, const key_compare& __comp, const _Allocator& __alloc)
: __keys_(std::make_obj_using_allocator<container_type>(__alloc)), __compare_(__comp) {
insert_range(std::forward<_Range>(__rg));
}
template <class _Allocator>
requires uses_allocator<container_type, _Allocator>::value
- _LIBCPP_HIDE_FROM_ABI flat_multiset(initializer_list<value_type> __il, const _Allocator& __alloc)
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
+ flat_multiset(initializer_list<value_type> __il, const _Allocator& __alloc)
: flat_multiset(__il.begin(), __il.end(), __alloc) {}
template <class _Allocator>
requires uses_allocator<container_type, _Allocator>::value
- _LIBCPP_HIDE_FROM_ABI
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
flat_multiset(initializer_list<value_type> __il, const key_compare& __comp, const _Allocator& __alloc)
: flat_multiset(__il.begin(), __il.end(), __comp, __alloc) {}
template <class _Allocator>
requires uses_allocator<container_type, _Allocator>::value
- _LIBCPP_HIDE_FROM_ABI flat_multiset(sorted_equivalent_t, initializer_list<value_type> __il, const _Allocator& __alloc)
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26
+ flat_multiset(sorted_equivalent_t, initializer_list<value_type> __il, const _Allocator& __alloc)
: flat_multiset(sorted_equivalent, __il.begin(), __il.end(), __alloc) {}
template <class _Allocator>
requires uses_allocator<container_type, _Allocator>::value
- _LIBCPP_HIDE_FROM_ABI flat_multiset(
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset(
sorted_equivalent_t, initializer_list<value_type> __il, const key_compare& __comp, const _Allocator& __alloc)
: flat_multiset(sorted_equivalent, __il.begin(), __il.end(), __comp, __alloc) {}
- _LIBCPP_HIDE_FROM_ABI flat_multiset& operator=(initializer_list<value_type> __il) {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset& operator=(initializer_list<value_type> __il) {
clear();
insert(__il);
return *this;
@@ -308,9 +322,9 @@ public:
// copy/move assignment are not specified in the spec (defaulted)
// but move assignment can potentially leave moved from object in an inconsistent
// state if an exception is thrown
- _LIBCPP_HIDE_FROM_ABI flat_multiset& operator=(const flat_multiset&) = default;
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset& operator=(const flat_multiset&) = default;
- _LIBCPP_HIDE_FROM_ABI flat_multiset& operator=(flat_multiset&& __other) noexcept(
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multiset& operator=(flat_multiset&& __other) noexcept(
is_nothrow_move_assignable_v<_KeyContainer> && is_nothrow_move_assignable_v<_Compare>) {
auto __clear_other_guard = std::__make_scope_guard([&]() noexcept { __other.clear() /* noexcept */; });
auto __clear_self_guard = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; });
@@ -321,30 +335,52 @@ public:
}
// iterators
- _LIBCPP_HIDE_FROM_ABI iterator begin() noexcept { return iterator(std::as_const(__keys_).begin()); }
- _LIBCPP_HIDE_FROM_ABI const_iterator begin() const noexcept { return const_iterator(__keys_.begin()); }
- _LIBCPP_HIDE_FROM_ABI iterator end() noexcept { return iterator(std::as_const(__keys_).end()); }
- _LIBCPP_HIDE_FROM_ABI const_iterator end() const noexcept { return const_iterator(__keys_.end()); }
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator begin() noexcept {
+ return iterator(std::as_const(__keys_).begin());
+ }
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator begin() const noexcept {
+ return const_iterator(__keys_.begin());
+ }
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator end() noexcept {
+ return iterator(std::as_const(__keys_).end());
+ }
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator end() const noexcept {
+ return const_iterator(__keys_.end());
+ }
- _LIBCPP_HIDE_FROM_ABI reverse_iterator rbegin() noexcept { return reverse_iterator(end()); }
- _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rbegin() const noexcept { return const_reverse_iterator(end()); }
- _LIBCPP_HIDE_FROM_ABI reverse_iterator rend() noexcept { return reverse_iterator(begin()); }
- _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rend() const noexcept { return const_reverse_iterator(begin()); }
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 reverse_iterator rbegin() noexcept {
+ return reverse_iterator(end());
+ }
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator rbegin() const noexcept {
+ return const_reverse_iterator(end());
+ }
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 reverse_iterator rend() noexcept {
+ return reverse_iterator(begin());
+ }
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator rend() const noexcept {
+ return const_reverse_iterator(begin());
+ }
- _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const noexcept { return begin(); }
- _LIBCPP_HIDE_FROM_ABI const_iterator cend() const noexcept { return end(); }
- _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crbegin() const noexcept { return const_reverse_iterator(end()); }
- _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crend() const noexcept { return const_reverse_iterator(begin()); }
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator cbegin() const noexcept { return begin(); }
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator cend() const noexcept { return end(); }
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator crbegin() const noexcept {
+ return const_reverse_iterator(end());
+ }
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator crend() const noexcept {
+ return const_reverse_iterator(begin());
+ }
// capacity
- [[nodiscard]] _LIBCPP_HIDE_FROM_ABI bool empty() const noexcept { return __keys_.empty(); }
- _LIBCPP_HIDE_FROM_ABI size_type size() const noexcept { return __keys_.size(); }
- _LIBCPP_HIDE_FROM_ABI size_type max_size() const noexcept { return __keys_.max_size(); }
+ [[nodiscard]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool empty() const noexcept {
+ return __keys_.empty();
+ }
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type size() const noexcept { return __keys_.size(); }
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type max_size() const noexcept { return __keys_.max_size(); }
// [flat.multiset.modifiers], modifiers
template <class... _Args>
requires is_constructible_v<value_type, _Args...>
- _LIBCPP_HIDE_FROM_ABI iterator emplace(_Args&&... __args) {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator emplace(_Args&&... __args) {
if constexpr (sizeof...(__args) == 1 && (is_same_v<remove_cvref_t<_Args>, _Key> && ...)) {
return __emplace(std::forward<_Args>(__args)...);
} else {
@@ -354,7 +390,7 @@ public:
template <class... _Args>
requires is_constructible_v<value_type, _Args...>
- _LIBCPP_HIDE_FROM_ABI iterator emplace_hint(const_iterator __hint, _Args&&... __args) {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator emplace_hint(const_iterator __hint, _Args&&... __args) {
if constexpr (sizeof...(__args) == 1 && (is_same_v<remove_cvref_t<_Args>, _Key> && ...)) {
return __emplace_hint(std::move(__hint), std::forward<_Args>(__args)...);
} else {
@@ -362,21 +398,23 @@ public:
}
}
- _LIBCPP_HIDE_FROM_ABI iterator insert(const value_type& __x) { return emplace(__x); }
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(const value_type& __x) { return emplace(__x); }
- _LIBCPP_HIDE_FROM_ABI iterator insert(value_type&& __x) { return emplace(std::move(__x)); }
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(value_type&& __x) {
+ return emplace(std::move(__x));
+ }
- _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __hint, const value_type& __x) {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(const_iterator __hint, const value_type& __x) {
return emplace_hint(__hint, __x);
}
- _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __hint, value_type&& __x) {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(const_iterator __hint, value_type&& __x) {
return emplace_hint(__hint, std::move(__x));
}
template <class _InputIterator>
requires __has_input_iterator_category<_InputIterator>::value
- _LIBCPP_HIDE_FROM_ABI void insert(_InputIterator __first, _InputIterator __last) {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void insert(_InputIterator __first, _InputIterator __last) {
if constexpr (sized_sentinel_for<_InputIterator, _InputIterator>) {
__reserve(__last - __first);
}
@@ -385,7 +423,8 @@ public:
template <class _InputIterator>
requires __has_input_iterator_category<_InputIterator>::value
- _LIBCPP_HIDE_FROM_ABI void insert(sorted_equivalent_t, _InputIterator __first, _InputIterator __last) {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void
+ insert(sorted_equivalent_t, _InputIterator __first, _InputIterator __last) {
if constexpr (sized_sentinel_for<_InputIterator, _InputIterator>) {
__reserve(__last - __first);
}
@@ -394,7 +433,7 @@ public:
}
template <_ContainerCompatibleRange<value_type> _Range>
- _LIBCPP_HIDE_FROM_ABI void insert_range(_Range&& __range) {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void insert_range(_Range&& __range) {
if constexpr (ranges::sized_range<_Range>) {
__reserve(ranges::size(__range));
}
@@ -402,26 +441,29 @@ public:
__append_sort_merge</*WasSorted = */ false>(std::forward<_Range>(__range));
}
- _LIBCPP_HIDE_FROM_ABI void insert(initializer_list<value_type> __il) { insert(__il.begin(), __il.end()); }
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void insert(initializer_list<value_type> __il) {
+ insert(__il.begin(), __il.end());
+ }
- _LIBCPP_HIDE_FROM_ABI void insert(sorted_equivalent_t, initializer_list<value_type> __il) {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void
+ insert(sorted_equivalent_t, initializer_list<value_type> __il) {
insert(sorted_equivalent, __il.begin(), __il.end());
}
- _LIBCPP_HIDE_FROM_ABI container_type extract() && {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 container_type extract() && {
auto __guard = std::__make_scope_guard([&]() noexcept { clear() /* noexcept */; });
auto __ret = std::move(__keys_);
return __ret;
}
- _LIBCPP_HIDE_FROM_ABI void replace(container_type&& __keys) {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void replace(container_type&& __keys) {
_LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys, __compare_), "Key container is not sorted");
auto __guard = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; });
__keys_ = std::move(__keys);
__guard.__complete();
}
- _LIBCPP_HIDE_FROM_ABI iterator erase(iterator __position) {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator erase(iterator __position) {
auto __on_failure = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; });
auto __key_iter = __keys_.erase(__position.__base());
__on_failure.__complete();
@@ -431,7 +473,7 @@ public:
// The following overload is the same as the iterator overload
// iterator erase(const_iterator __position);
- _LIBCPP_HIDE_FROM_ABI size_type erase(const key_type& __x) {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type erase(const key_type& __x) {
auto [__first, __last] = equal_range(__x);
auto __res = __last - __first;
erase(__first, __last);
@@ -441,21 +483,21 @@ public:
template <class _Kp>
requires(__is_transparent_v<_Compare> && !is_convertible_v<_Kp &&, iterator> &&
!is_convertible_v<_Kp &&, const_iterator>)
- _LIBCPP_HIDE_FROM_ABI size_type erase(_Kp&& __x) {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type erase(_Kp&& __x) {
auto [__first, __last] = equal_range(__x);
auto __res = __last - __first;
erase(__first, __last);
return __res;
}
- _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __first, const_iterator __last) {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator erase(const_iterator __first, const_iterator __last) {
auto __on_failure = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; });
auto __key_it = __keys_.erase(__first.__base(), __last.__base());
__on_failure.__complete();
return iterator(std::move(__key_it));
}
- _LIBCPP_HIDE_FROM_ABI void swap(flat_multiset& __y) noexcept {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void swap(flat_multiset& __y) noexcept {
// warning: The spec has unconditional noexcept, which means that
// if any of the following functions throw an exception,
// std::terminate will be called
@@ -464,126 +506,139 @@ public:
ranges::swap(__keys_, __y.__keys_);
}
- _LIBCPP_HIDE_FROM_ABI void clear() noexcept { __keys_.clear(); }
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void clear() noexcept { __keys_.clear(); }
// observers
- _LIBCPP_HIDE_FROM_ABI key_compare key_comp() const { return __compare_; }
- _LIBCPP_HIDE_FROM_ABI value_compare value_comp() const { return __compare_; }
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 key_compare key_comp() const { return __compare_; }
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 value_compare value_comp() const { return __compare_; }
// map operations
- _LIBCPP_HIDE_FROM_ABI iterator find(const key_type& __x) { return __find_impl(*this, __x); }
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator find(const key_type& __x) {
+ return __find_impl(*this, __x);
+ }
- _LIBCPP_HIDE_FROM_ABI const_iterator find(const key_type& __x) const { return __find_impl(*this, __x); }
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator find(const key_type& __x) const {
+ return __find_impl(*this, __x);
+ }
template <class _Kp>
requires __is_transparent_v<_Compare>
- _LIBCPP_HIDE_FROM_ABI iterator find(const _Kp& __x) {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator find(const _Kp& __x) {
return __find_impl(*this, __x);
}
template <class _Kp>
requires __is_transparent_v<_Compare>
- _LIBCPP_HIDE_FROM_ABI const_iterator find(const _Kp& __x) const {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator find(const _Kp& __x) const {
return __find_impl(*this, __x);
}
- _LIBCPP_HIDE_FROM_ABI size_type count(const key_type& __x) const {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type count(const key_type& __x) const {
auto [__first, __last] = equal_range(__x);
return __last - __first;
}
template <class _Kp>
requires __is_transparent_v<_Compare>
- _LIBCPP_HIDE_FROM_ABI size_type count(const _Kp& __x) const {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type count(const _Kp& __x) const {
auto [__first, __last] = equal_range(__x);
return __last - __first;
}
- _LIBCPP_HIDE_FROM_ABI bool contains(const key_type& __x) const { return find(__x) != end(); }
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool contains(const key_type& __x) const {
+ return find(__x) != end();
+ }
template <class _Kp>
requires __is_transparent_v<_Compare>
- _LIBCPP_HIDE_FROM_ABI bool contains(const _Kp& __x) const {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool contains(const _Kp& __x) const {
return find(__x) != end();
}
- _LIBCPP_HIDE_FROM_ABI iterator lower_bound(const key_type& __x) {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator lower_bound(const key_type& __x) {
const auto& __keys = __keys_;
return iterator(std::lower_bound(__keys.begin(), __keys.end(), __x, __compare_));
}
- _LIBCPP_HIDE_FROM_ABI const_iterator lower_bound(const key_type& __x) const {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator lower_bound(const key_type& __x) const {
return const_iterator(std::lower_bound(__keys_.begin(), __keys_.end(), __x, __compare_));
}
template <class _Kp>
requires __is_transparent_v<_Compare>
- _LIBCPP_HIDE_FROM_ABI iterator lower_bound(const _Kp& __x) {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator lower_bound(const _Kp& __x) {
const auto& __keys = __keys_;
return iterator(std::lower_bound(__keys.begin(), __keys.end(), __x, __compare_));
}
template <class _Kp>
requires __is_transparent_v<_Compare>
- _LIBCPP_HIDE_FROM_ABI const_iterator lower_bound(const _Kp& __x) const {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator lower_bound(const _Kp& __x) const {
return const_iterator(std::lower_bound(__keys_.begin(), __keys_.end(), __x, __compare_));
}
- _LIBCPP_HIDE_FROM_ABI iterator upper_bound(const key_type& __x) {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator upper_bound(const key_type& __x) {
const auto& __keys = __keys_;
return iterator(std::upper_bound(__keys.begin(), __keys.end(), __x, __compare_));
}
- _LIBCPP_HIDE_FROM_ABI const_iterator upper_bound(const key_type& __x) const {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator upper_bound(const key_type& __x) const {
return const_iterator(std::upper_bound(__keys_.begin(), __keys_.end(), __x, __compare_));
}
template <class _Kp>
requires __is_transparent_v<_Compare>
- _LIBCPP_HIDE_FROM_ABI iterator upper_bound(const _Kp& __x) {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator upper_bound(const _Kp& __x) {
const auto& __keys = __keys_;
return iterator(std::upper_bound(__keys.begin(), __keys.end(), __x, __compare_));
}
template <class _Kp>
requires __is_transparent_v<_Compare>
- _LIBCPP_HIDE_FROM_ABI const_iterator upper_bound(const _Kp& __x) const {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator upper_bound(const _Kp& __x) const {
return const_iterator(std::upper_bound(__keys_.begin(), __keys_.end(), __x, __compare_));
}
- _LIBCPP_HIDE_FROM_ABI pair<iterator, iterator> equal_range(const key_type& __x) {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair<iterator, iterator> equal_range(const key_type& __x) {
return __equal_range_impl(*this, __x);
}
- _LIBCPP_HIDE_FROM_ABI pair<const_iterator, const_iterator> equal_range(const key_type& __x) const {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair<const_iterator, const_iterator>
+ equal_range(const key_type& __x) const {
return __equal_range_impl(*this, __x);
}
template <class _Kp>
requires __is_transparent_v<_Compare>
- _LIBCPP_HIDE_FROM_ABI pair<iterator, iterator> equal_range(const _Kp& __x) {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair<iterator, iterator> equal_range(const _Kp& __x) {
return __equal_range_impl(*this, __x);
}
template <class _Kp>
requires __is_transparent_v<_Compare>
- _LIBCPP_HIDE_FROM_ABI pair<const_iterator, const_iterator> equal_range(const _Kp& __x) const {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair<const_iterator, const_iterator>
+ equal_range(const _Kp& __x) const {
return __equal_range_impl(*this, __x);
}
- friend _LIBCPP_HIDE_FROM_ABI bool operator==(const flat_multiset& __x, const flat_multiset& __y) {
+ friend _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool
+ operator==(const flat_multiset& __x, const flat_multiset& __y) {
return ranges::equal(__x, __y);
}
- friend _LIBCPP_HIDE_FROM_ABI auto operator<=>(const flat_multiset& __x, const flat_multiset& __y) {
+ friend _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 auto
+ operator<=>(const flat_multiset& __x, const flat_multiset& __y) {
return std::lexicographical_compare_three_way(
__x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way);
}
- friend _LIBCPP_HIDE_FROM_ABI void swap(flat_multiset& __x, flat_multiset& __y) noexcept { __x.swap(__y); }
+ friend _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void
+ swap(flat_multiset& __x, flat_multiset& __y) noexcept {
+ __x.swap(__y);
+ }
private:
template <bool _WasSorted, class... _Args>
- _LIBCPP_HIDE_FROM_ABI void __append_sort_merge(_Args&&... __args) {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void __append_sort_merge(_Args&&... __args) {
auto __on_failure = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; });
size_type __old_size = size();
__flat_set_utils::__append(*this, std::forward<_Args>(__args)...);
@@ -598,13 +653,13 @@ private:
}
template <class _Kp>
- _LIBCPP_HIDE_FROM_ABI iterator __emplace(_Kp&& __key) {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator __emplace(_Kp&& __key) {
auto __it = upper_bound(__key);
return __flat_set_utils::__emplace_exact_pos(*this, __it, std::forward<_Kp>(__key));
}
template <class _Kp>
- _LIBCPP_HIDE_FROM_ABI iterator __emplace_hint(const_iterator __hint, _Kp&& __key) {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator __emplace_hint(const_iterator __hint, _Kp&& __key) {
auto __prev_larger = __hint != cbegin() && __compare_(__key, *std::prev(__hint));
auto __next_smaller = __hint != cend() && __compare_(*__hint, __key);
@@ -636,7 +691,7 @@ private:
}
template <class _Self, class _Kp>
- _LIBCPP_HIDE_FROM_ABI static auto __find_impl(_Self&& __self, const _Kp& __key) {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 static auto __find_impl(_Self&& __self, const _Kp& __key) {
auto __it = __self.lower_bound(__key);
auto __last = __self.end();
if (__it == __last || __self.__compare_(__key, *__it)) {
@@ -646,29 +701,30 @@ private:
}
template <class _Self, class _Kp>
- _LIBCPP_HIDE_FROM_ABI static auto __equal_range_impl(_Self&& __self, const _Kp& __key) {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 static auto __equal_range_impl(_Self&& __self, const _Kp& __key) {
using __iter = _If<is_const_v<__libcpp_remove_reference_t<_Self>>, const_iterator, iterator>;
auto [__key_first, __key_last] =
std::equal_range(__self.__keys_.begin(), __self.__keys_.end(), __key, __self.__compare_);
return std::make_pair(__iter(__key_first), __iter(__key_last));
}
- _LIBCPP_HIDE_FROM_ABI void __reserve(size_t __size) {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void __reserve(size_t __size) {
if constexpr (__container_traits<_KeyContainer>::__reservable) {
__keys_.reserve(__size);
}
}
template <class _Key2, class _Compare2, class _KeyContainer2, class _Predicate>
- friend typename flat_multiset<_Key2, _Compare2, _KeyContainer2>::size_type
+ friend typename flat_multiset<_Key2, _Compare2, _KeyContainer2>::size_type _LIBCPP_CONSTEXPR_SINCE_CXX26
erase_if(flat_multiset<_Key2, _Compare2, _KeyContainer2>&, _Predicate);
_KeyContainer __keys_;
_LIBCPP_NO_UNIQUE_ADDRESS key_compare __compare_;
struct __key_equiv {
- _LIBCPP_HIDE_FROM_ABI __key_equiv(key_compare __c) : __comp_(__c) {}
- _LIBCPP_HIDE_FROM_ABI bool operator()(const_reference __x, const_reference __y) const {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 __key_equiv(key_compare __c) : __comp_(__c) {}
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool
+ operator()(const_reference __x, const_reference __y) const {
return !__comp_(std::get<0>(__x), std::get<0>(__y)) && !__comp_(std::get<0>(__y), std::get<0>(__x));
}
key_compare __comp_;
@@ -757,7 +813,7 @@ struct uses_allocator<flat_multiset<_Key, _Compare, _KeyContainer>, _Allocator>
: bool_constant<uses_allocator_v<_KeyContainer, _Allocator> > {};
template <class _Key, class _Compare, class _KeyContainer, class _Predicate>
-_LIBCPP_HIDE_FROM_ABI typename flat_multiset<_Key, _Compare, _KeyContainer>::size_type
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 typename flat_multiset<_Key, _Compare, _KeyContainer>::size_type
erase_if(flat_multiset<_Key, _Compare, _KeyContainer>& __flat_multiset, _Predicate __pred) {
auto __guard = std::__make_exception_guard([&] { __flat_multiset.clear(); });
auto __it =
diff --git a/libcxx/include/version b/libcxx/include/version
index 0fef1bb..b41cc9e 100644
--- a/libcxx/include/version
+++ b/libcxx/include/version
@@ -71,6 +71,8 @@ __cpp_lib_constexpr_charconv 202207L <charconv>
__cpp_lib_constexpr_cmath 202202L <cmath> <cstdlib>
__cpp_lib_constexpr_complex 201711L <complex>
__cpp_lib_constexpr_dynamic_alloc 201907L <memory>
+__cpp_lib_constexpr_flat_map 202502L <flat_map>
+__cpp_lib_constexpr_flat_set 202502L <flat_set>
__cpp_lib_constexpr_forward_list 202502L <forward_list>
__cpp_lib_constexpr_functional 201907L <functional>
__cpp_lib_constexpr_iterator 201811L <iterator>
@@ -552,6 +554,8 @@ __cpp_lib_void_t 201411L <type_traits>
# define __cpp_lib_bitset 202306L
# undef __cpp_lib_constexpr_algorithms
# define __cpp_lib_constexpr_algorithms 202306L
+# define __cpp_lib_constexpr_flat_map 202502L
+# define __cpp_lib_constexpr_flat_set 202502L
# define __cpp_lib_constexpr_forward_list 202502L
# define __cpp_lib_constexpr_list 202502L
# if !defined(_LIBCPP_ABI_VCRUNTIME)
diff --git a/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert.temporary.pass.cpp b/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert.temporary.pass.cpp
index 248f282..acd20ce 100644
--- a/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert.temporary.pass.cpp
+++ b/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert.temporary.pass.cpp
@@ -21,7 +21,7 @@
#include "../flat_helpers.h"
#include "test_macros.h"
-bool test() {
+constexpr bool test() {
using M = std::flat_multiset<TrackCopyMove>;
{
M m;
@@ -43,6 +43,9 @@ bool test() {
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert_range.pass.cpp b/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert_range.pass.cpp
index 57a581c..c2fcd86 100644
--- a/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert_range.pass.cpp
+++ b/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert_range.pass.cpp
@@ -20,27 +20,36 @@
#include <cassert>
#include <flat_set>
#include <ranges>
-#include <sstream>
#include <vector>
#include "../flat_helpers.h"
+#include "test_iterators.h"
#include "test_macros.h"
-void test() {
+constexpr bool test() {
NotQuiteSequenceContainer<int> v;
std::flat_multiset s(v);
- std::istringstream ints("0 1 1 0");
- auto r = std::ranges::subrange(std::istream_iterator<int>(ints), std::istream_iterator<int>()) |
- std::views::transform([](int i) { return i * i; });
+
+ int ar[] = {0, 1, 1, 0};
+ using Iter = cpp20_input_iterator<const int*>;
+ using Sent = sentinel_wrapper<Iter>;
+ using R = std::ranges::subrange<Iter, Sent>;
+ auto r = R(Iter(ar), Sent(Iter(ar + 4)));
+
static_assert(
![](auto& t) { return requires { t.insert_range(t.end(), r); }; }(v),
"This test is to test the case where the underlying container does not provide insert_range");
s.insert_range(r);
assert(std::ranges::equal(s, std::vector<int>{0, 0, 1, 1}));
+
+ return true;
}
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/empty.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/empty.pass.cpp
index 52f7743..88a76d3 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/empty.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/empty.pass.cpp
@@ -24,7 +24,7 @@
#include "min_allocator.h"
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using Key = typename KeyContainer::value_type;
using M = std::flat_multiset<Key, std::less<int>, KeyContainer>;
M m;
@@ -38,15 +38,23 @@ void test_one() {
assert(m.empty());
}
-void test() {
+constexpr bool test() {
test_one<std::vector<int>>();
- test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<int>>();
test_one<MinSequenceContainer<int>>();
test_one<std::vector<int, min_allocator<int>>>();
+
+ return true;
}
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/max_size.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/max_size.pass.cpp
index 4e3d141..fb9c38f 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/max_size.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/max_size.pass.cpp
@@ -24,7 +24,7 @@
#include "test_allocator.h"
#include "test_macros.h"
-void test() {
+constexpr bool test() {
{
using A1 = limited_allocator<int, 10>;
using C = std::flat_multiset<int, std::less<int>, std::vector<int, A1>>;
@@ -59,10 +59,15 @@ void test() {
assert(c.max_size() <= max_dist);
assert(c.max_size() <= alloc_max_size(std::allocator<char>()));
}
+
+ return true;
}
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/size.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/size.pass.cpp
index 4aff08b..156bb27 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/size.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/size.pass.cpp
@@ -7,6 +7,8 @@
//===----------------------------------------------------------------------===//
// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=200000000
+// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=800000000
// <flat_set>
@@ -23,7 +25,7 @@
#include "min_allocator.h"
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using M = std::flat_multiset<int, std::less<int>, KeyContainer>;
using S = typename M::size_type;
{
@@ -46,7 +48,7 @@ void test_one() {
}
{
M m;
- S s = 500000;
+ S s = 5000;
for (std::size_t i = 0u; i < s; ++i) {
m.emplace(i);
m.emplace(i);
@@ -57,15 +59,23 @@ void test_one() {
}
}
-void test() {
+constexpr bool test() {
test_one<std::vector<int>>();
- test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<int>>();
test_one<MinSequenceContainer<int>>();
test_one<std::vector<int, min_allocator<int>>>();
+
+ return true;
}
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/alloc.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/alloc.pass.cpp
index 4fffcb3..2426fbc 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/alloc.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/alloc.pass.cpp
@@ -14,6 +14,7 @@
// explicit flat_multiset(const Allocator& a);
#include <cassert>
+#include <deque>
#include <flat_set>
#include <functional>
#include <vector>
@@ -22,7 +23,8 @@
#include "test_allocator.h"
#include "../../../test_compare.h"
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
{
// The constructors in this subclause shall not participate in overload
// resolution unless uses_allocator_v<container_type, Alloc> is true
@@ -30,8 +32,8 @@ void test() {
using C = test_less<int>;
using A1 = test_allocator<int>;
using A2 = other_allocator<int>;
- using V1 = std::vector<int, A1>;
- using V2 = std::vector<int, A2>;
+ using V1 = KeyContainer<int, A1>;
+ using V2 = KeyContainer<int, A2>;
using M1 = std::flat_multiset<int, C, V1>;
using M2 = std::flat_multiset<int, C, V2>;
static_assert(std::is_constructible_v<M1, const A1&>);
@@ -40,24 +42,37 @@ void test() {
static_assert(!std::is_constructible_v<M2, const A1&>);
}
{
- // explicit
- using M = std::flat_multiset<int, std::less<int>, std::vector<int, test_allocator<int>>>;
-
- static_assert(std::is_constructible_v<M, test_allocator<int>>);
- static_assert(!std::is_convertible_v<test_allocator<int>, M>);
- }
- {
using A = test_allocator<short>;
- using M = std::flat_multiset<int, std::less<int>, std::vector<int, test_allocator<int>>>;
+ using M = std::flat_multiset<int, std::less<int>, KeyContainer<int, test_allocator<int>>>;
M m(A(0, 5));
assert(m.empty());
assert(m.begin() == m.end());
assert(std::move(m).extract().get_allocator().get_id() == 5);
}
+ {
+ // explicit
+ using M = std::flat_multiset<int, std::less<int>, KeyContainer<int, test_allocator<int>>>;
+
+ static_assert(std::is_constructible_v<M, test_allocator<int>>);
+ static_assert(!std::is_convertible_v<test_allocator<int>, M>);
+ }
+}
+
+constexpr bool test() {
+ test<std::vector>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test<std::deque>();
+
+ return true;
}
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/assign_initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/assign_initializer_list.pass.cpp
index ae81ab0..a895117 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/assign_initializer_list.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/assign_initializer_list.pass.cpp
@@ -26,7 +26,7 @@
#include "test_allocator.h"
template <class KeyContainer>
-void test() {
+constexpr void test() {
using Key = typename KeyContainer::value_type;
using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
{
@@ -53,16 +53,24 @@ void test() {
}
}
-void test() {
+constexpr bool test() {
test<std::vector<int>>();
test<std::vector<double>>();
- test<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test<std::deque<int>>();
test<MinSequenceContainer<int>>();
test<std::vector<int, min_allocator<int>>>();
+
+ return true;
}
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/compare.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/compare.pass.cpp
index 6b68589..43ebea7 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/compare.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/compare.pass.cpp
@@ -20,11 +20,35 @@
#include <type_traits>
#include <vector>
+#include "MinSequenceContainer.h"
+#include "min_allocator.h"
#include "test_macros.h"
#include "../../../test_compare.h"
#include "test_allocator.h"
-void test() {
+template <class KeyContainer>
+constexpr void test_compare() {
+ using Key = typename KeyContainer::value_type;
+ {
+ // The one-argument ctor is explicit.
+ using C = test_less<Key>;
+ static_assert(std::is_constructible_v<std::flat_multiset<Key, C>, C>);
+ static_assert(!std::is_convertible_v<C, std::flat_multiset<Key, C>>);
+
+ static_assert(std::is_constructible_v<std::flat_multiset<Key>, std::less<Key>>);
+ static_assert(!std::is_convertible_v<std::less<Key>, std::flat_multiset<Key>>);
+ }
+ {
+ using C = test_less<Key>;
+ auto m = std::flat_multiset<Key, C>(C(3));
+ assert(m.empty());
+ assert(m.begin() == m.end());
+ assert(m.key_comp() == C(3));
+ }
+}
+
+template <template <class...> class KeyContainer>
+constexpr void test_compare_alloc() {
{
// The constructors in this subclause shall not participate in overload
// resolution unless uses_allocator_v<container_type, Alloc> is true
@@ -32,8 +56,8 @@ void test() {
using C = test_less<int>;
using A1 = test_allocator<int>;
using A2 = other_allocator<int>;
- using V1 = std::vector<int, A1>;
- using V2 = std::vector<int, A2>;
+ using V1 = KeyContainer<int, A1>;
+ using V2 = KeyContainer<int, A2>;
using M1 = std::flat_multiset<int, C, V1>;
using M2 = std::flat_multiset<int, C, V2>;
static_assert(std::is_constructible_v<M1, const C&, const A1&>);
@@ -42,25 +66,9 @@ void test() {
static_assert(!std::is_constructible_v<M2, const C&, const A1&>);
}
{
- using C = test_less<int>;
- auto m = std::flat_multiset<int, C>(C(3));
- assert(m.empty());
- assert(m.begin() == m.end());
- assert(m.key_comp() == C(3));
- }
- {
- // The one-argument ctor is explicit.
- using C = test_less<int>;
- static_assert(std::is_constructible_v<std::flat_multiset<int, C>, C>);
- static_assert(!std::is_convertible_v<C, std::flat_multiset<int, C>>);
-
- static_assert(std::is_constructible_v<std::flat_multiset<int>, std::less<int>>);
- static_assert(!std::is_convertible_v<std::less<int>, std::flat_multiset<int>>);
- }
- {
using C = test_less<int>;
using A1 = test_allocator<int>;
- auto m = std::flat_multiset<int, C, std::vector<int, A1>>(C(4), A1(5));
+ auto m = std::flat_multiset<int, C, KeyContainer<int, A1>>(C(4), A1(5));
assert(m.empty());
assert(m.begin() == m.end());
assert(m.key_comp() == C(4));
@@ -68,9 +76,9 @@ void test() {
}
{
// explicit(false)
- using C = test_less<int>;
- using A1 = test_allocator<int>;
- std::flat_multiset<int, C, std::deque<int, A1>> m = {C(4), A1(5)};
+ using C = test_less<int>;
+ using A1 = test_allocator<int>;
+ std::flat_multiset<int, C, KeyContainer<int, A1>> m = {C(4), A1(5)};
assert(m.empty());
assert(m.begin() == m.end());
assert(m.key_comp() == C(4));
@@ -78,8 +86,29 @@ void test() {
}
}
+constexpr bool test() {
+ test_compare<std::vector<int>>();
+ test_compare<MinSequenceContainer<int>>();
+ test_compare<std::vector<int, min_allocator<int>>>();
+
+ test_compare_alloc<std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ {
+ test_compare<std::deque<int>>();
+ test_compare_alloc<std::deque>();
+ }
+
+ return true;
+}
+
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/containers.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/containers.pass.cpp
index 78eac42..1a47600 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/containers.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/containers.pass.cpp
@@ -35,7 +35,8 @@ void conversion_test(T);
template <class T, class... Args>
concept ImplicitlyConstructible = requires(Args&&... args) { conversion_test<T>({std::forward<Args>(args)...}); };
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
{
// The constructors in this subclause shall not participate in overload
// resolution unless uses_allocator_v<container_type, Alloc> is true
@@ -43,8 +44,8 @@ void test() {
using C = test_less<int>;
using A1 = test_allocator<int>;
using A2 = other_allocator<int>;
- using V1 = std::vector<int, A1>;
- using V2 = std::vector<int, A2>;
+ using V1 = KeyContainer<int, A1>;
+ using V2 = KeyContainer<int, A2>;
using M1 = std::flat_multiset<int, C, V1>;
using M2 = std::flat_multiset<int, C, V2>;
static_assert(std::is_constructible_v<M1, const V1&, const A1&>);
@@ -59,15 +60,15 @@ void test() {
}
{
// flat_multiset(container_type)
- using M = std::flat_multiset<int>;
- std::vector<int> ks = {1, 1, 1, 2, 2, 3, 2, 3, 3};
- auto m = M(ks);
- int expected[] = {1, 1, 1, 2, 2, 2, 3, 3, 3};
+ using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
+ KeyContainer<int> ks = {1, 1, 1, 2, 2, 3, 2, 3, 3};
+ auto m = M(ks);
+ int expected[] = {1, 1, 1, 2, 2, 2, 3, 3, 3};
assert(std::ranges::equal(m, expected));
// explicit(false)
- static_assert(std::is_constructible_v<M, const std::vector<int>&>);
- static_assert(!ImplicitlyConstructible<M, const std::vector<int>&>);
+ static_assert(std::is_constructible_v<M, const KeyContainer<int>&>);
+ static_assert(!ImplicitlyConstructible<M, const KeyContainer<int>&>);
m = M(std::move(ks));
assert(ks.empty()); // it was moved-from
@@ -77,7 +78,7 @@ void test() {
// flat_multiset(container_type)
// move-only
int expected[] = {3, 3, 2, 1};
- using Ks = std::deque<MoveOnly, min_allocator<MoveOnly>>;
+ using Ks = KeyContainer<MoveOnly, min_allocator<MoveOnly>>;
using M = std::flat_multiset<MoveOnly, std::greater<MoveOnly>, Ks>;
Ks ks;
ks.push_back(1);
@@ -92,8 +93,8 @@ void test() {
// flat_multiset(container_type)
// container's allocators are used
using A = test_allocator<int>;
- using M = std::flat_multiset<int, std::less<int>, std::deque<int, A>>;
- auto ks = std::deque<int, A>({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(5));
+ using M = std::flat_multiset<int, std::less<int>, KeyContainer<int, A>>;
+ auto ks = KeyContainer<int, A>({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(5));
auto m = M(std::move(ks));
assert(ks.empty()); // it was moved-from
assert((m == M{1, 1, 1, 2, 2, 2, 3, 3, 3}));
@@ -102,22 +103,22 @@ void test() {
}
{
// flat_multiset(container_type, key_compare)
- using C = test_less<int>;
- using M = std::flat_multiset<int, C>;
- std::vector<int> ks = {1, 1, 1, 2, 2, 3, 2, 3, 3};
- auto m = M(ks, C(4));
+ using C = test_less<int>;
+ using M = std::flat_multiset<int, C, KeyContainer<int>>;
+ KeyContainer<int> ks = {1, 1, 1, 2, 2, 3, 2, 3, 3};
+ auto m = M(ks, C(4));
assert(std::ranges::equal(m, std::vector<int>{1, 1, 1, 2, 2, 2, 3, 3, 3}));
assert(m.key_comp() == C(4));
// explicit
- static_assert(std::is_constructible_v<M, const std::vector<int>&, const C&>);
- static_assert(!ImplicitlyConstructible<M, const std::vector<int>&, const C&>);
+ static_assert(std::is_constructible_v<M, const KeyContainer<int>&, const C&>);
+ static_assert(!ImplicitlyConstructible<M, const KeyContainer<int>&, const C&>);
}
{
// flat_multiset(container_type , const Allocator&)
using A = test_allocator<int>;
- using M = std::flat_multiset<int, std::less<int>, std::deque<int, A>>;
- auto ks = std::deque<int, A>({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(5));
+ using M = std::flat_multiset<int, std::less<int>, KeyContainer<int, A>>;
+ auto ks = KeyContainer<int, A>({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(5));
auto m = M(ks, A(4)); // replaces the allocators
assert(!ks.empty()); // it was an lvalue above
assert((m == M{1, 1, 1, 2, 2, 2, 3, 3, 3}));
@@ -125,7 +126,7 @@ void test() {
assert(keys.get_allocator() == A(4));
// explicit(false)
- static_assert(ImplicitlyConstructible<M, const std::deque<int, A>&, const A&>);
+ static_assert(ImplicitlyConstructible<M, const KeyContainer<int, A>&, const A&>);
M m2 = {ks, A(4)}; // implicit ctor
assert(!ks.empty()); // it was an lvalue above
assert(m2 == m);
@@ -134,19 +135,19 @@ void test() {
}
{
// flat_multiset(container_type , const Allocator&)
- using C = test_less<int>;
- using A = test_allocator<int>;
- using M = std::flat_multiset<int, C, std::vector<int, A>>;
- std::vector<int, A> ks = {1, 1, 1, 2, 2, 3, 2, 3, 3};
- auto m = M(ks, C(4), A(5));
- assert(std::ranges::equal(m, std::vector<int, A>{1, 1, 1, 2, 2, 2, 3, 3, 3}));
+ using C = test_less<int>;
+ using A = test_allocator<int>;
+ using M = std::flat_multiset<int, C, KeyContainer<int, A>>;
+ KeyContainer<int, A> ks = {1, 1, 1, 2, 2, 3, 2, 3, 3};
+ auto m = M(ks, C(4), A(5));
+ assert(std::ranges::equal(m, KeyContainer<int, A>{1, 1, 1, 2, 2, 2, 3, 3, 3}));
assert(m.key_comp() == C(4));
auto m_copy = m;
auto keys = std::move(m_copy).extract();
assert(keys.get_allocator() == A(5));
// explicit(false)
- static_assert(ImplicitlyConstructible<M, const std::vector<int, A>&, const A&>);
+ static_assert(ImplicitlyConstructible<M, const KeyContainer<int, A>&, const A&>);
M m2 = {ks, C(4), A(5)};
assert(m2 == m);
assert(m2.key_comp() == C(4));
@@ -155,8 +156,22 @@ void test() {
}
}
+constexpr bool test() {
+ test<std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test<std::deque>();
+
+ return true;
+}
+
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy.pass.cpp
index b4f7220..55f3def 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy.pass.cpp
@@ -14,6 +14,7 @@
#include <algorithm>
#include <cassert>
+#include <deque>
#include <flat_set>
#include <vector>
@@ -21,10 +22,11 @@
#include "../../../test_compare.h"
#include "test_allocator.h"
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
{
using C = test_less<int>;
- std::vector<int, test_allocator<int>> ks({1, 3, 5, 3, 1}, test_allocator<int>(6));
+ KeyContainer<int, test_allocator<int>> ks({1, 3, 5, 3, 1}, test_allocator<int>(6));
const int expected[] = {1, 1, 3, 3, 5};
using M = std::flat_multiset<int, C, decltype(ks)>;
auto mo = M(ks, C(5));
@@ -43,7 +45,7 @@ void test() {
}
{
using C = test_less<int>;
- using Ks = std::vector<int, other_allocator<int>>;
+ using Ks = KeyContainer<int, other_allocator<int>>;
auto ks = Ks({1, 3, 5, 3, 1}, other_allocator<int>(6));
const int expected[] = {1, 1, 3, 3, 5};
using M = std::flat_multiset<int, C, Ks>;
@@ -63,8 +65,22 @@ void test() {
}
}
+constexpr bool test() {
+ test<std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test<std::deque>();
+
+ return true;
+}
+
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_alloc.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_alloc.pass.cpp
index ec8ad82..ec9f14e 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_alloc.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_alloc.pass.cpp
@@ -23,7 +23,8 @@
#include "../../../test_compare.h"
#include "test_allocator.h"
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
{
// The constructors in this subclause shall not participate in overload
// resolution unless uses_allocator_v<container_type, Alloc> is true.
@@ -31,8 +32,8 @@ void test() {
using C = test_less<int>;
using A1 = test_allocator<int>;
using A2 = other_allocator<int>;
- using V1 = std::vector<int, A1>;
- using V2 = std::vector<int, A2>;
+ using V1 = KeyContainer<int, A1>;
+ using V2 = KeyContainer<int, A2>;
using M1 = std::flat_multiset<int, C, V1>;
using M2 = std::flat_multiset<int, C, V2>;
static_assert(std::is_constructible_v<M1, const M1&, const A1&>);
@@ -42,7 +43,7 @@ void test() {
}
{
using C = test_less<int>;
- std::vector<int, test_allocator<int>> ks({1, 3, 5, 5}, test_allocator<int>(6));
+ KeyContainer<int, test_allocator<int>> ks({1, 3, 5, 5}, test_allocator<int>(6));
using M = std::flat_multiset<int, C, decltype(ks)>;
auto mo = M(ks, C(5));
auto m = M(mo, test_allocator<int>(3));
@@ -59,8 +60,23 @@ void test() {
assert(keys2.get_allocator() == test_allocator<int>(6));
}
}
+
+constexpr bool test() {
+ test<std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test<std::deque>();
+
+ return true;
+}
+
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_assign.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_assign.pass.cpp
index 2b6176a..2e63a00 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_assign.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_assign.pass.cpp
@@ -13,6 +13,7 @@
// flat_multiset& operator=(const flat_multiset& m);
#include <algorithm>
+#include <deque>
#include <flat_set>
#include <functional>
#include <vector>
@@ -22,11 +23,12 @@
#include "../../../test_compare.h"
#include "test_allocator.h"
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
{
// test_allocator is not propagated
using C = test_less<int>;
- std::vector<int, test_allocator<int>> ks({1, 3, 5, 5}, test_allocator<int>(6));
+ KeyContainer<int, test_allocator<int>> ks({1, 3, 5, 5}, test_allocator<int>(6));
using M = std::flat_multiset<int, C, decltype(ks)>;
auto mo = M(ks, C(5));
auto m = M({{3, 4, 5, 4}}, C(3), test_allocator<int>(2));
@@ -46,7 +48,7 @@ void test() {
{
// other_allocator is propagated
using C = test_less<int>;
- using Ks = std::vector<int, other_allocator<int>>;
+ using Ks = KeyContainer<int, other_allocator<int>>;
auto ks = Ks({1, 3, 5, 3}, other_allocator<int>(6));
const int expected[] = {1, 3, 3, 5};
using M = std::flat_multiset<int, C, Ks>;
@@ -65,7 +67,7 @@ void test() {
auto keys2 = std::move(mo).extract();
assert(keys2.get_allocator() == other_allocator<int>(6));
}
- {
+ if (!TEST_IS_CONSTANT_EVALUATED) {
// comparator is copied and invariant is preserved
using M = std::flat_multiset<int, std::function<bool(int, int)>>;
M mo = M({1, 2}, std::less<int>());
@@ -103,8 +105,22 @@ void test() {
}
}
+constexpr bool test() {
+ test<std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test<std::deque>();
+
+ return true;
+}
+
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/default.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/default.pass.cpp
index 16f9032..3a7ff86 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/default.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/default.pass.cpp
@@ -25,28 +25,29 @@
#include "test_macros.h"
struct DefaultCtableComp {
- explicit DefaultCtableComp() { default_constructed_ = true; }
- bool operator()(int, int) const { return false; }
+ constexpr explicit DefaultCtableComp() { default_constructed_ = true; }
+ constexpr bool operator()(int, int) const { return false; }
bool default_constructed_ = false;
};
struct ThrowingCtorComp {
- ThrowingCtorComp() noexcept(false) {}
- bool operator()(const auto&, const auto&) const { return false; }
+ constexpr ThrowingCtorComp() noexcept(false) {}
+ constexpr bool operator()(const auto&, const auto&) const { return false; }
};
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
{
- std::flat_multiset<int> m;
+ std::flat_multiset<int, std::less<int>, KeyContainer<int>> m;
assert(m.empty());
}
{
// explicit(false)
- std::flat_multiset<int> m = {};
+ std::flat_multiset<int, std::less<int>, KeyContainer<int>> m = {};
assert(m.empty());
}
{
- std::flat_multiset<int, DefaultCtableComp, std::deque<int, min_allocator<int>>> m;
+ std::flat_multiset<int, DefaultCtableComp, KeyContainer<int, min_allocator<int>>> m;
assert(m.empty());
assert(m.begin() == m.end());
assert(m.key_comp().default_constructed_);
@@ -54,7 +55,7 @@ void test() {
{
using A1 = explicit_allocator<int>;
{
- std::flat_multiset<int, DefaultCtableComp, std::vector<int, A1>> m;
+ std::flat_multiset<int, DefaultCtableComp, KeyContainer<int, A1>> m;
assert(m.empty());
assert(m.key_comp().default_constructed_);
}
@@ -67,30 +68,46 @@ void test() {
}
#if defined(_LIBCPP_VERSION)
{
- using C = std::flat_multiset<MoveOnly>;
+ using C = std::flat_multiset<MoveOnly, std::less<MoveOnly>>;
static_assert(std::is_nothrow_default_constructible_v<C>);
C c;
}
{
- using C = std::flat_multiset<MoveOnly, std::less<MoveOnly>, std::vector<MoveOnly, test_allocator<MoveOnly>>>;
+ using C = std::flat_multiset<MoveOnly, std::less<MoveOnly>, KeyContainer<MoveOnly, test_allocator<MoveOnly>>>;
static_assert(std::is_nothrow_default_constructible_v<C>);
C c;
}
#endif // _LIBCPP_VERSION
{
- using C = std::flat_multiset<MoveOnly, std::less<MoveOnly>, std::vector<MoveOnly, other_allocator<MoveOnly>>>;
+ using C = std::flat_multiset<MoveOnly, std::less<MoveOnly>, KeyContainer<MoveOnly, other_allocator<MoveOnly>>>;
static_assert(!std::is_nothrow_default_constructible_v<C>);
C c;
}
{
- using C = std::flat_multiset<MoveOnly, ThrowingCtorComp>;
+ using C = std::flat_multiset<MoveOnly, ThrowingCtorComp, KeyContainer<MoveOnly>>;
static_assert(!std::is_nothrow_default_constructible_v<C>);
C c;
}
}
+constexpr bool test() {
+ test<std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ {
+ test<std::deque>();
+ }
+
+ return true;
+}
+
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/dtor_noexcept.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/dtor_noexcept.pass.cpp
index f852f2f..f7243fa 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/dtor_noexcept.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/dtor_noexcept.pass.cpp
@@ -23,39 +23,56 @@
#include "test_allocator.h"
struct ThrowingDtorComp {
- bool operator()(const auto&, const auto&) const;
- ~ThrowingDtorComp() noexcept(false) {}
+ constexpr bool operator()(const auto&, const auto&) const;
+ constexpr ~ThrowingDtorComp() noexcept(false) {}
};
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
{
- using C = std::flat_multiset<MoveOnly, MoveOnly>;
+ using C = std::flat_multiset<MoveOnly, std::less<MoveOnly>, KeyContainer<MoveOnly>>;
static_assert(std::is_nothrow_destructible_v<C>);
C c;
}
{
- using V = std::vector<MoveOnly, test_allocator<MoveOnly>>;
+ using V = KeyContainer<MoveOnly, test_allocator<MoveOnly>>;
using C = std::flat_multiset<MoveOnly, std::less<MoveOnly>, V>;
static_assert(std::is_nothrow_destructible_v<C>);
C c;
}
{
- using V = std::deque<MoveOnly, other_allocator<MoveOnly>>;
+ using V = KeyContainer<MoveOnly, other_allocator<MoveOnly>>;
using C = std::flat_multiset<MoveOnly, std::greater<MoveOnly>, V>;
static_assert(std::is_nothrow_destructible_v<C>);
C c;
}
#if defined(_LIBCPP_VERSION)
{
- using C = std::flat_multiset<MoveOnly, ThrowingDtorComp>;
+ using C = std::flat_multiset<MoveOnly, ThrowingDtorComp, KeyContainer<MoveOnly>>;
static_assert(!std::is_nothrow_destructible_v<C>);
C c;
}
#endif // _LIBCPP_VERSION
}
+constexpr bool test() {
+ test<std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ {
+ test<std::deque>();
+ }
+
+ return true;
+}
+
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/initializer_list.pass.cpp
index 10638d7..36f5def 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/initializer_list.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/initializer_list.pass.cpp
@@ -32,12 +32,13 @@
#include "../../../test_compare.h"
struct DefaultCtableComp {
- explicit DefaultCtableComp() { default_constructed_ = true; }
- bool operator()(int, int) const { return false; }
+ constexpr explicit DefaultCtableComp() { default_constructed_ = true; }
+ constexpr bool operator()(int, int) const { return false; }
bool default_constructed_ = false;
};
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
{
// The constructors in this subclause shall not participate in overload
// resolution unless uses_allocator_v<container_type, Alloc> is true.
@@ -45,8 +46,8 @@ void test() {
using C = test_less<int>;
using A1 = test_allocator<int>;
using A2 = other_allocator<int>;
- using V1 = std::vector<int, A1>;
- using V2 = std::vector<int, A2>;
+ using V1 = KeyContainer<int, A1>;
+ using V2 = KeyContainer<int, A2>;
using M1 = std::flat_multiset<int, C, V1>;
using M2 = std::flat_multiset<int, C, V2>;
using IL = std::initializer_list<int>;
@@ -60,10 +61,9 @@ void test() {
static_assert(!std::is_constructible_v<M1, IL, const C&, const A2&>);
static_assert(!std::is_constructible_v<M2, IL, const C&, const A1&>);
}
-
{
// initializer_list<value_type> needs to match exactly
- using M = std::flat_multiset<int>;
+ using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
using C = typename M::key_compare;
static_assert(std::is_constructible_v<M, std::initializer_list<int>>);
static_assert(std::is_constructible_v<M, std::initializer_list<int>, C>);
@@ -78,11 +78,10 @@ void test() {
static_assert(!std::is_constructible_v<M, std::initializer_list<const int>, C, std::allocator<int>>);
static_assert(!std::is_constructible_v<M, std::initializer_list<const int>, std::allocator<int>>);
}
-
int expected[] = {1, 2, 2, 3, 3, 5};
{
// flat_multiset(initializer_list<value_type>);
- using M = std::flat_multiset<int>;
+ using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
std::initializer_list<int> il = {5, 2, 2, 3, 1, 3};
M m(il);
assert(std::ranges::equal(m, expected));
@@ -90,13 +89,13 @@ void test() {
{
// flat_multiset(initializer_list<value_type>);
// explicit(false)
- using M = std::flat_multiset<int>;
+ using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
M m = {5, 2, 2, 3, 1, 3};
assert(std::ranges::equal(m, expected));
}
{
// flat_multiset(initializer_list<value_type>);
- using M = std::flat_multiset<int, std::greater<int>, std::deque<int, min_allocator<int>>>;
+ using M = std::flat_multiset<int, std::greater<int>, KeyContainer<int, min_allocator<int>>>;
M m = {5, 2, 2, 3, 1, 3};
assert(std::ranges::equal(m, expected | std::views::reverse));
}
@@ -105,15 +104,14 @@ void test() {
{
// flat_multiset(initializer_list<value_type>);
// different comparator
- using M = std::flat_multiset<int, DefaultCtableComp, std::vector<int, A>>;
+ using M = std::flat_multiset<int, DefaultCtableComp, KeyContainer<int, A>>;
M m = {1, 2, 3};
assert(m.size() == 3);
- LIBCPP_ASSERT(*m.begin() == 1);
assert(m.key_comp().default_constructed_);
}
{
// flat_multiset(initializer_list<value_type>, const Allocator&);
- using M = std::flat_multiset<int, std::greater<int>, std::deque<int, A>>;
+ using M = std::flat_multiset<int, std::greater<int>, KeyContainer<int, A>>;
A a;
M m({5, 2, 2, 3, 1, 3}, a);
assert(std::ranges::equal(m, expected | std::views::reverse));
@@ -122,7 +120,7 @@ void test() {
{
// flat_multiset(initializer_list<value_type>, const key_compare&);
using C = test_less<int>;
- using M = std::flat_multiset<int, C>;
+ using M = std::flat_multiset<int, C, KeyContainer<int>>;
auto m = M({5, 2, 2, 3, 1, 3}, C(10));
assert(std::ranges::equal(m, expected));
assert(m.key_comp() == C(10));
@@ -132,10 +130,10 @@ void test() {
assert(m2 == m);
assert(m2.key_comp() == C(10));
}
- {
+ if (!TEST_IS_CONSTANT_EVALUATED) {
// flat_multiset(initializer_list<value_type>, const key_compare&);
// Sorting uses the comparator that was passed in
- using M = std::flat_multiset<int, std::function<bool(int, int)>, std::deque<int, min_allocator<int>>>;
+ using M = std::flat_multiset<int, std::function<bool(int, int)>, KeyContainer<int, min_allocator<int>>>;
auto m = M({5, 2, 2, 1, 3, 3}, std::greater<int>());
assert(std::ranges::equal(m, expected | std::views::reverse));
assert(m.key_comp()(2, 1) == true);
@@ -143,15 +141,31 @@ void test() {
{
// flat_multiset(initializer_list<value_type> il, const key_compare& comp, const Alloc& a);
using A = explicit_allocator<int>;
- using M = std::flat_multiset<int, std::greater<int>, std::deque<int, A>>;
+ using M = std::flat_multiset<int, std::greater<int>, KeyContainer<int, A>>;
A a;
M m({5, 2, 2, 3, 1, 3}, {}, a);
assert(std::ranges::equal(m, expected | std::views::reverse));
}
}
+constexpr bool test() {
+ test<std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ {
+ test<std::deque>();
+ }
+
+ return true;
+}
+
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/iter_iter.pass.cpp
index da9aef3..0f757db 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/iter_iter.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/iter_iter.pass.cpp
@@ -30,7 +30,8 @@
#include "test_macros.h"
#include "../../../test_compare.h"
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
{
// The constructors in this subclause shall not participate in overload
// resolution unless uses_allocator_v<container_type, Alloc> is true.
@@ -38,8 +39,8 @@ void test() {
using C = test_less<int>;
using A1 = test_allocator<int>;
using A2 = other_allocator<int>;
- using V1 = std::vector<int, A1>;
- using V2 = std::vector<int, A2>;
+ using V1 = KeyContainer<int, A1>;
+ using V2 = KeyContainer<int, A2>;
using M1 = std::flat_multiset<int, C, V1>;
using M2 = std::flat_multiset<int, C, V2>;
using Iter1 = typename M1::iterator;
@@ -60,7 +61,7 @@ void test() {
{
// flat_multiset(InputIterator , InputIterator)
// cpp17_input_iterator
- using M = std::flat_multiset<int>;
+ using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
auto m = M(cpp17_input_iterator<const int*>(ar), cpp17_input_iterator<const int*>(ar + 9));
assert(std::ranges::equal(m, expected));
@@ -71,21 +72,21 @@ void test() {
{
// flat_multiset(InputIterator , InputIterator)
// greater
- using M = std::flat_multiset<int, std::greater<int>, std::deque<int, min_allocator<int>>>;
+ using M = std::flat_multiset<int, std::greater<int>, KeyContainer<int, min_allocator<int>>>;
auto m = M(cpp17_input_iterator<const int*>(ar), cpp17_input_iterator<const int*>(ar + 9));
assert(std::ranges::equal(m, expected | std::views::reverse));
}
{
// flat_multiset(InputIterator , InputIterator)
// Test when the operands are of array type (also contiguous iterator type)
- using M = std::flat_multiset<int, std::greater<int>, std::vector<int, min_allocator<int>>>;
+ using M = std::flat_multiset<int, std::greater<int>, KeyContainer<int, min_allocator<int>>>;
auto m = M(ar, ar);
assert(m.empty());
}
{
// flat_multiset(InputIterator , InputIterator, const key_compare&)
using C = test_less<int>;
- using M = std::flat_multiset<int, C, std::vector<int>>;
+ using M = std::flat_multiset<int, C, KeyContainer<int>>;
auto m = M(ar, ar + 9, C(3));
assert(std::ranges::equal(m, expected));
assert(m.key_comp() == C(3));
@@ -98,7 +99,7 @@ void test() {
{
// flat_multiset(InputIterator , InputIterator, const Allocator&)
using A1 = test_allocator<int>;
- using M = std::flat_multiset<int, std::less<int>, std::vector<int, A1>>;
+ using M = std::flat_multiset<int, std::less<int>, KeyContainer<int, A1>>;
auto m = M(ar, ar + 9, A1(5));
assert(std::ranges::equal(m, expected));
assert(std::move(m).extract().get_allocator() == A1(5));
@@ -107,7 +108,7 @@ void test() {
// flat_multiset(InputIterator , InputIterator, const Allocator&)
// explicit(false)
using A1 = test_allocator<int>;
- using M = std::flat_multiset<int, std::less<int>, std::vector<int, A1>>;
+ using M = std::flat_multiset<int, std::less<int>, KeyContainer<int, A1>>;
M m = {ar, ar + 9, A1(5)}; // implicit ctor
assert(std::ranges::equal(m, expected));
assert(std::move(m).extract().get_allocator() == A1(5));
@@ -116,7 +117,7 @@ void test() {
// flat_multiset(InputIterator , InputIterator, const key_compare&, const Allocator&)
using C = test_less<int>;
using A1 = test_allocator<int>;
- using M = std::flat_multiset<int, C, std::vector<int, A1>>;
+ using M = std::flat_multiset<int, C, KeyContainer<int, A1>>;
auto m = M(ar, ar + 9, C(3), A1(5));
assert(std::ranges::equal(m, expected));
assert(m.key_comp() == C(3));
@@ -126,7 +127,7 @@ void test() {
// flat_multiset(InputIterator , InputIterator, const key_compare&, const Allocator&)
// explicit(false)
using A1 = test_allocator<int>;
- using M = std::flat_multiset<int, std::less<int>, std::deque<int, A1>>;
+ using M = std::flat_multiset<int, std::less<int>, KeyContainer<int, A1>>;
M m = {ar, ar + 9, {}, A1(5)}; // implicit ctor
assert(std::ranges::equal(m, expected));
LIBCPP_ASSERT(std::ranges::equal(m, expected));
@@ -134,8 +135,21 @@ void test() {
}
}
+constexpr bool test() {
+ test<std::vector>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test<std::deque>();
+
+ return true;
+}
+
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move.pass.cpp
index 825ad75..7fb0c0e 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move.pass.cpp
@@ -25,11 +25,12 @@
#include "test_allocator.h"
#include "min_allocator.h"
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
{
using C = test_less<int>;
using A = test_allocator<int>;
- using M = std::flat_multiset<int, C, std::deque<int, A>>;
+ using M = std::flat_multiset<int, C, KeyContainer<int, A>>;
M mo = M({1, 2, 1, 3}, C(5), A(7));
M m = std::move(mo);
assert((m == M{1, 1, 2, 3}));
@@ -43,7 +44,7 @@ void test() {
{
using C = test_less<int>;
using A = min_allocator<int>;
- using M = std::flat_multiset<int, C, std::vector<int, A>>;
+ using M = std::flat_multiset<int, C, KeyContainer<int, A>>;
M mo = M({1, 2, 1, 3}, C(5), A());
M m = std::move(mo);
assert((m == M{1, 1, 2, 3}));
@@ -54,9 +55,9 @@ void test() {
assert(mo.key_comp() == C(5));
assert(std::move(mo).extract().get_allocator() == A());
}
- {
+ if (!TEST_IS_CONSTANT_EVALUATED) {
// A moved-from flat_multiset maintains its class invariant in the presence of moved-from comparators.
- using M = std::flat_multiset<int, std::function<bool(int, int)>>;
+ using M = std::flat_multiset<int, std::function<bool(int, int)>, KeyContainer<int>>;
M mo = M({1, 2, 1, 3}, std::less<int>());
M m = std::move(mo);
assert(m.size() == 4);
@@ -81,6 +82,16 @@ void test() {
}
}
+constexpr bool test() {
+ test<std::vector>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test<std::deque>();
+
+ return true;
+}
+
template <class T>
struct ThrowingMoveAllocator {
using value_type = T;
@@ -179,6 +190,9 @@ void test_move_exception() {
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
test_move_noexcept();
#if !defined(TEST_HAS_NO_EXCEPTIONS)
test_move_exception();
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_alloc.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_alloc.pass.cpp
index ee8258e..1f095ed 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_alloc.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_alloc.pass.cpp
@@ -24,7 +24,8 @@
#include "../../../test_compare.h"
#include "test_allocator.h"
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
{
// The constructors in this subclause shall not participate in overload
// resolution unless uses_allocator_v<container_type, Alloc> is true.
@@ -32,8 +33,8 @@ void test() {
using C = test_less<int>;
using A1 = test_allocator<int>;
using A2 = other_allocator<int>;
- using V1 = std::vector<int, A1>;
- using V2 = std::vector<int, A2>;
+ using V1 = KeyContainer<int, A1>;
+ using V2 = KeyContainer<int, A2>;
using M1 = std::flat_multiset<int, C, V1>;
using M2 = std::flat_multiset<int, C, V2>;
static_assert(std::is_constructible_v<M1, M1&&, const A1&>);
@@ -45,7 +46,7 @@ void test() {
int expected[] = {1, 1, 2, 2, 3};
using C = test_less<int>;
using A = test_allocator<int>;
- using M = std::flat_multiset<int, C, std::deque<int, A>>;
+ using M = std::flat_multiset<int, C, KeyContainer<int, A>>;
auto mo = M(expected, expected + 5, C(5), A(7));
auto m = M(std::move(mo), A(3));
@@ -72,8 +73,21 @@ void test() {
}
}
+constexpr bool test() {
+ test<std::vector>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test<std::deque>();
+
+ return true;
+}
+
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_assign.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_assign.pass.cpp
index 96e046e..62e2181 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_assign.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_assign.pass.cpp
@@ -187,25 +187,12 @@ void test_move_assign_no_except() {
}
}
-void test() {
- {
- using C = test_less<int>;
- using A1 = test_allocator<int>;
- using M = std::flat_multiset<int, C, std::vector<int, A1>>;
- M mo = M({1, 1, 2, 3}, C(5), A1(7));
- M m = M({}, C(3), A1(7));
- std::same_as<M&> decltype(auto) r = m = std::move(mo);
- assert(&r == &m);
- assert((m == M{1, 1, 2, 3}));
- assert(m.key_comp() == C(5));
- auto ks = std::move(m).extract();
- assert(ks.get_allocator() == A1(7));
- assert(mo.empty());
- }
+template <template <class...> class KeyContainer>
+constexpr void test() {
{
using C = test_less<int>;
using A1 = other_allocator<int>;
- using M = std::flat_multiset<int, C, std::deque<int, A1>>;
+ using M = std::flat_multiset<int, C, KeyContainer<int, A1>>;
M mo = M({4, 4, 5}, C(5), A1(7));
M m = M({1, 1, 2, 3, 4}, C(3), A1(7));
std::same_as<M&> decltype(auto) r = m = std::move(mo);
@@ -218,7 +205,7 @@ void test() {
}
{
using A = min_allocator<int>;
- using M = std::flat_multiset<int, std::greater<int>, std::vector<int, A>>;
+ using M = std::flat_multiset<int, std::greater<int>, KeyContainer<int, A>>;
M mo = M({5, 3, 4, 3}, A());
M m = M({4, 1, 3, 2, 1}, A());
std::same_as<M&> decltype(auto) r = m = std::move(mo);
@@ -228,10 +215,37 @@ void test() {
assert(ks.get_allocator() == A());
assert(mo.empty());
}
+ {
+ using C = test_less<int>;
+ using A1 = test_allocator<int>;
+ using M = std::flat_multiset<int, C, KeyContainer<int, A1>>;
+ M mo = M({1, 1, 2, 3}, C(5), A1(7));
+ M m = M({}, C(3), A1(7));
+ std::same_as<M&> decltype(auto) r = m = std::move(mo);
+ assert(&r == &m);
+ assert((m == M{1, 1, 2, 3}));
+ assert(m.key_comp() == C(5));
+ auto ks = std::move(m).extract();
+ assert(ks.get_allocator() == A1(7));
+ assert(mo.empty());
+ }
+}
+
+constexpr bool test() {
+ test<std::vector>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test<std::deque>();
+
+ return true;
}
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
test_move_assign_clears();
test_move_assign_no_except();
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/range.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/range.pass.cpp
index 76485b4..36501a5 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/range.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/range.pass.cpp
@@ -56,7 +56,8 @@ static_assert(
!std::
is_constructible_v<Set, std::from_range_t, RangeOf<std::pair<int, int>>, std::less<int>, std::allocator<int>>);
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
{
// The constructors in this subclause shall not participate in overload
// resolution unless uses_allocator_v<container_type, Alloc> is true.
@@ -64,8 +65,8 @@ void test() {
using C = test_less<int>;
using A1 = test_allocator<int>;
using A2 = other_allocator<int>;
- using V1 = std::vector<int, A1>;
- using V2 = std::vector<int, A2>;
+ using V1 = KeyContainer<int, A1>;
+ using V2 = KeyContainer<int, A2>;
using M1 = std::flat_multiset<int, C, V1>;
using M2 = std::flat_multiset<int, C, V2>;
static_assert(std::is_constructible_v<M1, std::from_range_t, M1, const A1&>);
@@ -84,7 +85,7 @@ void test() {
{
// flat_multiset(from_range_t, R&&)
// input_range && !common
- using M = std::flat_multiset<int>;
+ using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
using Iter = cpp20_input_iterator<const int*>;
using Sent = sentinel_wrapper<Iter>;
using R = std::ranges::subrange<Iter, Sent>;
@@ -98,17 +99,17 @@ void test() {
{
// flat_multiset(from_range_t, R&&)
// greater
- using M = std::flat_multiset<int, std::greater<int>, std::deque<int, min_allocator<int>>>;
+ using M = std::flat_multiset<int, std::greater<int>, KeyContainer<int, min_allocator<int>>>;
using Iter = cpp20_input_iterator<const int*>;
using Sent = sentinel_wrapper<Iter>;
using R = std::ranges::subrange<Iter, Sent>;
auto m = M(std::from_range, R(Iter(ar), Sent(Iter(ar + 9))));
- assert(std::ranges::equal(m, std::deque<int, min_allocator<int>>{3, 3, 3, 2, 2, 2, 1, 1, 1}));
+ assert(std::ranges::equal(m, KeyContainer<int, min_allocator<int>>{3, 3, 3, 2, 2, 2, 1, 1, 1}));
}
{
// flat_multiset(from_range_t, R&&)
// contiguous range
- using M = std::flat_multiset<int>;
+ using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
using R = std::ranges::subrange<const int*>;
auto m = M(std::from_range, R(ar, ar + 9));
assert(std::ranges::equal(m, expected));
@@ -116,7 +117,7 @@ void test() {
{
// flat_multiset(from_range_t, R&&, const key_compare&)
using C = test_less<int>;
- using M = std::flat_multiset<int, C, std::vector<int>>;
+ using M = std::flat_multiset<int, C, KeyContainer<int>>;
using R = std::ranges::subrange<const int*>;
auto m = M(std::from_range, R(ar, ar + 9), C(3));
assert(std::ranges::equal(m, expected));
@@ -130,7 +131,7 @@ void test() {
{
// flat_multiset(from_range_t, R&&, const Allocator&)
using A1 = test_allocator<int>;
- using M = std::flat_multiset<int, std::less<int>, std::vector<int, A1>>;
+ using M = std::flat_multiset<int, std::less<int>, KeyContainer<int, A1>>;
using R = std::ranges::subrange<const int*>;
auto m = M(std::from_range, R(ar, ar + 9), A1(5));
assert(std::ranges::equal(m, expected));
@@ -140,7 +141,7 @@ void test() {
// flat_multiset(from_range_t, R&&, const Allocator&)
// explicit(false)
using A1 = test_allocator<int>;
- using M = std::flat_multiset<int, std::less<int>, std::deque<int, A1>>;
+ using M = std::flat_multiset<int, std::less<int>, KeyContainer<int, A1>>;
using R = std::ranges::subrange<const int*>;
M m = {std::from_range, R(ar, ar + 9), A1(5)}; // implicit ctor
assert(std::ranges::equal(m, expected));
@@ -150,7 +151,7 @@ void test() {
// flat_multiset(from_range_t, R&&, const key_compare&, const Allocator&)
using C = test_less<int>;
using A1 = test_allocator<int>;
- using M = std::flat_multiset<int, C, std::vector<int, A1>>;
+ using M = std::flat_multiset<int, C, KeyContainer<int, A1>>;
using R = std::ranges::subrange<const int*>;
auto m = M(std::from_range, R(ar, ar + 9), C(3), A1(5));
assert(std::ranges::equal(m, expected));
@@ -161,7 +162,7 @@ void test() {
// flat_multiset(from_range_t, R&&, const key_compare&, const Allocator&)
// explicit(false)
using A1 = test_allocator<int>;
- using M = std::flat_multiset<int, std::less<int>, std::deque<int, A1>>;
+ using M = std::flat_multiset<int, std::less<int>, KeyContainer<int, A1>>;
using R = std::ranges::subrange<const int*>;
M m = {std::from_range, R(ar, ar + 9), {}, A1(5)}; // implicit ctor
assert(std::ranges::equal(m, expected));
@@ -169,8 +170,21 @@ void test() {
}
}
+constexpr bool test() {
+ test<std::vector>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test<std::deque>();
+
+ return true;
+}
+
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_container.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_container.pass.cpp
index 76759be..60fd70a 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_container.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_container.pass.cpp
@@ -30,7 +30,8 @@
#include "test_macros.h"
#include "../../../test_compare.h"
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
{
// The constructors in this subclause shall not participate in overload
// resolution unless uses_allocator_v<container_type, Alloc> is true.
@@ -38,8 +39,8 @@ void test() {
using C = test_less<int>;
using A1 = test_allocator<int>;
using A2 = other_allocator<int>;
- using V1 = std::vector<int, A1>;
- using V2 = std::vector<int, A2>;
+ using V1 = KeyContainer<int, A1>;
+ using V2 = KeyContainer<int, A2>;
using M1 = std::flat_multiset<int, C, V1>;
using M2 = std::flat_multiset<int, C, V2>;
static_assert(std::is_constructible_v<M1, std::sorted_equivalent_t, const V1&, const A1&>);
@@ -52,11 +53,12 @@ void test() {
static_assert(!std::is_constructible_v<M1, std::sorted_equivalent_t, const V1&, const C&, const A2&>);
static_assert(!std::is_constructible_v<M2, std::sorted_equivalent_t, const V2&, const C&, const A1&>);
}
+
{
// flat_multiset(sorted_equivalent_t, container_type)
- using M = std::flat_multiset<int>;
- std::vector<int> ks = {1, 2, 2, 4, 10};
- auto ks2 = ks;
+ using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
+ KeyContainer<int> ks = {1, 2, 2, 4, 10};
+ auto ks2 = ks;
auto m = M(std::sorted_equivalent, ks);
assert((m == M{1, 2, 2, 4, 10}));
@@ -71,7 +73,7 @@ void test() {
{
// flat_multiset(sorted_equivalent_t, container_type)
// non-default container, comparator and allocator type
- using Ks = std::deque<int, min_allocator<int>>;
+ using Ks = KeyContainer<int, min_allocator<int>>;
using M = std::flat_multiset<int, std::greater<int>, Ks>;
Ks ks = {10, 4, 4, 2, 1};
auto m = M(std::sorted_equivalent, ks);
@@ -84,8 +86,8 @@ void test() {
// flat_multiset(sorted_equivalent_t, container_type)
// allocator copied into the containers
using A = test_allocator<int>;
- using M = std::flat_multiset<int, std::less<int>, std::deque<int, A>>;
- auto ks = std::deque<int, A>({1, 2, 2, 4, 10}, A(4));
+ using M = std::flat_multiset<int, std::less<int>, KeyContainer<int, A>>;
+ auto ks = KeyContainer<int, A>({1, 2, 2, 4, 10}, A(4));
auto m = M(std::sorted_equivalent, std::move(ks));
assert(ks.empty()); // it was moved-from
assert((m == M{1, 2, 2, 4, 10}));
@@ -93,9 +95,9 @@ void test() {
}
{
// flat_multiset(sorted_equivalent_t, container_type , key_compare)
- using C = test_less<int>;
- using M = std::flat_multiset<int, C>;
- std::vector<int> ks = {1, 2, 2, 4, 10};
+ using C = test_less<int>;
+ using M = std::flat_multiset<int, C, KeyContainer<int>>;
+ KeyContainer<int> ks = {1, 2, 2, 4, 10};
auto m = M(std::sorted_equivalent, ks, C(4));
assert((m == M{1, 2, 2, 4, 10}));
@@ -108,11 +110,11 @@ void test() {
}
{
// flat_multiset(sorted_equivalent_t, container_type , key_compare, const Allocator&)
- using C = test_less<int>;
- using A = test_allocator<int>;
- using M = std::flat_multiset<int, C, std::vector<int, A>>;
- std::vector<int, A> ks = {1, 2, 2, 4, 10};
- auto m = M(std::sorted_equivalent, ks, C(4), A(5));
+ using C = test_less<int>;
+ using A = test_allocator<int>;
+ using M = std::flat_multiset<int, C, KeyContainer<int, A>>;
+ KeyContainer<int, A> ks = {1, 2, 2, 4, 10};
+ auto m = M(std::sorted_equivalent, ks, C(4), A(5));
assert((m == M{1, 2, 2, 4, 10}));
assert(m.key_comp() == C(4));
assert(M(m).extract().get_allocator() == A(5));
@@ -126,8 +128,8 @@ void test() {
{
// flat_multiset(sorted_equivalent_t, container_type , const Allocator&)
using A = test_allocator<int>;
- using M = std::flat_multiset<int, std::less<int>, std::deque<int, A>>;
- auto ks = std::deque<int, A>({1, 2, 2, 4, 10}, A(4));
+ using M = std::flat_multiset<int, std::less<int>, KeyContainer<int, A>>;
+ auto ks = KeyContainer<int, A>({1, 2, 2, 4, 10}, A(4));
auto m = M(std::sorted_equivalent, ks, A(6)); // replaces the allocators
assert(!ks.empty()); // it was an lvalue above
assert((m == M{1, 2, 2, 4, 10}));
@@ -140,8 +142,22 @@ void test() {
}
}
+constexpr bool test() {
+ test<std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test<std::deque>();
+
+ return true;
+}
+
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_initializer_list.pass.cpp
index 955662d..ff10c97 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_initializer_list.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_initializer_list.pass.cpp
@@ -31,12 +31,13 @@
#include "../../../test_compare.h"
template <class T>
-std::initializer_list<T> il = {1, 2, 4, 4, 5};
+constexpr std::initializer_list<T> il = {1, 2, 4, 4, 5};
-void test() {
- const auto il1 = il<int>;
- const auto il2 = il<short>;
+constexpr auto il1 = il<int>;
+constexpr auto il2 = il<short>;
+template <template <class...> class KeyContainer>
+constexpr void test() {
{
// The constructors in this subclause shall not participate in overload
// resolution unless uses_allocator_v<container_type, Alloc> is true.
@@ -44,8 +45,8 @@ void test() {
using C = test_less<int>;
using A1 = test_allocator<int>;
using A2 = other_allocator<int>;
- using V1 = std::vector<int, A1>;
- using V2 = std::vector<int, A2>;
+ using V1 = KeyContainer<int, A1>;
+ using V2 = KeyContainer<int, A2>;
using M1 = std::flat_multiset<int, C, V1>;
using M2 = std::flat_multiset<int, C, V2>;
using IL = std::initializer_list<int>;
@@ -62,7 +63,7 @@ void test() {
}
{
// initializer_list<value_type> needs to match exactly
- using M = std::flat_multiset<int>;
+ using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
using C = typename M::key_compare;
static_assert(std::is_constructible_v<M, std::sorted_equivalent_t, std::initializer_list<int>>);
static_assert(std::is_constructible_v<M, std::sorted_equivalent_t, std::initializer_list<int>, C>);
@@ -88,7 +89,7 @@ void test() {
{
// flat_multiset(sorted_equivalent_t, initializer_list<value_type>);
- using M = std::flat_multiset<int>;
+ using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
auto m = M(std::sorted_equivalent, il1);
auto expected = M{1, 2, 4, 4, 5};
assert(m == expected);
@@ -97,9 +98,9 @@ void test() {
M m2 = {std::sorted_equivalent, il1};
assert(m2 == m);
}
- {
+ if (!TEST_IS_CONSTANT_EVALUATED) {
// flat_multiset(sorted_equivalent_t, initializer_list<value_type>, const key_compare&);
- using M = std::flat_multiset<int, std::function<bool(int, int)>>;
+ using M = std::flat_multiset<int, std::function<bool(int, int)>, KeyContainer<int>>;
auto m = M(std::sorted_equivalent, il1, std::less<int>());
assert(m == M({1, 2, 4, 4, 5}, std::less<>()));
assert(m.key_comp()(1, 2) == true);
@@ -111,7 +112,7 @@ void test() {
{
// flat_multiset(sorted_equivalent_t, initializer_list<value_type>, const key_compare&);
// greater
- using M = std::flat_multiset<int, std::greater<int>, std::deque<int, min_allocator<int>>>;
+ using M = std::flat_multiset<int, std::greater<int>, KeyContainer<int, min_allocator<int>>>;
std::initializer_list<int> il4{5, 4, 4, 2, 1};
auto m = M(std::sorted_equivalent, il4, std::greater<int>());
assert((m == M{5, 4, 4, 2, 1}));
@@ -119,7 +120,7 @@ void test() {
{
// flat_multiset(sorted_equivalent_t, initializer_list<value_type>, const Allocator&)
using A1 = test_allocator<short>;
- using M = std::flat_multiset<short, std::less<int>, std::deque<short, A1>>;
+ using M = std::flat_multiset<short, std::less<int>, KeyContainer<short, A1>>;
auto m = M(std::sorted_equivalent, il2, A1(5));
auto expected = M{1, 2, 4, 4, 5};
assert(m == expected);
@@ -134,7 +135,7 @@ void test() {
// flat_multiset(sorted_equivalent_t, initializer_list<value_type>, const key_compare&, const Allocator&);
using C = test_less<int>;
using A1 = test_allocator<short>;
- using M = std::flat_multiset<short, C, std::vector<short, A1>>;
+ using M = std::flat_multiset<short, C, KeyContainer<short, A1>>;
auto m = M(std::sorted_equivalent, il2, C(3), A1(5));
assert((m == M{1, 2, 4, 4, 5}));
assert(m.key_comp() == C(3));
@@ -144,15 +145,29 @@ void test() {
// flat_multiset(sorted_equivalent_t, initializer_list<value_type>, const key_compare&, const Allocator&);
// explicit(false)
using A1 = test_allocator<short>;
- using M = std::flat_multiset<short, std::less<int>, std::deque<short, A1>>;
+ using M = std::flat_multiset<short, std::less<int>, KeyContainer<short, A1>>;
M m = {std::sorted_equivalent, il2, {}, A1(5)}; // implicit ctor
assert((m == M{1, 2, 4, 4, 5}));
assert(std::move(m).extract().get_allocator() == A1(5));
}
}
+constexpr bool test() {
+ test<std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test<std::deque>();
+
+ return true;
+}
+
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_iter_iter.pass.cpp
index 9ebe45d..a3c9981 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_iter_iter.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_iter_iter.pass.cpp
@@ -28,7 +28,8 @@
#include "test_macros.h"
#include "../../../test_compare.h"
-void test() {
+template <template <class...> class KeyContainer>
+constexpr void test() {
{
// The constructors in this subclause shall not participate in overload
// resolution unless uses_allocator_v<container_type, Alloc> is true.
@@ -36,8 +37,8 @@ void test() {
using C = test_less<int>;
using A1 = test_allocator<int>;
using A2 = other_allocator<int>;
- using V1 = std::vector<int, A1>;
- using V2 = std::vector<int, A2>;
+ using V1 = KeyContainer<int, A1>;
+ using V2 = KeyContainer<int, A2>;
using M1 = std::flat_multiset<int, C, V1>;
using M2 = std::flat_multiset<int, C, V2>;
using Iter1 = typename M1::iterator;
@@ -52,10 +53,12 @@ void test() {
static_assert(!std::is_constructible_v<M1, std::sorted_equivalent_t, Iter1, Iter1, const C&, const A2&>);
static_assert(!std::is_constructible_v<M2, std::sorted_equivalent_t, Iter2, Iter2, const C&, const A1&>);
}
+
{
// flat_multiset(sorted_equivalent_t, InputIterator, InputIterator);
// cpp17_input_iterator
- using M = std::flat_multiset<int>;
+ using M = std::flat_multiset<int, std::less<int>, KeyContainer<int>>;
+
int ar[] = {1, 2, 2, 4, 5};
auto m = M(std::sorted_equivalent, cpp17_input_iterator<const int*>(ar), cpp17_input_iterator<const int*>(ar + 5));
auto expected = M{1, 2, 2, 4, 5};
@@ -69,16 +72,16 @@ void test() {
// flat_multiset(sorted_equivalent_t, InputIterator, InputIterator);
// contiguous iterator
using C = test_less<int>;
- using M = std::flat_multiset<int, C, std::vector<int, min_allocator<int>>>;
+ using M = std::flat_multiset<int, C, KeyContainer<int, min_allocator<int>>>;
int ar[] = {1, 2, 4, 4, 5};
auto m = M(std::sorted_equivalent, ar, ar + 5);
auto expected = M{1, 2, 4, 4, 5};
assert(m == expected);
}
- {
+ if (!TEST_IS_CONSTANT_EVALUATED) {
// flat_multiset(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&);
// cpp_17_input_iterator
- using M = std::flat_multiset<int, std::function<bool(int, int)>>;
+ using M = std::flat_multiset<int, std::function<bool(int, int)>, KeyContainer<int>>;
int ar[] = {1, 2, 4, 4, 5};
auto m = M(std::sorted_equivalent,
cpp17_input_iterator<const int*>(ar),
@@ -97,7 +100,7 @@ void test() {
{
// flat_multiset(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&);
// greater
- using M = std::flat_multiset<int, std::greater<int>, std::deque<int, min_allocator<int>>>;
+ using M = std::flat_multiset<int, std::greater<int>, KeyContainer<int, min_allocator<int>>>;
int ar[] = {5, 4, 4, 2, 1};
auto m = M(std::sorted_equivalent,
cpp17_input_iterator<const int*>(ar),
@@ -109,7 +112,7 @@ void test() {
// flat_multiset(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&);
// contiguous iterator
using C = test_less<int>;
- using M = std::flat_multiset<int, C, std::vector<int, min_allocator<int>>>;
+ using M = std::flat_multiset<int, C, KeyContainer<int, min_allocator<int>>>;
int ar[1] = {42};
auto m = M(std::sorted_equivalent, ar, ar, C(5));
assert(m.empty());
@@ -118,7 +121,7 @@ void test() {
{
// flat_multiset(sorted_equivalent_t, InputIterator , InputIterator, const Allocator&)
using A1 = test_allocator<int>;
- using M = std::flat_multiset<int, std::less<int>, std::vector<int, A1>>;
+ using M = std::flat_multiset<int, std::less<int>, KeyContainer<int, A1>>;
int ar[] = {1, 2, 4, 4, 5};
auto m = M(std::sorted_equivalent, ar, ar + 5, A1(5));
auto expected = M{1, 2, 4, 4, 5};
@@ -134,7 +137,7 @@ void test() {
// flat_multiset(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&, const Allocator&);
using C = test_less<int>;
using A1 = test_allocator<int>;
- using M = std::flat_multiset<int, C, std::deque<int, A1>>;
+ using M = std::flat_multiset<int, C, KeyContainer<int, A1>>;
int ar[] = {1, 2, 4, 4, 5};
auto m = M(std::sorted_equivalent, ar, ar + 5, C(3), A1(5));
assert((m == M{1, 2, 4, 4, 5}));
@@ -145,7 +148,7 @@ void test() {
// flat_multiset(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&, const Allocator&);
// explicit(false)
using A1 = test_allocator<short>;
- using M = std::flat_multiset<short, std::less<int>, std::deque<short, A1>>;
+ using M = std::flat_multiset<short, std::less<int>, KeyContainer<short, A1>>;
int ar[] = {1, 2, 4, 4, 5};
M m = {std::sorted_equivalent, ar, ar + 5, {}, A1(5)}; // implicit ctor
assert((m == M{1, 2, 4, 4, 5}));
@@ -153,8 +156,22 @@ void test() {
}
}
+constexpr bool test() {
+ test<std::vector>();
+
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test<std::deque>();
+
+ return true;
+}
+
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.erasure/erase_if.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.erasure/erase_if.pass.cpp
index 21f3c91..337ad04 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.erasure/erase_if.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.erasure/erase_if.pass.cpp
@@ -32,7 +32,7 @@ static_assert(HasStdErase<std::vector<int>>);
static_assert(!HasStdErase<std::flat_multiset<int>>);
template <class M>
-M make(std::initializer_list<int> vals) {
+constexpr M make(std::initializer_list<int> vals) {
M ret;
for (int v : vals)
ret.emplace(v);
@@ -40,8 +40,8 @@ M make(std::initializer_list<int> vals) {
}
template <class M, class Pred>
-void test0(
- std::initializer_list<int> vals, Pred p, std::initializer_list<int> expected, std::size_t expected_erased_count) {
+constexpr void
+test0(std::initializer_list<int> vals, Pred p, std::initializer_list<int> expected, std::size_t expected_erased_count) {
M s = make<M>(vals);
ASSERT_SAME_TYPE(typename M::size_type, decltype(std::erase_if(s, p)));
assert(expected_erased_count == std::erase_if(s, p));
@@ -50,11 +50,11 @@ void test0(
struct NotBool {
bool b;
- explicit operator bool() const { return b; }
+ explicit constexpr operator bool() const { return b; }
};
template <class S>
-void test_one() {
+constexpr void test_one() {
// Test all the plausible signatures for this predicate.
auto is1 = [](typename S::const_reference v) { return v == 1; };
auto is2 = [](typename S::value_type v) { return v == 2; };
@@ -96,18 +96,28 @@ void test_one() {
test0<S>({1, 1, 2, 2, 3}, nonBoolIs1, {2, 2, 3}, 2);
}
-void test() {
+constexpr bool test() {
test_one<std::flat_multiset<int>>();
test_one<std::flat_multiset<int, std::less<int>, std::vector<int, min_allocator<int>>>>();
test_one<std::flat_multiset<int, std::greater<int>, std::vector<int, test_allocator<int>>>>();
- test_one<std::flat_multiset<int, std::less<int>, std::deque<int, min_allocator<int>>>>();
- test_one<std::flat_multiset<int, std::greater<int>, std::deque<int, test_allocator<int>>>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ {
+ test_one<std::flat_multiset<int, std::less<int>, std::deque<int, min_allocator<int>>>>();
+ test_one<std::flat_multiset<int, std::greater<int>, std::deque<int, test_allocator<int>>>>();
+ }
test_one<std::flat_multiset<long>>();
test_one<std::flat_multiset<double>>();
+
+ return true;
}
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator.pass.cpp
index 809f03d..878b2b2 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator.pass.cpp
@@ -30,7 +30,7 @@
#include "min_allocator.h"
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using Key = typename KeyContainer::value_type;
using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
@@ -68,9 +68,12 @@ void test_one() {
assert(i == m.begin());
}
-void test() {
+constexpr bool test() {
test_one<std::vector<int>>();
- test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<int>>();
test_one<MinSequenceContainer<int>>();
test_one<std::vector<int, min_allocator<int>>>();
@@ -89,10 +92,15 @@ void test() {
assert(!(ii1 != cii));
assert(!(cii != ii1));
}
+
+ return true;
}
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator_comparison.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator_comparison.pass.cpp
index cbf69d6..ff4ad3f 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator_comparison.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator_comparison.pass.cpp
@@ -24,7 +24,7 @@
#include "min_allocator.h"
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using Key = typename KeyContainer::value_type;
using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
using I = M::iterator;
@@ -141,15 +141,23 @@ void test_one() {
assert(cri2 <=> cri1 == std::strong_ordering::greater);
}
-void test() {
+constexpr bool test() {
test_one<std::vector<int>>();
- test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<int>>();
test_one<MinSequenceContainer<int>>();
test_one<std::vector<int, min_allocator<int>>>();
+
+ return true;
}
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/reverse_iterator.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/reverse_iterator.pass.cpp
index e25d786..678109b 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/reverse_iterator.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/reverse_iterator.pass.cpp
@@ -25,46 +25,59 @@
#include <iterator>
+#include "MinSequenceContainer.h"
#include "test_macros.h"
+#include "min_allocator.h"
-void test() {
- {
- using M = std::flat_multiset<int, std::less<int>, std::deque<int>>;
- M m = {1, 1, 2, 2, 3, 4};
- int expected[] = {1, 1, 2, 2, 3, 4};
- const M& cm = m;
- ASSERT_SAME_TYPE(decltype(m.rbegin()), M::reverse_iterator);
- ASSERT_SAME_TYPE(decltype(m.crbegin()), M::const_reverse_iterator);
- ASSERT_SAME_TYPE(decltype(cm.rbegin()), M::const_reverse_iterator);
- ASSERT_SAME_TYPE(decltype(m.rend()), M::reverse_iterator);
- ASSERT_SAME_TYPE(decltype(m.crend()), M::const_reverse_iterator);
- ASSERT_SAME_TYPE(decltype(cm.rend()), M::const_reverse_iterator);
- static_assert(noexcept(m.rbegin()));
- static_assert(noexcept(cm.rbegin()));
- static_assert(noexcept(m.crbegin()));
- static_assert(noexcept(m.rend()));
- static_assert(noexcept(cm.rend()));
- static_assert(noexcept(m.crend()));
- assert(m.size() == 6);
- assert(std::distance(m.rbegin(), m.rend()) == 6);
- assert(std::distance(cm.rbegin(), cm.rend()) == 6);
- assert(std::distance(m.crbegin(), m.crend()) == 6);
- assert(std::distance(cm.crbegin(), cm.crend()) == 6);
- M::reverse_iterator i; // default-construct
- ASSERT_SAME_TYPE(decltype(*i), const int&);
- i = m.rbegin(); // move-assignment
- M::const_reverse_iterator k = i; // converting constructor
- assert(i == k); // comparison
- for (int j = 5; j >= 0; --j, ++i) { // pre-increment
- assert(*i == expected[j]);
- }
- assert(i == m.rend());
- for (int j = 0; j <= 5; ++j) {
- --i; // pre-decrement
- assert(*i == expected[j]);
- }
- assert(i == m.rbegin());
+template <class KeyContainer>
+constexpr void test_one() {
+ using Key = typename KeyContainer::value_type;
+ using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
+ M m = {1, 1, 2, 2, 3, 4};
+ int expected[] = {1, 1, 2, 2, 3, 4};
+ const M& cm = m;
+ ASSERT_SAME_TYPE(decltype(m.rbegin()), typename M::reverse_iterator);
+ ASSERT_SAME_TYPE(decltype(m.crbegin()), typename M::const_reverse_iterator);
+ ASSERT_SAME_TYPE(decltype(cm.rbegin()), typename M::const_reverse_iterator);
+ ASSERT_SAME_TYPE(decltype(m.rend()), typename M::reverse_iterator);
+ ASSERT_SAME_TYPE(decltype(m.crend()), typename M::const_reverse_iterator);
+ ASSERT_SAME_TYPE(decltype(cm.rend()), typename M::const_reverse_iterator);
+ static_assert(noexcept(m.rbegin()));
+ static_assert(noexcept(cm.rbegin()));
+ static_assert(noexcept(m.crbegin()));
+ static_assert(noexcept(m.rend()));
+ static_assert(noexcept(cm.rend()));
+ static_assert(noexcept(m.crend()));
+ assert(m.size() == 6);
+ assert(std::distance(m.rbegin(), m.rend()) == 6);
+ assert(std::distance(cm.rbegin(), cm.rend()) == 6);
+ assert(std::distance(m.crbegin(), m.crend()) == 6);
+ assert(std::distance(cm.crbegin(), cm.crend()) == 6);
+ typename M::reverse_iterator i; // default-construct
+ ASSERT_SAME_TYPE(decltype(*i), const int&);
+ i = m.rbegin(); // move-assignment
+ typename M::const_reverse_iterator k = i; // converting constructor
+ assert(i == k); // comparison
+ for (int j = 5; j >= 0; --j, ++i) { // pre-increment
+ assert(*i == expected[j]);
+ }
+ assert(i == m.rend());
+ for (int j = 0; j <= 5; ++j) {
+ --i; // pre-decrement
+ assert(*i == expected[j]);
}
+ assert(i == m.rbegin());
+}
+
+constexpr bool test() {
+ test_one<std::vector<int>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<int>>();
+ test_one<MinSequenceContainer<int>>();
+ test_one<std::vector<int, min_allocator<int>>>();
+
{
// N3644 testing
using C = std::flat_multiset<int>;
@@ -80,10 +93,15 @@ void test() {
assert(!(ii1 != cii));
assert(!(cii != ii1));
}
+
+ return true;
}
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/clear.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/clear.pass.cpp
index 4d01ece7..088a883 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/clear.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/clear.pass.cpp
@@ -38,7 +38,7 @@ static_assert(NoExceptClear<std::flat_multiset<int, std::less<int>, ThrowOnMoveC
#endif
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using Key = typename KeyContainer::value_type;
using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
{
@@ -58,17 +58,25 @@ void test_one() {
}
}
-void test() {
+constexpr bool test() {
test_one<std::vector<int>>();
test_one<std::vector<int>>();
- test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<int>>();
test_one<MinSequenceContainer<int>>();
test_one<std::vector<int, min_allocator<int>>>();
test_one<std::vector<int, min_allocator<int>>>();
+
+ return true;
}
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace.pass.cpp
index 3ef1396..6772e17 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace.pass.cpp
@@ -28,7 +28,7 @@
#include "min_allocator.h"
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using Key = typename KeyContainer::value_type;
using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
using R = typename M::iterator;
@@ -91,7 +91,7 @@ void test_one() {
}
template <class KeyContainer>
-void test_emplaceable() {
+constexpr void test_emplaceable() {
using M = std::flat_multiset<Emplaceable, std::less<Emplaceable>, KeyContainer>;
using R = typename M::iterator;
@@ -111,16 +111,24 @@ void test_emplaceable() {
assert(*r == Emplaceable(1, 3.5));
}
-void test() {
+constexpr bool test() {
test_one<std::vector<int>>();
- test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<int>>();
test_one<MinSequenceContainer<int>>();
test_one<std::vector<int, min_allocator<int>>>();
test_emplaceable<std::vector<Emplaceable>>();
- test_emplaceable<std::deque<Emplaceable>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_emplaceable<std::deque<Emplaceable>>();
test_emplaceable<MinSequenceContainer<Emplaceable>>();
test_emplaceable<std::vector<Emplaceable, min_allocator<Emplaceable>>>();
+
+ return true;
}
void test_exception() {
@@ -130,6 +138,9 @@ void test_exception() {
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
test_exception();
return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace_hint.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace_hint.pass.cpp
index 41a2e9c..ec99a9f 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace_hint.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace_hint.pass.cpp
@@ -27,11 +27,11 @@
#include "../helpers.h"
struct CompareTensDigit {
- bool operator()(auto lhs, auto rhs) const { return (lhs / 10) < (rhs / 10); }
+ constexpr bool operator()(auto lhs, auto rhs) const { return (lhs / 10) < (rhs / 10); }
};
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using Key = typename KeyContainer::value_type;
using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
using R = M::iterator;
@@ -179,7 +179,6 @@ void test_one() {
assert(r == m.begin() + 2);
assert(m.size() == 7);
assert(*r == 23);
- assert(*std::next(r) == 20);
}
{
// hint incorrect and after the last duplicate
@@ -196,7 +195,7 @@ void test_one() {
}
template <class KeyContainer>
-void test_emplaceable() {
+constexpr void test_emplaceable() {
using M = std::flat_multiset<Emplaceable, std::less<Emplaceable>, KeyContainer>;
using R = M::iterator;
@@ -216,9 +215,12 @@ void test_emplaceable() {
assert(*r == Emplaceable(1, 3.5));
}
-void test() {
+constexpr bool test() {
test_one<std::vector<int>>();
- test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<int>>();
test_one<MinSequenceContainer<int>>();
test_one<std::vector<int, min_allocator<int>>>();
@@ -226,6 +228,8 @@ void test() {
test_emplaceable<std::vector<Emplaceable>>();
test_emplaceable<MinSequenceContainer<Emplaceable>>();
test_emplaceable<std::vector<Emplaceable, min_allocator<Emplaceable>>>();
+
+ return true;
}
void test_exception() {
@@ -235,6 +239,9 @@ void test_exception() {
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
test_exception();
return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter.pass.cpp
index 8418efa..f2cb151d 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter.pass.cpp
@@ -27,7 +27,7 @@
#include "min_allocator.h"
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using Key = typename KeyContainer::value_type;
using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
using I = M::iterator;
@@ -94,11 +94,16 @@ void test_one() {
assert(i8 == m.end());
}
-void test() {
+constexpr bool test() {
test_one<std::vector<int>>();
- test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<int>>();
test_one<MinSequenceContainer<int>>();
test_one<std::vector<int, min_allocator<int>>>();
+
+ return true;
}
void test_exception() {
@@ -108,6 +113,9 @@ void test_exception() {
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
test_exception();
return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter_iter.pass.cpp
index 2d54fef..7607892 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter_iter.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter_iter.pass.cpp
@@ -26,7 +26,7 @@
#include "min_allocator.h"
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using Key = typename KeyContainer::value_type;
using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
using I = M::iterator;
@@ -78,11 +78,16 @@ void test_one() {
assert(i5 == m.end());
}
-void test() {
+constexpr bool test() {
test_one<std::vector<int>>();
- test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<int>>();
test_one<MinSequenceContainer<int>>();
test_one<std::vector<int, min_allocator<int>>>();
+
+ return true;
}
void test_exception() {
@@ -92,6 +97,9 @@ void test_exception() {
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
test_exception();
return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key.pass.cpp
index 8175afa..7ddd3d8 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key.pass.cpp
@@ -26,7 +26,7 @@
#include "min_allocator.h"
template <class KeyContainer, class Compare = std::less<>>
-void test_one() {
+constexpr void test_one() {
using M = std::flat_multiset<int, Compare, KeyContainer>;
auto make = [](std::initializer_list<int> il) {
@@ -74,12 +74,17 @@ void test_one() {
assert(m.empty());
}
-void test() {
+constexpr bool test() {
test_one<std::vector<int>>();
test_one<std::vector<int>, std::greater<>>();
- test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<int>>();
test_one<MinSequenceContainer<int>>();
test_one<std::vector<int, min_allocator<int>>>();
+
+ return true;
}
void test_exception() {
@@ -94,6 +99,9 @@ void test_exception() {
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
test_exception();
return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key_transparent.pass.cpp
index a876549..0613744 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key_transparent.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key_transparent.pass.cpp
@@ -38,10 +38,10 @@ static_assert(!CanErase<const NonTransparentSet>);
template <class Key, class It>
struct HeterogeneousKey {
- explicit HeterogeneousKey(Key key, It it) : key_(key), it_(it) {}
- operator It() && { return it_; }
- auto operator<=>(Key key) const { return key_ <=> key; }
- friend bool operator<(const HeterogeneousKey&, const HeterogeneousKey&) {
+ constexpr explicit HeterogeneousKey(Key key, It it) : key_(key), it_(it) {}
+ constexpr operator It() && { return it_; }
+ constexpr auto operator<=>(Key key) const { return key_ <=> key; }
+ constexpr friend bool operator<(const HeterogeneousKey&, const HeterogeneousKey&) {
assert(false);
return false;
}
@@ -50,7 +50,7 @@ struct HeterogeneousKey {
};
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using Key = typename KeyContainer::value_type;
using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
@@ -70,7 +70,7 @@ void test_one() {
}
template <class KeyContainer>
-void test_transparent_comparator() {
+constexpr void test_transparent_comparator() {
using M = std::flat_multiset<std::string, TransparentComparator, KeyContainer>;
{
M m = {"alpha", "beta", "beta", "epsilon", "epsilon", "epsilon", "eta", "eta", "gamma"};
@@ -95,14 +95,20 @@ void test_transparent_comparator() {
}
}
-void test() {
+constexpr bool test() {
test_one<std::vector<int>>();
- test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<int>>();
test_one<MinSequenceContainer<int>>();
test_one<std::vector<int, min_allocator<int>>>();
test_transparent_comparator<std::vector<std::string>>();
- test_transparent_comparator<std::deque<std::string>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_transparent_comparator<std::deque<std::string>>();
test_transparent_comparator<MinSequenceContainer<std::string>>();
test_transparent_comparator<std::vector<std::string, min_allocator<std::string>>>();
@@ -146,6 +152,8 @@ void test() {
assert(n == 2);
assert((m == M{"alpha", "epsilon", "eta", "gamma"}));
}
+
+ return true;
}
void test_exception() {
@@ -159,6 +167,9 @@ void test_exception() {
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
test_exception();
return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/extract.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/extract.pass.cpp
index 8a66431..bb41ced 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/extract.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/extract.pass.cpp
@@ -33,7 +33,7 @@ static_assert(!CanExtract<std::flat_multiset<int> const&>);
static_assert(!CanExtract<std::flat_multiset<int> const&&>);
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using M = std::flat_multiset<int, std::less<int>, KeyContainer>;
{
M m = M({1, 1, 3});
@@ -55,9 +55,12 @@ void test_one() {
}
}
-void test() {
+constexpr bool test() {
test_one<std::vector<int>>();
- test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<int>>();
test_one<MinSequenceContainer<int>>();
test_one<std::vector<int, min_allocator<int>>>();
@@ -70,6 +73,8 @@ void test() {
check_invariant(m);
LIBCPP_ASSERT(m.empty());
}
+
+ return true;
}
void test_exception() {
@@ -96,6 +101,9 @@ void test_exception() {
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
test_exception();
return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_cv.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_cv.pass.cpp
index eeb1bdd..5128a40 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_cv.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_cv.pass.cpp
@@ -23,7 +23,7 @@
#include "min_allocator.h"
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using Key = typename KeyContainer::value_type;
using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
using R = typename M::iterator;
@@ -61,11 +61,16 @@ void test_one() {
assert(*r == 1);
}
-void test() {
+constexpr bool test() {
test_one<std::vector<int>>();
- test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<int>>();
test_one<MinSequenceContainer<int>>();
test_one<std::vector<int, min_allocator<int>>>();
+
+ return true;
}
void test_exception() {
@@ -79,6 +84,9 @@ void test_exception() {
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
test_exception();
return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_initializer_list.pass.cpp
index 9c56d3b..f0b1eaf 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_initializer_list.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_initializer_list.pass.cpp
@@ -23,7 +23,7 @@
#include "min_allocator.h"
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using Key = typename KeyContainer::value_type;
using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
@@ -65,11 +65,16 @@ void test_one() {
}
}
-void test() {
+constexpr bool test() {
test_one<std::vector<int>>();
- test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<int>>();
test_one<MinSequenceContainer<int>>();
test_one<std::vector<int, min_allocator<int>>>();
+
+ return true;
}
void test_exception() {
@@ -84,6 +89,9 @@ void test_exception() {
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
test_exception();
return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_cv.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_cv.pass.cpp
index 61f00f5..55a77d5 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_cv.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_cv.pass.cpp
@@ -23,7 +23,7 @@
#include "min_allocator.h"
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using Key = typename KeyContainer::value_type;
using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
using R = typename M::iterator;
@@ -61,11 +61,16 @@ void test_one() {
assert(*r == 1);
}
-void test() {
+constexpr bool test() {
test_one<std::vector<int>>();
- test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<int>>();
test_one<MinSequenceContainer<int>>();
test_one<std::vector<int, min_allocator<int>>>();
+
+ return true;
}
void test_exception() {
@@ -80,6 +85,9 @@ void test_exception() {
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
test_exception();
return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_iter.pass.cpp
index 9381568..9b10bf3 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_iter.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_iter.pass.cpp
@@ -37,7 +37,7 @@ static_assert(!CanInsert<Set, int, int>);
static_assert(!CanInsert<Set, cpp20_input_iterator<int*>, cpp20_input_iterator<int*>>);
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using M = std::flat_multiset<int, std::less<int>, KeyContainer>;
int ar1[] = {
@@ -75,9 +75,12 @@ void test_one() {
assert(m == expected2);
}
-void test() {
+constexpr bool test() {
test_one<std::vector<int>>();
- test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<int>>();
test_one<MinSequenceContainer<int>>();
test_one<std::vector<int, min_allocator<int>>>();
{
@@ -86,6 +89,8 @@ void test() {
m.insert(v.begin(), v.end());
assert(std::ranges::equal(m, std::vector<int>{1, 2, 3, 4}));
}
+
+ return true;
}
void test_exception() {
@@ -95,6 +100,9 @@ void test_exception() {
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
test_exception();
return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_rv.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_rv.pass.cpp
index 9976c04c..8bbc6c8 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_rv.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_rv.pass.cpp
@@ -22,7 +22,7 @@
#include "test_macros.h"
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using Key = typename KeyContainer::value_type;
using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
using V = Key;
@@ -59,15 +59,22 @@ void test_one() {
assert(*r == V(1));
}
-void test() {
+constexpr bool test() {
test_one<std::vector<int>>();
test_one<std::vector<MoveOnly>>();
- test_one<std::deque<int>>();
- test_one<std::deque<MoveOnly>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ {
+ test_one<std::deque<int>>();
+ test_one<std::deque<MoveOnly>>();
+ }
test_one<MinSequenceContainer<int>>();
test_one<MinSequenceContainer<MoveOnly>>();
test_one<std::vector<int, min_allocator<int>>>();
test_one<std::vector<MoveOnly, min_allocator<MoveOnly>>>();
+
+ return true;
}
void test_exception() {
@@ -82,6 +89,9 @@ void test_exception() {
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
test_exception();
return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_range.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_range.pass.cpp
index 566be39..a9d8f7e 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_range.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_range.pass.cpp
@@ -39,7 +39,7 @@ static_assert(!CanInsertRange<Set, std::ranges::subrange<std::pair<int, int>*>>)
static_assert(!CanInsertRange<Set, std::ranges::subrange<std::pair<short, short>*>>);
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using Key = typename KeyContainer::value_type;
{
@@ -72,9 +72,12 @@ void test_one() {
}
}
-void test() {
+constexpr bool test() {
test_one<std::vector<int>>();
- test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<int>>();
test_one<MinSequenceContainer<int>>();
test_one<std::vector<int, min_allocator<int>>>();
{
@@ -85,6 +88,8 @@ void test() {
MoveOnly expected[] = {1, 1, 3, 4, 5};
assert(std::ranges::equal(m, expected));
}
+
+ return true;
}
void test_exception() {
@@ -94,6 +99,9 @@ void test_exception() {
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
test_exception();
return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_rv.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_rv.pass.cpp
index 9328c42..67f3036 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_rv.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_rv.pass.cpp
@@ -25,7 +25,7 @@
#include "../helpers.h"
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using Key = typename KeyContainer::value_type;
using M = std::flat_multiset<Key, TransparentComparator, KeyContainer>;
using R = typename M::iterator;
@@ -63,15 +63,22 @@ void test_one() {
assert(*r == V(1));
}
-void test() {
+constexpr bool test() {
test_one<std::vector<int>>();
test_one<std::vector<MoveOnly>>();
- test_one<std::deque<int>>();
- test_one<std::deque<MoveOnly>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ {
+ test_one<std::deque<int>>();
+ test_one<std::deque<MoveOnly>>();
+ }
test_one<MinSequenceContainer<int>>();
test_one<MinSequenceContainer<MoveOnly>>();
test_one<std::vector<int, min_allocator<int>>>();
test_one<std::vector<MoveOnly, min_allocator<MoveOnly>>>();
+
+ return true;
}
void test_exception() {
@@ -86,6 +93,9 @@ void test_exception() {
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
test_exception();
return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_initializer_list.pass.cpp
index 11af199..81b7e4e 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_initializer_list.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_initializer_list.pass.cpp
@@ -23,7 +23,7 @@
#include "min_allocator.h"
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using Key = typename KeyContainer::value_type;
using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
{
@@ -42,11 +42,16 @@ void test_one() {
}
}
-void test() {
+constexpr bool test() {
test_one<std::vector<int>>();
- test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<int>>();
test_one<MinSequenceContainer<int>>();
test_one<std::vector<int, min_allocator<int>>>();
+
+ return true;
}
void test_exception() {
@@ -61,6 +66,9 @@ void test_exception() {
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
test_exception();
return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_iter_iter.pass.cpp
index 07b62d0..bfb2307 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_iter_iter.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_iter_iter.pass.cpp
@@ -36,7 +36,7 @@ static_assert(!CanInsert<Set, std::sorted_equivalent_t, int, int>);
static_assert(!CanInsert<Set, std::sorted_equivalent_t, cpp20_input_iterator<int*>, cpp20_input_iterator<int*>>);
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using Key = typename KeyContainer::value_type;
using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
@@ -60,11 +60,16 @@ void test_one() {
assert(m == expected2);
}
-void test() {
+constexpr bool test() {
test_one<std::vector<int>>();
- test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<int>>();
test_one<MinSequenceContainer<int>>();
test_one<std::vector<int, min_allocator<int>>>();
+
+ return true;
}
void test_exception() {
@@ -76,6 +81,9 @@ void test_exception() {
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
test_exception();
return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/replace.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/replace.pass.cpp
index 5fe6138..3c74cf6 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/replace.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/replace.pass.cpp
@@ -31,7 +31,7 @@ static_assert(CanReplace<Set, std::vector<int>>);
static_assert(!CanReplace<Set, const std::vector<int>&>);
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using Key = typename KeyContainer::value_type;
using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
{
@@ -53,11 +53,16 @@ void test_one() {
}
}
-void test() {
+constexpr bool test() {
test_one<std::vector<int>>();
- test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<int>>();
test_one<MinSequenceContainer<int>>();
test_one<std::vector<int, min_allocator<int>>>();
+
+ return true;
}
void test_exception() {
@@ -82,6 +87,9 @@ void test_exception() {
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
test_exception();
return 0;
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_free.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_free.pass.cpp
index 2e3ed02..241f2cf 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_free.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_free.pass.cpp
@@ -38,7 +38,7 @@ static_assert(NoExceptAdlSwap<std::flat_multiset<int, std::less<int>, ThrowOnMov
#endif
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using Key = typename KeyContainer::value_type;
using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
@@ -84,15 +84,23 @@ void test_one() {
}
}
-void test() {
+constexpr bool test() {
test_one<std::vector<int>>();
- test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<int>>();
test_one<MinSequenceContainer<int>>();
test_one<std::vector<int, min_allocator<int>>>();
+
+ return true;
}
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_member.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_member.pass.cpp
index 1d0d915..7ad96ed 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_member.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_member.pass.cpp
@@ -37,7 +37,7 @@ static_assert(NoExceptMemberSwap<std::flat_multiset<int, std::less<int>, ThrowOn
#endif
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using Key = typename KeyContainer::value_type;
using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
{
@@ -82,15 +82,23 @@ void test_one() {
}
}
-void test() {
+constexpr bool test() {
test_one<std::vector<int>>();
- test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<int>>();
test_one<MinSequenceContainer<int>>();
test_one<std::vector<int, min_allocator<int>>>();
+
+ return true;
}
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.observers/comp.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.observers/comp.pass.cpp
index 4ca6451..74c92f3 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.observers/comp.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.observers/comp.pass.cpp
@@ -21,7 +21,7 @@
#include "test_macros.h"
-void test() {
+constexpr bool test() {
{
using M = std::flat_multiset<int>;
using Comp = std::less<int>; // the default
@@ -36,7 +36,7 @@ void test() {
assert(vc(1, 2));
assert(!vc(2, 1));
}
- {
+ if (!TEST_IS_CONSTANT_EVALUATED) {
using Comp = std::function<bool(int, int)>;
using M = std::flat_multiset<int, Comp>;
Comp comp = std::greater<int>();
@@ -67,10 +67,15 @@ void test() {
assert(vc(1, 2));
assert(!vc(2, 1));
}
+
+ return true;
}
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains.pass.cpp
index 00fda6c..a178dfd 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains.pass.cpp
@@ -23,7 +23,7 @@
#include "min_allocator.h"
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using Key = typename KeyContainer::value_type;
{
using M = std::flat_multiset<Key, std::less<>, KeyContainer>;
@@ -66,15 +66,23 @@ void test_one() {
}
}
-void test() {
+constexpr bool test() {
test_one<std::vector<int>>();
- test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<int>>();
test_one<MinSequenceContainer<int>>();
test_one<std::vector<int, min_allocator<int>>>();
+
+ return true;
}
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains_transparent.pass.cpp
index abee2b1..3222762 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains_transparent.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains_transparent.pass.cpp
@@ -35,7 +35,7 @@ static_assert(!CanContains<NonTransparentSet>);
static_assert(!CanContains<const NonTransparentSet>);
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using Key = typename KeyContainer::value_type;
using M = std::flat_multiset<Key, TransparentComparator, KeyContainer>;
@@ -60,9 +60,12 @@ void test_one() {
}
}
-void test() {
+constexpr bool test() {
test_one<std::vector<std::string>>();
- test_one<std::deque<std::string>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<std::string>>();
test_one<MinSequenceContainer<std::string>>();
test_one<std::vector<std::string, min_allocator<std::string>>>();
@@ -82,10 +85,15 @@ void test() {
assert(m.contains("beta"));
assert(!m.contains("charlie"));
}
+
+ return true;
}
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count.pass.cpp
index 1752dab..8b034df 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count.pass.cpp
@@ -23,7 +23,7 @@
#include "min_allocator.h"
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using Key = typename KeyContainer::value_type;
using S = typename KeyContainer::size_type;
@@ -66,15 +66,23 @@ void test_one() {
}
}
-void test() {
+constexpr bool test() {
test_one<std::vector<int>>();
- test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<int>>();
test_one<MinSequenceContainer<int>>();
test_one<std::vector<int, min_allocator<int>>>();
+
+ return true;
}
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count_transparent.pass.cpp
index a9160ae..a1a0d6b 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count_transparent.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count_transparent.pass.cpp
@@ -35,7 +35,7 @@ static_assert(!CanCount<NonTransparentSet>);
static_assert(!CanCount<const NonTransparentSet>);
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using Key = typename KeyContainer::value_type;
using M = std::flat_multiset<Key, TransparentComparator, KeyContainer>;
{
@@ -59,9 +59,12 @@ void test_one() {
}
}
-void test() {
+constexpr bool test() {
test_one<std::vector<std::string>>();
- test_one<std::deque<std::string>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<std::string>>();
test_one<MinSequenceContainer<std::string>>();
test_one<std::vector<std::string, min_allocator<std::string>>>();
@@ -81,10 +84,15 @@ void test() {
auto n = m.count("beta");
assert(n == 2);
}
+
+ return true;
}
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range.pass.cpp
index 54ae27e9..b105d19 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range.pass.cpp
@@ -24,7 +24,7 @@
#include "min_allocator.h"
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using Key = typename KeyContainer::value_type;
{
using M = std::flat_multiset<Key, std::less<>, KeyContainer>;
@@ -74,15 +74,23 @@ void test_one() {
}
}
-void test() {
+constexpr bool test() {
test_one<std::vector<int>>();
- test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<int>>();
test_one<MinSequenceContainer<int>>();
test_one<std::vector<int, min_allocator<int>>>();
+
+ return true;
}
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range_transparent.pass.cpp
index ae16ec1..65bff7a 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range_transparent.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range_transparent.pass.cpp
@@ -36,7 +36,7 @@ static_assert(!CanEqualRange<NonTransparentSet>);
static_assert(!CanEqualRange<const NonTransparentSet>);
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using Key = typename KeyContainer::value_type;
using M = std::flat_multiset<Key, TransparentComparator, KeyContainer>;
@@ -90,9 +90,12 @@ void test_one() {
}
}
-void test() {
+constexpr bool test() {
test_one<std::vector<std::string>>();
- test_one<std::deque<std::string>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<std::string>>();
test_one<MinSequenceContainer<std::string>>();
test_one<std::vector<std::string, min_allocator<std::string>>>();
@@ -113,10 +116,15 @@ void test() {
assert(first == m.begin() + 1);
assert(last == m.begin() + 3);
}
+
+ return true;
}
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find.pass.cpp
index 49386a6..bc9a439 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find.pass.cpp
@@ -25,7 +25,7 @@
#include "min_allocator.h"
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using Key = typename KeyContainer::value_type;
using M = std::flat_multiset<Key, std::less<>, KeyContainer>;
{
@@ -50,15 +50,23 @@ void test_one() {
}
}
-void test() {
+constexpr bool test() {
test_one<std::vector<int>>();
- test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<int>>();
test_one<MinSequenceContainer<int>>();
test_one<std::vector<int, min_allocator<int>>>();
+
+ return true;
}
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find_transparent.pass.cpp
index 9d0b75c..4c9c403 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find_transparent.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find_transparent.pass.cpp
@@ -36,7 +36,7 @@ static_assert(!CanFind<NonTransparentSet>);
static_assert(!CanFind<const NonTransparentSet>);
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using Key = typename KeyContainer::value_type;
using M = std::flat_multiset<Key, TransparentComparator, KeyContainer>;
@@ -77,9 +77,12 @@ void test_one() {
}
}
-void test() {
+constexpr bool test() {
test_one<std::vector<std::string>>();
- test_one<std::deque<std::string>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<std::string>>();
test_one<MinSequenceContainer<std::string>>();
test_one<std::vector<std::string, min_allocator<std::string>>>();
@@ -101,10 +104,15 @@ void test() {
auto it2 = m.find("charlie");
assert(it2 == m.end());
}
+
+ return true;
}
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound.pass.cpp
index ba41b82..07f0533 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound.pass.cpp
@@ -24,7 +24,7 @@
#include "min_allocator.h"
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using Key = typename KeyContainer::value_type;
{
using M = std::flat_multiset<Key, std::less<>, KeyContainer>;
@@ -66,15 +66,23 @@ void test_one() {
}
}
-void test() {
+constexpr bool test() {
test_one<std::vector<int>>();
- test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<int>>();
test_one<MinSequenceContainer<int>>();
test_one<std::vector<int, min_allocator<int>>>();
+
+ return true;
}
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound_transparent.pass.cpp
index c03fb27..e674c85 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound_transparent.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound_transparent.pass.cpp
@@ -36,7 +36,7 @@ static_assert(!CanLowerBound<NonTransparentSet>);
static_assert(!CanLowerBound<const NonTransparentSet>);
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using Key = typename KeyContainer::value_type;
using M = std::flat_multiset<Key, TransparentComparator, KeyContainer>;
@@ -83,9 +83,12 @@ void test_one() {
}
}
-void test() {
+constexpr bool test() {
test_one<std::vector<std::string>>();
- test_one<std::deque<std::string>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<std::string>>();
test_one<MinSequenceContainer<std::string>>();
test_one<std::vector<std::string, min_allocator<std::string>>>();
@@ -107,10 +110,15 @@ void test() {
auto it2 = m.lower_bound("charlie");
assert(it2 == m.begin() + 3);
}
+
+ return true;
}
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound.pass.cpp
index 7828f05..d4d1992 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound.pass.cpp
@@ -24,7 +24,7 @@
#include "min_allocator.h"
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using Key = typename KeyContainer::value_type;
{
using M = std::flat_multiset<Key, std::less<>, KeyContainer>;
@@ -67,15 +67,23 @@ void test_one() {
}
}
-void test() {
+constexpr bool test() {
test_one<std::vector<int>>();
- test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<int>>();
test_one<MinSequenceContainer<int>>();
test_one<std::vector<int, min_allocator<int>>>();
+
+ return true;
}
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound_transparent.pass.cpp
index de517fd..75140a7 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound_transparent.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound_transparent.pass.cpp
@@ -36,7 +36,7 @@ static_assert(!CanUpperBound<NonTransparentSet>);
static_assert(!CanUpperBound<const NonTransparentSet>);
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using Key = typename KeyContainer::value_type;
using M = std::flat_multiset<Key, TransparentComparator, KeyContainer>;
@@ -83,9 +83,12 @@ void test_one() {
}
}
-void test() {
+constexpr bool test() {
test_one<std::vector<std::string>>();
- test_one<std::deque<std::string>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<std::string>>();
test_one<MinSequenceContainer<std::string>>();
test_one<std::vector<std::string, min_allocator<std::string>>>();
@@ -105,10 +108,15 @@ void test() {
auto it = m.upper_bound("beta");
assert(it == m.begin() + 3);
}
+
+ return true;
}
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/helpers.h b/libcxx/test/std/containers/container.adaptors/flat.multiset/helpers.h
index e7ed8a0..82f91775 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/helpers.h
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/helpers.h
@@ -20,7 +20,7 @@
#include "test_macros.h"
template <class... Args>
-void check_invariant(const std::flat_multiset<Args...>& m) {
+constexpr void check_invariant(const std::flat_multiset<Args...>& m) {
assert(std::is_sorted(m.begin(), m.end(), m.key_comp()));
}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/op_compare.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/op_compare.pass.cpp
index 94f0f2b3..606cdfc 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.multiset/op_compare.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/op_compare.pass.cpp
@@ -31,7 +31,7 @@
#include "test_container_comparisons.h"
template <class KeyContainer>
-void test_one() {
+constexpr void test_one() {
using Key = typename KeyContainer::value_type;
{
@@ -64,9 +64,12 @@ void test_one() {
}
}
-void test() {
+constexpr bool test() {
test_one<std::vector<int>>();
- test_one<std::deque<int>>();
+#ifndef __cpp_lib_constexpr_deque
+ if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
+ test_one<std::deque<int>>();
test_one<MinSequenceContainer<int>>();
test_one<std::vector<int, min_allocator<int>>>();
@@ -81,7 +84,7 @@ void test() {
{
// Comparisons use value_type's native operators, not the comparator
struct StrongComp {
- bool operator()(double a, double b) const { return std::strong_order(a, b) < 0; }
+ constexpr bool operator()(double a, double b) const { return std::strong_order(a, b) < 0; }
};
using C = std::flat_multiset<double, StrongComp>;
C s1 = {1};
@@ -96,10 +99,15 @@ void test() {
assert(s1 != s2);
assert((s1 <=> s2) == std::partial_ordering::unordered);
}
+
+ return true;
}
int main(int, char**) {
test();
+#if TEST_STD_VER >= 26
+ static_assert(test());
+#endif
return 0;
}
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/flat_map.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/flat_map.version.compile.pass.cpp
index 9c06eee..26c8e1bc 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/flat_map.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/flat_map.version.compile.pass.cpp
@@ -20,30 +20,50 @@
#if TEST_STD_VER < 14
+# ifdef __cpp_lib_constexpr_flat_map
+# error "__cpp_lib_constexpr_flat_map should not be defined before c++26"
+# endif
+
# ifdef __cpp_lib_flat_map
# error "__cpp_lib_flat_map should not be defined before c++23"
# endif
#elif TEST_STD_VER == 14
+# ifdef __cpp_lib_constexpr_flat_map
+# error "__cpp_lib_constexpr_flat_map should not be defined before c++26"
+# endif
+
# ifdef __cpp_lib_flat_map
# error "__cpp_lib_flat_map should not be defined before c++23"
# endif
#elif TEST_STD_VER == 17
+# ifdef __cpp_lib_constexpr_flat_map
+# error "__cpp_lib_constexpr_flat_map should not be defined before c++26"
+# endif
+
# ifdef __cpp_lib_flat_map
# error "__cpp_lib_flat_map should not be defined before c++23"
# endif
#elif TEST_STD_VER == 20
+# ifdef __cpp_lib_constexpr_flat_map
+# error "__cpp_lib_constexpr_flat_map should not be defined before c++26"
+# endif
+
# ifdef __cpp_lib_flat_map
# error "__cpp_lib_flat_map should not be defined before c++23"
# endif
#elif TEST_STD_VER == 23
+# ifdef __cpp_lib_constexpr_flat_map
+# error "__cpp_lib_constexpr_flat_map should not be defined before c++26"
+# endif
+
# ifndef __cpp_lib_flat_map
# error "__cpp_lib_flat_map should be defined in c++23"
# endif
@@ -53,6 +73,13 @@
#elif TEST_STD_VER > 23
+# ifndef __cpp_lib_constexpr_flat_map
+# error "__cpp_lib_constexpr_flat_map should be defined in c++26"
+# endif
+# if __cpp_lib_constexpr_flat_map != 202502L
+# error "__cpp_lib_constexpr_flat_map should have the value 202502L in c++26"
+# endif
+
# ifndef __cpp_lib_flat_map
# error "__cpp_lib_flat_map should be defined in c++26"
# endif
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/flat_set.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/flat_set.version.compile.pass.cpp
index 5985bdc..b29da9f 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/flat_set.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/flat_set.version.compile.pass.cpp
@@ -20,30 +20,50 @@
#if TEST_STD_VER < 14
+# ifdef __cpp_lib_constexpr_flat_set
+# error "__cpp_lib_constexpr_flat_set should not be defined before c++26"
+# endif
+
# ifdef __cpp_lib_flat_set
# error "__cpp_lib_flat_set should not be defined before c++23"
# endif
#elif TEST_STD_VER == 14
+# ifdef __cpp_lib_constexpr_flat_set
+# error "__cpp_lib_constexpr_flat_set should not be defined before c++26"
+# endif
+
# ifdef __cpp_lib_flat_set
# error "__cpp_lib_flat_set should not be defined before c++23"
# endif
#elif TEST_STD_VER == 17
+# ifdef __cpp_lib_constexpr_flat_set
+# error "__cpp_lib_constexpr_flat_set should not be defined before c++26"
+# endif
+
# ifdef __cpp_lib_flat_set
# error "__cpp_lib_flat_set should not be defined before c++23"
# endif
#elif TEST_STD_VER == 20
+# ifdef __cpp_lib_constexpr_flat_set
+# error "__cpp_lib_constexpr_flat_set should not be defined before c++26"
+# endif
+
# ifdef __cpp_lib_flat_set
# error "__cpp_lib_flat_set should not be defined before c++23"
# endif
#elif TEST_STD_VER == 23
+# ifdef __cpp_lib_constexpr_flat_set
+# error "__cpp_lib_constexpr_flat_set should not be defined before c++26"
+# endif
+
# ifndef __cpp_lib_flat_set
# error "__cpp_lib_flat_set should be defined in c++23"
# endif
@@ -53,6 +73,13 @@
#elif TEST_STD_VER > 23
+# ifndef __cpp_lib_constexpr_flat_set
+# error "__cpp_lib_constexpr_flat_set should be defined in c++26"
+# endif
+# if __cpp_lib_constexpr_flat_set != 202502L
+# error "__cpp_lib_constexpr_flat_set should have the value 202502L in c++26"
+# endif
+
# ifndef __cpp_lib_flat_set
# error "__cpp_lib_flat_set should be defined in c++26"
# endif
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
index 05af1fb..a9552c2 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
@@ -204,6 +204,14 @@
# error "__cpp_lib_constexpr_dynamic_alloc should not be defined before c++20"
# endif
+# ifdef __cpp_lib_constexpr_flat_map
+# error "__cpp_lib_constexpr_flat_map should not be defined before c++26"
+# endif
+
+# ifdef __cpp_lib_constexpr_flat_set
+# error "__cpp_lib_constexpr_flat_set should not be defined before c++26"
+# endif
+
# ifdef __cpp_lib_constexpr_forward_list
# error "__cpp_lib_constexpr_forward_list should not be defined before c++26"
# endif
@@ -1116,6 +1124,14 @@
# error "__cpp_lib_constexpr_dynamic_alloc should not be defined before c++20"
# endif
+# ifdef __cpp_lib_constexpr_flat_map
+# error "__cpp_lib_constexpr_flat_map should not be defined before c++26"
+# endif
+
+# ifdef __cpp_lib_constexpr_flat_set
+# error "__cpp_lib_constexpr_flat_set should not be defined before c++26"
+# endif
+
# ifdef __cpp_lib_constexpr_forward_list
# error "__cpp_lib_constexpr_forward_list should not be defined before c++26"
# endif
@@ -2130,6 +2146,14 @@
# error "__cpp_lib_constexpr_dynamic_alloc should not be defined before c++20"
# endif
+# ifdef __cpp_lib_constexpr_flat_map
+# error "__cpp_lib_constexpr_flat_map should not be defined before c++26"
+# endif
+
+# ifdef __cpp_lib_constexpr_flat_set
+# error "__cpp_lib_constexpr_flat_set should not be defined before c++26"
+# endif
+
# ifdef __cpp_lib_constexpr_forward_list
# error "__cpp_lib_constexpr_forward_list should not be defined before c++26"
# endif
@@ -3384,6 +3408,14 @@
# error "__cpp_lib_constexpr_dynamic_alloc should have the value 201907L in c++20"
# endif
+# ifdef __cpp_lib_constexpr_flat_map
+# error "__cpp_lib_constexpr_flat_map should not be defined before c++26"
+# endif
+
+# ifdef __cpp_lib_constexpr_flat_set
+# error "__cpp_lib_constexpr_flat_set should not be defined before c++26"
+# endif
+
# ifdef __cpp_lib_constexpr_forward_list
# error "__cpp_lib_constexpr_forward_list should not be defined before c++26"
# endif
@@ -4860,6 +4892,14 @@
# error "__cpp_lib_constexpr_dynamic_alloc should have the value 201907L in c++23"
# endif
+# ifdef __cpp_lib_constexpr_flat_map
+# error "__cpp_lib_constexpr_flat_map should not be defined before c++26"
+# endif
+
+# ifdef __cpp_lib_constexpr_flat_set
+# error "__cpp_lib_constexpr_flat_set should not be defined before c++26"
+# endif
+
# ifdef __cpp_lib_constexpr_forward_list
# error "__cpp_lib_constexpr_forward_list should not be defined before c++26"
# endif
@@ -6549,6 +6589,20 @@
# error "__cpp_lib_constexpr_dynamic_alloc should have the value 201907L in c++26"
# endif
+# ifndef __cpp_lib_constexpr_flat_map
+# error "__cpp_lib_constexpr_flat_map should be defined in c++26"
+# endif
+# if __cpp_lib_constexpr_flat_map != 202502L
+# error "__cpp_lib_constexpr_flat_map should have the value 202502L in c++26"
+# endif
+
+# ifndef __cpp_lib_constexpr_flat_set
+# error "__cpp_lib_constexpr_flat_set should be defined in c++26"
+# endif
+# if __cpp_lib_constexpr_flat_set != 202502L
+# error "__cpp_lib_constexpr_flat_set should have the value 202502L in c++26"
+# endif
+
# ifndef __cpp_lib_constexpr_forward_list
# error "__cpp_lib_constexpr_forward_list should be defined in c++26"
# endif
diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py
index f6f2527..3d39130 100644
--- a/libcxx/utils/generate_feature_test_macro_components.py
+++ b/libcxx/utils/generate_feature_test_macro_components.py
@@ -369,6 +369,16 @@ feature_test_macros = [
"headers": ["memory"],
},
{
+ "name": "__cpp_lib_constexpr_flat_map",
+ "values": {"c++26": 202502},
+ "headers": ["flat_map"],
+ },
+ {
+ "name": "__cpp_lib_constexpr_flat_set",
+ "values": {"c++26": 202502},
+ "headers": ["flat_set"],
+ },
+ {
"name": "__cpp_lib_constexpr_forward_list",
"values": {"c++26": 202502},
"headers": ["forward_list"],
diff --git a/lld/MachO/Arch/X86_64.cpp b/lld/MachO/Arch/X86_64.cpp
index a7c4b45..111c4d9 100644
--- a/lld/MachO/Arch/X86_64.cpp
+++ b/lld/MachO/Arch/X86_64.cpp
@@ -104,7 +104,7 @@ int64_t X86_64::getEmbeddedAddend(MemoryBufferRef mb, uint64_t offset,
void X86_64::relocateOne(uint8_t *loc, const Reloc &r, uint64_t value,
uint64_t relocVA) const {
if (r.pcrel) {
- uint64_t pc = relocVA + (1 << r.length) + pcrelOffset(r.type);
+ uint64_t pc = relocVA + (1ull << r.length) + pcrelOffset(r.type);
value -= pc;
}
diff --git a/lld/MachO/InputSection.cpp b/lld/MachO/InputSection.cpp
index b173e14..2b2d28e 100644
--- a/lld/MachO/InputSection.cpp
+++ b/lld/MachO/InputSection.cpp
@@ -348,6 +348,9 @@ WordLiteralInputSection::WordLiteralInputSection(const Section &section,
}
uint64_t WordLiteralInputSection::getOffset(uint64_t off) const {
+ if (off >= data.size())
+ fatal(toString(this) + ": offset is outside the section");
+
auto *osec = cast<WordLiteralSection>(parent);
const uintptr_t buf = reinterpret_cast<uintptr_t>(data.data());
switch (sectionType(getFlags())) {
diff --git a/lld/test/MachO/invalid/bad-offsets.s b/lld/test/MachO/invalid/bad-offsets.s
new file mode 100644
index 0000000..e1244ee
--- /dev/null
+++ b/lld/test/MachO/invalid/bad-offsets.s
@@ -0,0 +1,45 @@
+## Test that we properly detect and report out-of-bounds offsets in literal sections.
+## We're intentionally testing fatal errors (for malformed input files), and
+## fatal errors aren't supported for testing when main is run twice.
+# XFAIL: main-run-twice
+
+# REQUIRES: x86
+# RUN: rm -rf %t; split-file %s %t
+
+## Test WordLiteralInputSection bounds checking
+# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/word-literal.s -o %t/word-literal.o
+# RUN: not %lld -dylib %t/word-literal.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=WORD
+
+## Test CStringInputSection bounds checking
+# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/cstring.s -o %t/cstring.o
+# RUN: not %lld -dylib %t/cstring.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=CSTRING
+
+# WORD: error: {{.*}}word-literal.o:(__literal4): offset is outside the section
+# CSTRING: error: {{.*}}cstring.o:(__cstring): offset is outside the section
+
+#--- word-literal.s
+.section __TEXT,__literal4,4byte_literals
+L_literal:
+ .long 0x01020304
+
+.text
+.globl _main
+_main:
+ # We use a subtractor expression to force a section relocation. Symbol relocations
+ # don't trigger the error.
+ .long L_literal - _main + 4
+
+.subsections_via_symbols
+
+#--- cstring.s
+## Create a cstring section with a reference that points past the end
+.cstring
+L_str:
+ .asciz "foo"
+
+.text
+.globl _main
+_main:
+ .long L_str - _main + 4
+
+.subsections_via_symbols \ No newline at end of file
diff --git a/llvm/docs/CommandGuide/llvm-config.rst b/llvm/docs/CommandGuide/llvm-config.rst
index 63658d0..1c5c9c7 100644
--- a/llvm/docs/CommandGuide/llvm-config.rst
+++ b/llvm/docs/CommandGuide/llvm-config.rst
@@ -126,6 +126,11 @@ OPTIONS
Print the installation prefix for LLVM.
+**--quote-paths**
+
+ Quote and escape paths when needed, most notably when a quote, space, backslash,
+ or dollar sign character is present in the path.
+
**--shared-mode**
Print how the provided components can be collectively linked (`shared` or `static`).
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 54c7d0f..3c089b5 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -668,7 +668,7 @@ representation is not just an integer address are called "non-integral".
Non-integral pointers have at least one of the following three properties:
* the pointer representation contains non-address bits
-* the pointer representation is unstable (may changed at any time in a
+* the pointer representation is unstable (may change at any time in a
target-specific way)
* the pointer representation has external state
@@ -757,7 +757,7 @@ The following restrictions apply to IR level optimization passes:
The ``inttoptr`` instruction does not recreate the external state and therefore
it is target dependent whether it can be used to create a dereferenceable
-pointer. In general passes should assume that the result of such an inttoptr
+pointer. In general passes should assume that the result of such an ``inttoptr``
is not dereferenceable. For example, on CHERI targets an ``inttoptr`` will
yield a capability with the external state (the validity tag bit) set to zero,
which will cause any dereference to trap.
@@ -784,7 +784,7 @@ be performed as loads and stores of the correct type since stores of other
types may not propagate the external data.
Therefore it is not legal to convert an existing load/store (or a
``llvm.memcpy`` / ``llvm.memmove`` intrinsic) of pointer types with external
-state to a load/store of an integer type with same bitwidth, as that may drop
+state to a load/store of an integer type with the same bitwidth, as that may drop
the external state.
@@ -806,7 +806,7 @@ Global variables can optionally specify a :ref:`linkage type <linkage>`.
Either global variable definitions or declarations may have an explicit section
to be placed in and may have an optional explicit alignment specified. If there
is a mismatch between the explicit or inferred section information for the
-variable declaration and its definition the resulting behavior is undefined.
+variable declaration and its definition, the resulting behavior is undefined.
A variable may be defined as a global ``constant``, which indicates that
the contents of the variable will **never** be modified (enabling better
@@ -1334,7 +1334,7 @@ Currently, only the following parameter attributes are defined:
The byval type argument indicates the in-memory value type.
The byval attribute also supports specifying an alignment with the
- align attribute. It indicates the alignment of the stack slot to
+ ``align`` attribute. It indicates the alignment of the stack slot to
form and the known alignment of the pointer specified to the call
site. If the alignment is not specified, then the code generator
makes a target-specific assumption.
@@ -1355,7 +1355,7 @@ Currently, only the following parameter attributes are defined:
This is not a valid attribute for return values.
- The alignment for an ``byref`` parameter can be explicitly
+ The alignment for a ``byref`` parameter can be explicitly
specified by combining it with the ``align`` attribute, similar to
``byval``. If the alignment is not specified, then the code generator
makes a target-specific assumption.
@@ -1382,7 +1382,7 @@ Currently, only the following parameter attributes are defined:
The preallocated attribute requires a type argument.
The preallocated attribute also supports specifying an alignment with the
- align attribute. It indicates the alignment of the stack slot to
+ ``align`` attribute. It indicates the alignment of the stack slot to
form and the known alignment of the pointer specified to the call
site. If the alignment is not specified, then the code generator
makes a target-specific assumption.
@@ -1550,7 +1550,7 @@ Currently, only the following parameter attributes are defined:
``nonnull``
This indicates that the parameter or return pointer is not null. This
- attribute may only be applied to pointer typed parameters. This is not
+ attribute may only be applied to pointer-typed parameters. This is not
checked or enforced by LLVM; if the parameter or return pointer is null,
:ref:`poison value <poisonvalues>` is returned or passed instead.
The ``nonnull`` attribute should be combined with the ``noundef`` attribute
@@ -1558,7 +1558,7 @@ Currently, only the following parameter attributes are defined:
``dereferenceable(<n>)``
This indicates that the parameter or return pointer is dereferenceable. This
- attribute may only be applied to pointer typed parameters. A pointer that
+ attribute may only be applied to pointer-typed parameters. A pointer that
is dereferenceable can be loaded from speculatively without a risk of
trapping. The number of bytes known to be dereferenceable must be provided
in parentheses. It is legal for the number of bytes to be less than the
@@ -1584,7 +1584,7 @@ Currently, only the following parameter attributes are defined:
implies that a pointer is at least one of ``dereferenceable(<n>)``
or ``null`` (i.e., it may be both ``null`` and
``dereferenceable(<n>)``). This attribute may only be applied to
- pointer typed parameters.
+ pointer-typed parameters.
``swiftself``
This indicates that the parameter is the self/context parameter. This is not
@@ -1601,7 +1601,7 @@ Currently, only the following parameter attributes are defined:
``swifterror``
This attribute is motivated to model and optimize Swift error handling. It
- can be applied to a parameter with pointer to pointer type or a
+ can be applied to a parameter with pointer-to-pointer type or a
pointer-sized alloca. At the call site, the actual argument that corresponds
to a ``swifterror`` parameter has to come from a ``swifterror`` alloca or
the ``swifterror`` parameter of the caller. A ``swifterror`` value (either
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 49158fb..bfe6827 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -180,6 +180,10 @@ Changes to the LLVM tools
* Some code paths for supporting Python 2.7 in `llvm-lit` have been removed.
* Support for `%T` in lit has been removed.
+* `llvm-config` gained a new flag `--quote-paths` which quotes and escapes paths
+ emitted on stdout, to account for spaces or other special characters in paths.
+ ([#97305](https://github.com/llvm/llvm-project/pull/97305))
+
Changes to LLDB
---------------------------------
diff --git a/llvm/include/llvm/ADT/AddressRanges.h b/llvm/include/llvm/ADT/AddressRanges.h
index 79ba5d5..6ea097d 100644
--- a/llvm/include/llvm/ADT/AddressRanges.h
+++ b/llvm/include/llvm/ADT/AddressRanges.h
@@ -21,7 +21,7 @@ namespace llvm {
/// a start and an end address: [Start, End).
class AddressRange {
public:
- AddressRange() {}
+ AddressRange() = default;
AddressRange(uint64_t S, uint64_t E) : Start(S), End(E) {
assert(Start <= End);
}
diff --git a/llvm/include/llvm/ADT/StringMap.h b/llvm/include/llvm/ADT/StringMap.h
index 01cbf2d3..7901365 100644
--- a/llvm/include/llvm/ADT/StringMap.h
+++ b/llvm/include/llvm/ADT/StringMap.h
@@ -302,7 +302,7 @@ public:
if (FindInRHS == RHS.end())
return false;
- if constexpr (!std::is_same_v<ValueTy, std::nullopt_t>) {
+ if constexpr (!std::is_same_v<ValueTy, EmptyStringSetTag>) {
if (!(KeyValue.getValue() == FindInRHS->getValue()))
return false;
}
diff --git a/llvm/include/llvm/ADT/StringMapEntry.h b/llvm/include/llvm/ADT/StringMapEntry.h
index 21be5ec..b0a3c8c 100644
--- a/llvm/include/llvm/ADT/StringMapEntry.h
+++ b/llvm/include/llvm/ADT/StringMapEntry.h
@@ -21,6 +21,9 @@
namespace llvm {
+/// The "value type" of StringSet represented as an empty struct.
+struct EmptyStringSetTag {};
+
/// StringMapEntryBase - Shared base class of StringMapEntry instances.
class StringMapEntryBase {
size_t keyLength;
@@ -85,14 +88,13 @@ public:
};
template <>
-class StringMapEntryStorage<std::nullopt_t> : public StringMapEntryBase {
+class StringMapEntryStorage<EmptyStringSetTag> : public StringMapEntryBase {
public:
- explicit StringMapEntryStorage(size_t keyLength,
- std::nullopt_t = std::nullopt)
+ explicit StringMapEntryStorage(size_t keyLength, EmptyStringSetTag = {})
: StringMapEntryBase(keyLength) {}
StringMapEntryStorage(StringMapEntryStorage &entry) = delete;
- std::nullopt_t getValue() const { return std::nullopt; }
+ EmptyStringSetTag getValue() const { return {}; }
};
/// StringMapEntry - This is used to represent one value that is inserted into
diff --git a/llvm/include/llvm/ADT/StringSet.h b/llvm/include/llvm/ADT/StringSet.h
index c8be3f2..dc154af 100644
--- a/llvm/include/llvm/ADT/StringSet.h
+++ b/llvm/include/llvm/ADT/StringSet.h
@@ -22,8 +22,8 @@ namespace llvm {
/// StringSet - A wrapper for StringMap that provides set-like functionality.
template <class AllocatorTy = MallocAllocator>
-class StringSet : public StringMap<std::nullopt_t, AllocatorTy> {
- using Base = StringMap<std::nullopt_t, AllocatorTy>;
+class StringSet : public StringMap<EmptyStringSetTag, AllocatorTy> {
+ using Base = StringMap<EmptyStringSetTag, AllocatorTy>;
public:
StringSet() = default;
diff --git a/llvm/include/llvm/ADT/StringSwitch.h b/llvm/include/llvm/ADT/StringSwitch.h
index 98685de..4868153 100644
--- a/llvm/include/llvm/ADT/StringSwitch.h
+++ b/llvm/include/llvm/ADT/StringSwitch.h
@@ -173,6 +173,7 @@ public:
return CasesLowerImpl(CaseStrings, Value);
}
+ [[deprecated("Pass cases in std::initializer_list instead")]]
StringSwitch &CasesLower(StringLiteral S0, StringLiteral S1, T Value) {
return CasesLowerImpl({S0, S1}, Value);
}
diff --git a/llvm/include/llvm/Analysis/IR2Vec.h b/llvm/include/llvm/Analysis/IR2Vec.h
index 71055dd16..e3a0b3f 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -72,7 +72,7 @@ enum class IR2VecKind { Symbolic, FlowAware };
namespace ir2vec {
-extern llvm::cl::OptionCategory IR2VecCategory;
+LLVM_ABI extern llvm::cl::OptionCategory IR2VecCategory;
LLVM_ABI extern cl::opt<float> OpcWeight;
LLVM_ABI extern cl::opt<float> TypeWeight;
LLVM_ABI extern cl::opt<float> ArgWeight;
diff --git a/llvm/include/llvm/CodeGen/MIR2Vec.h b/llvm/include/llvm/CodeGen/MIR2Vec.h
index 44f009c..18b1290 100644
--- a/llvm/include/llvm/CodeGen/MIR2Vec.h
+++ b/llvm/include/llvm/CodeGen/MIR2Vec.h
@@ -73,7 +73,7 @@ namespace mir2vec {
class MIREmbedder;
class SymbolicMIREmbedder;
-extern llvm::cl::OptionCategory MIR2VecCategory;
+LLVM_ABI extern llvm::cl::OptionCategory MIR2VecCategory;
extern cl::opt<float> OpcWeight, CommonOperandWeight, RegOperandWeight;
using Embedding = ir2vec::Embedding;
@@ -154,14 +154,14 @@ class MIRVocabulary {
void buildRegisterOperandMapping();
/// Get canonical index for a machine opcode
- unsigned getCanonicalOpcodeIndex(unsigned Opcode) const;
+ LLVM_ABI unsigned getCanonicalOpcodeIndex(unsigned Opcode) const;
/// Get index for a common (non-register) machine operand
unsigned
getCommonOperandIndex(MachineOperand::MachineOperandType OperandType) const;
/// Get index for a register machine operand
- unsigned getRegisterOperandIndex(Register Reg) const;
+ LLVM_ABI unsigned getRegisterOperandIndex(Register Reg) const;
// Accessors for operand types
const Embedding &
@@ -192,7 +192,7 @@ class MIRVocabulary {
/// Get entity ID (flat index) for a common operand type
/// This is used for triplet generation
- unsigned getEntityIDForCommonOperand(
+ LLVM_ABI unsigned getEntityIDForCommonOperand(
MachineOperand::MachineOperandType OperandType) const {
return Layout.CommonOperandBase + getCommonOperandIndex(OperandType);
}
@@ -221,7 +221,7 @@ public:
bool IsPhysical = true) const;
/// Get the string key for a vocabulary entry at the given position
- std::string getStringKey(unsigned Pos) const;
+ LLVM_ABI std::string getStringKey(unsigned Pos) const;
unsigned getDimension() const { return Storage.getDimension(); }
@@ -268,7 +268,7 @@ public:
const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI);
/// Create a dummy vocabulary for testing purposes.
- static Expected<MIRVocabulary>
+ LLVM_ABI static Expected<MIRVocabulary>
createDummyVocabForTest(const TargetInstrInfo &TII,
const TargetRegisterInfo &TRI,
const MachineRegisterInfo &MRI, unsigned Dim = 1);
@@ -302,10 +302,10 @@ protected:
RegOperandWeight(mir2vec::RegOperandWeight) {}
/// Function to compute embeddings.
- Embedding computeEmbeddings() const;
+ LLVM_ABI Embedding computeEmbeddings() const;
/// Function to compute the embedding for a given machine basic block.
- Embedding computeEmbeddings(const MachineBasicBlock &MBB) const;
+ LLVM_ABI Embedding computeEmbeddings(const MachineBasicBlock &MBB) const;
/// Function to compute the embedding for a given machine instruction.
/// Specific to the kind of embeddings being computed.
@@ -316,9 +316,9 @@ public:
/// Factory method to create an Embedder object of the specified kind
/// Returns nullptr if the requested kind is not supported.
- static std::unique_ptr<MIREmbedder> create(MIR2VecKind Mode,
- const MachineFunction &MF,
- const MIRVocabulary &Vocab);
+ LLVM_ABI static std::unique_ptr<MIREmbedder>
+ create(MIR2VecKind Mode, const MachineFunction &MF,
+ const MIRVocabulary &Vocab);
/// Computes and returns the embedding for a given machine instruction MI in
/// the machine function MF.
@@ -369,7 +369,7 @@ class MIR2VecVocabProvider {
public:
MIR2VecVocabProvider(const MachineModuleInfo &MMI) : MMI(MMI) {}
- Expected<mir2vec::MIRVocabulary> getVocabulary(const Module &M);
+ LLVM_ABI Expected<mir2vec::MIRVocabulary> getVocabulary(const Module &M);
private:
Error readVocabulary(VocabMap &OpcVocab, VocabMap &CommonOperandVocab,
@@ -454,7 +454,7 @@ public:
};
/// Create a machine pass that prints MIR2Vec embeddings
-MachineFunctionPass *createMIR2VecPrinterLegacyPass(raw_ostream &OS);
+LLVM_ABI MachineFunctionPass *createMIR2VecPrinterLegacyPass(raw_ostream &OS);
} // namespace llvm
diff --git a/llvm/include/llvm/CodeGenTypes/LowLevelType.h b/llvm/include/llvm/CodeGenTypes/LowLevelType.h
index 4c1fe13..472a3f3 100644
--- a/llvm/include/llvm/CodeGenTypes/LowLevelType.h
+++ b/llvm/include/llvm/CodeGenTypes/LowLevelType.h
@@ -340,18 +340,18 @@ private:
/// valid encodings, SizeInBits/SizeOfElement must be larger than 0.
/// * Non-pointer scalar (isPointer == 0 && isVector == 0):
/// SizeInBits: 32;
- static const constexpr BitFieldInfo ScalarSizeFieldInfo{32, 29};
+ static constexpr BitFieldInfo ScalarSizeFieldInfo{32, 29};
/// * Pointer (isPointer == 1 && isVector == 0):
/// SizeInBits: 16;
/// AddressSpace: 24;
- static const constexpr BitFieldInfo PointerSizeFieldInfo{16, 45};
- static const constexpr BitFieldInfo PointerAddressSpaceFieldInfo{24, 21};
+ static constexpr BitFieldInfo PointerSizeFieldInfo{16, 45};
+ static constexpr BitFieldInfo PointerAddressSpaceFieldInfo{24, 21};
/// * Vector-of-non-pointer (isPointer == 0 && isVector == 1):
/// NumElements: 16;
/// SizeOfElement: 32;
/// Scalable: 1;
- static const constexpr BitFieldInfo VectorElementsFieldInfo{16, 5};
- static const constexpr BitFieldInfo VectorScalableFieldInfo{1, 0};
+ static constexpr BitFieldInfo VectorElementsFieldInfo{16, 5};
+ static constexpr BitFieldInfo VectorScalableFieldInfo{1, 0};
/// * Vector-of-pointer (isPointer == 1 && isVector == 1):
/// NumElements: 16;
/// SizeOfElement: 16;
diff --git a/llvm/include/llvm/DWARFLinker/StringPool.h b/llvm/include/llvm/DWARFLinker/StringPool.h
index d0f4e21..7838e3b 100644
--- a/llvm/include/llvm/DWARFLinker/StringPool.h
+++ b/llvm/include/llvm/DWARFLinker/StringPool.h
@@ -20,7 +20,7 @@ namespace dwarf_linker {
/// StringEntry keeps data of the string: the length, external offset
/// and a string body which is placed right after StringEntry.
-using StringEntry = StringMapEntry<std::nullopt_t>;
+using StringEntry = StringMapEntry<EmptyStringSetTag>;
class StringPoolEntryInfo {
public:
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/SymbolFilter.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/SymbolFilter.h
new file mode 100644
index 0000000..5170893
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/SymbolFilter.h
@@ -0,0 +1,173 @@
+//===- SymbolFilter.h - Utilities for Symbol Filtering ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_SHARED_SYMBOLFILTER_H
+#define LLVM_EXECUTIONENGINE_ORC_SHARED_SYMBOLFILTER_H
+
+#include "llvm/ExecutionEngine/Orc/Shared/SimplePackedSerialization.h"
+
+#include <cmath>
+#include <type_traits>
+#include <vector>
+
+namespace llvm {
+namespace orc {
+
+namespace shared {
+using SPSBloomFilter =
+ SPSTuple<bool, uint32_t, uint32_t, uint32_t, SPSSequence<uint64_t>>;
+}
+
+class BloomFilter {
+public:
+ using HashFunc = std::function<uint32_t(StringRef)>;
+
+ BloomFilter() = default;
+ BloomFilter(BloomFilter &&) noexcept = default;
+ BloomFilter &operator=(BloomFilter &&) noexcept = default;
+ BloomFilter(const BloomFilter &) = delete;
+ BloomFilter &operator=(const BloomFilter &) = delete;
+
+ BloomFilter(uint32_t SymbolCount, float FalsePositiveRate, HashFunc hashFn)
+ : HashFn(std::move(hashFn)) {
+ initialize(SymbolCount, FalsePositiveRate);
+ }
+ bool isInitialized() const { return Initialized; }
+
+ void add(StringRef Sym) {
+ assert(Initialized);
+ addHash(HashFn(Sym));
+ }
+
+ bool mayContain(StringRef Sym) const {
+ return !isEmpty() && testHash(HashFn(Sym));
+ }
+
+ bool isEmpty() const { return SymbolCount == 0; }
+
+private:
+ friend class shared::SPSSerializationTraits<shared::SPSBloomFilter,
+ BloomFilter>;
+ static constexpr uint32_t BitsPerEntry = 64;
+
+ bool Initialized = false;
+ uint32_t SymbolCount = 0;
+ uint32_t BloomSize = 0;
+ uint32_t BloomShift = 0;
+ std::vector<uint64_t> BloomTable;
+ HashFunc HashFn;
+
+ void initialize(uint32_t SymCount, float FalsePositiveRate) {
+ assert(SymCount > 0);
+ SymbolCount = SymCount;
+ Initialized = true;
+
+ float ln2 = std::log(2.0f);
+ float M = -1.0f * SymbolCount * std::log(FalsePositiveRate) / (ln2 * ln2);
+ BloomSize = static_cast<uint32_t>(std::ceil(M / BitsPerEntry));
+ BloomShift = std::min(6u, log2ceil(SymbolCount));
+ BloomTable.resize(BloomSize, 0);
+ }
+
+ void addHash(uint32_t Hash) {
+ uint32_t Hash2 = Hash >> BloomShift;
+ uint32_t N = (Hash / BitsPerEntry) % BloomSize;
+ uint64_t Mask =
+ (1ULL << (Hash % BitsPerEntry)) | (1ULL << (Hash2 % BitsPerEntry));
+ BloomTable[N] |= Mask;
+ }
+
+ bool testHash(uint32_t Hash) const {
+ uint32_t Hash2 = Hash >> BloomShift;
+ uint32_t N = (Hash / BitsPerEntry) % BloomSize;
+ uint64_t Mask =
+ (1ULL << (Hash % BitsPerEntry)) | (1ULL << (Hash2 % BitsPerEntry));
+ return (BloomTable[N] & Mask) == Mask;
+ }
+
+ static constexpr uint32_t log2ceil(uint32_t V) {
+ return V <= 1 ? 0 : 32 - countl_zero(V - 1);
+ }
+};
+
+class BloomFilterBuilder {
+public:
+ using HashFunc = BloomFilter::HashFunc;
+
+ BloomFilterBuilder() = default;
+
+ BloomFilterBuilder &setFalsePositiveRate(float Rate) {
+ assert(Rate > 0.0f && Rate < 1.0f);
+ FalsePositiveRate = Rate;
+ return *this;
+ }
+
+ BloomFilterBuilder &setHashFunction(HashFunc Fn) {
+ HashFn = std::move(Fn);
+ return *this;
+ }
+
+ BloomFilter build(ArrayRef<StringRef> Symbols) const {
+ assert(!Symbols.empty() && "Cannot build filter from empty symbol list.");
+ BloomFilter F(static_cast<uint32_t>(Symbols.size()), FalsePositiveRate,
+ HashFn);
+ for (const auto &Sym : Symbols)
+ F.add(Sym);
+
+ return F;
+ }
+
+private:
+ float FalsePositiveRate = 0.02f;
+ HashFunc HashFn = [](StringRef S) -> uint32_t {
+ uint32_t H = 5381;
+ for (char C : S)
+ H = ((H << 5) + H) + static_cast<uint8_t>(C); // H * 33 + C
+ return H;
+ };
+};
+
+namespace shared {
+
+template <> class SPSSerializationTraits<SPSBloomFilter, BloomFilter> {
+public:
+ static size_t size(const BloomFilter &Filter) {
+ return SPSBloomFilter::AsArgList::size(
+ Filter.Initialized, Filter.SymbolCount, Filter.BloomSize,
+ Filter.BloomShift, Filter.BloomTable);
+ }
+
+ static bool serialize(SPSOutputBuffer &OB, const BloomFilter &Filter) {
+ return SPSBloomFilter::AsArgList::serialize(
+ OB, Filter.Initialized, Filter.SymbolCount, Filter.BloomSize,
+ Filter.BloomShift, Filter.BloomTable);
+ }
+
+ static bool deserialize(SPSInputBuffer &IB, BloomFilter &Filter) {
+ bool IsInitialized;
+ uint32_t SymbolCount = 0, BloomSize = 0, BloomShift = 0;
+ std::vector<uint64_t> BloomTable;
+
+ if (!SPSBloomFilter::AsArgList::deserialize(
+ IB, IsInitialized, SymbolCount, BloomSize, BloomShift, BloomTable))
+ return false;
+
+ Filter.Initialized = IsInitialized;
+ Filter.SymbolCount = SymbolCount;
+ Filter.BloomSize = BloomSize;
+ Filter.BloomShift = BloomShift;
+ Filter.BloomTable = std::move(BloomTable);
+
+ return true;
+ }
+};
+
+} // end namespace shared
+} // end namespace orc
+} // end namespace llvm
+#endif // LLVM_EXECUTIONENGINE_ORC_SHARED_SYMBOLFILTER_H
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h
new file mode 100644
index 0000000..9182995
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h
@@ -0,0 +1,511 @@
+//===- LibraryResolver.h - Automatic Library Symbol Resolution -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides support for automatically searching symbols across
+// dynamic libraries that have not yet been loaded.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYRESOLVER_H
+#define LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYRESOLVER_H
+
+#include "llvm/ADT/FunctionExtras.h"
+#include "llvm/ExecutionEngine/Orc/Shared/SymbolFilter.h"
+#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h"
+#include "llvm/Support/Path.h"
+
+#include <atomic>
+#include <shared_mutex>
+#include <unordered_map>
+
+namespace llvm {
+namespace orc {
+
+/// Manages library metadata and state for symbol resolution.
+///
+/// Tracks libraries by load state and kind (user/system), and stores
+/// associated Bloom filters and hash maps to speed up symbol lookups.
+/// Thread-safe for concurrent access.
+class LibraryManager {
+public:
+ enum class LibState : uint8_t { Unloaded = 0, Loaded = 1, Queried = 2 };
+
+ class LibraryInfo {
+ public:
+ LibraryInfo(const LibraryInfo &) = delete;
+ LibraryInfo &operator=(const LibraryInfo &) = delete;
+
+ LibraryInfo(std::string FilePath, LibState S, PathType K,
+ std::optional<BloomFilter> Filter = std::nullopt)
+ : FilePath(std::move(FilePath)), S(S), K(K), Filter(std::move(Filter)) {
+ }
+
+ StringRef getBasePath() const { return sys::path::parent_path(FilePath); }
+ StringRef getFileName() const { return sys::path::filename(FilePath); }
+
+ std::string getFullPath() const { return FilePath; }
+
+ void setFilter(BloomFilter F) {
+ std::lock_guard<std::shared_mutex> Lock(Mtx);
+ if (Filter)
+ return;
+ Filter.emplace(std::move(F));
+ }
+
+ void ensureFilterBuilt(const BloomFilterBuilder &FB,
+ ArrayRef<StringRef> Symbols) {
+ std::lock_guard<std::shared_mutex> Lock(Mtx);
+ if (Filter)
+ return;
+ Filter.emplace(FB.build(Symbols));
+ }
+
+ bool mayContain(StringRef Symbol) const {
+ assert(hasFilter());
+ std::shared_lock<std::shared_mutex> Lock(Mtx);
+ return Filter->mayContain(Symbol);
+ }
+
+ bool hasFilter() const {
+ std::shared_lock<std::shared_mutex> Lock(Mtx);
+ return Filter.has_value();
+ }
+
+ LibState getState() const { return S.load(); }
+ PathType getKind() const { return K; }
+
+ void setState(LibState s) { S.store(s); }
+
+ bool operator==(const LibraryInfo &other) const {
+ return FilePath == other.FilePath;
+ }
+
+ private:
+ std::string FilePath;
+ std::atomic<LibState> S;
+ PathType K;
+ std::optional<BloomFilter> Filter;
+ mutable std::shared_mutex Mtx;
+ };
+
+ /// A read-only view of libraries filtered by state and kind.
+ ///
+ /// Lets you loop over only the libraries in a map that match a given State
+ /// and PathType.
+ class FilteredView {
+ public:
+ using Map = StringMap<std::shared_ptr<LibraryInfo>>;
+ using Iterator = typename Map::const_iterator;
+ class FilterIterator {
+ public:
+ FilterIterator(Iterator it_, Iterator end_, LibState S, PathType K)
+ : it(it_), end(end_), S(S), K(K) {
+ advance();
+ }
+
+ bool operator!=(const FilterIterator &other) const {
+ return it != other.it;
+ }
+
+ const std::shared_ptr<LibraryInfo> &operator*() const {
+ return it->second;
+ }
+
+ FilterIterator &operator++() {
+ ++it;
+ advance();
+ return *this;
+ }
+
+ private:
+ void advance() {
+ for (; it != end; ++it)
+ if (it->second->getState() == S && it->second->getKind() == K)
+ break;
+ }
+ Iterator it;
+ Iterator end;
+ LibState S;
+ PathType K;
+ };
+ FilteredView(Iterator begin, Iterator end, LibState s, PathType k)
+ : mapBegin(begin), mapEnd(end), state(s), kind(k) {}
+
+ FilterIterator begin() const {
+ return FilterIterator(mapBegin, mapEnd, state, kind);
+ }
+
+ FilterIterator end() const {
+ return FilterIterator(mapEnd, mapEnd, state, kind);
+ }
+
+ private:
+ Iterator mapBegin;
+ Iterator mapEnd;
+ LibState state;
+ PathType kind;
+ };
+
+private:
+ StringMap<std::shared_ptr<LibraryInfo>> Libraries;
+ mutable std::shared_mutex Mtx;
+
+public:
+ using LibraryVisitor = std::function<bool(const LibraryInfo &)>;
+
+ LibraryManager() = default;
+ ~LibraryManager() = default;
+
+ bool addLibrary(std::string Path, PathType Kind,
+ std::optional<BloomFilter> Filter = std::nullopt) {
+ std::unique_lock<std::shared_mutex> Lock(Mtx);
+ if (Libraries.count(Path) > 0)
+ return false;
+ Libraries.insert({std::move(Path),
+ std::make_shared<LibraryInfo>(Path, LibState::Unloaded,
+ Kind, std::move(Filter))});
+ return true;
+ }
+
+ bool hasLibrary(StringRef Path) const {
+ std::shared_lock<std::shared_mutex> Lock(Mtx);
+ if (Libraries.count(Path) > 0)
+ return true;
+ return false;
+ }
+
+ void removeLibrary(StringRef Path) {
+ std::unique_lock<std::shared_mutex> Lock(Mtx);
+ auto I = Libraries.find(Path);
+ if (I == Libraries.end())
+ return;
+ Libraries.erase(I);
+ }
+
+ void markLoaded(StringRef Path) {
+ std::unique_lock<std::shared_mutex> Lock(Mtx);
+ if (auto It = Libraries.find(Path); It != Libraries.end())
+ It->second->setState(LibState::Loaded);
+ }
+
+ void markQueried(StringRef Path) {
+ std::unique_lock<std::shared_mutex> Lock(Mtx);
+ if (auto It = Libraries.find(Path); It != Libraries.end())
+ It->second->setState(LibState::Queried);
+ }
+
+ std::shared_ptr<LibraryInfo> getLibrary(StringRef Path) {
+ std::shared_lock<std::shared_mutex> Lock(Mtx);
+ if (auto It = Libraries.find(Path); It != Libraries.end())
+ return It->second;
+ return nullptr;
+ }
+
+ FilteredView getView(LibState S, PathType K) const {
+ std::shared_lock<std::shared_mutex> Lock(Mtx);
+ return FilteredView(Libraries.begin(), Libraries.end(), S, K);
+ }
+
+ void forEachLibrary(const LibraryVisitor &visitor) const {
+ std::unique_lock<std::shared_mutex> Lock(Mtx);
+ for (const auto &[_, entry] : Libraries) {
+ if (!visitor(*entry))
+ break;
+ }
+ }
+
+ bool isLoaded(StringRef Path) const {
+ std::unique_lock<std::shared_mutex> Lock(Mtx);
+ if (auto It = Libraries.find(Path.str()); It != Libraries.end())
+ return It->second->getState() == LibState::Loaded;
+ return false;
+ }
+
+ bool isQueried(StringRef Path) const {
+ std::unique_lock<std::shared_mutex> Lock(Mtx);
+ if (auto It = Libraries.find(Path.str()); It != Libraries.end())
+ return It->second->getState() == LibState::Queried;
+ return false;
+ }
+
+ void clear() {
+ std::unique_lock<std::shared_mutex> Lock(Mtx);
+ Libraries.clear();
+ }
+};
+
+using LibraryInfo = LibraryManager::LibraryInfo;
+
+struct SearchPlanEntry {
+ LibraryManager::LibState State; // Loaded, Queried, Unloaded
+ PathType Type; // User, System
+};
+
+struct SearchPolicy {
+ std::vector<SearchPlanEntry> Plan;
+
+ static SearchPolicy defaultPlan() {
+ return {{{LibraryManager::LibState::Loaded, PathType::User},
+ {LibraryManager::LibState::Queried, PathType::User},
+ {LibraryManager::LibState::Unloaded, PathType::User},
+ {LibraryManager::LibState::Loaded, PathType::System},
+ {LibraryManager::LibState::Queried, PathType::System},
+ {LibraryManager::LibState::Unloaded, PathType::System}}};
+ }
+};
+
+struct SymbolEnumeratorOptions {
+ enum Filter : uint32_t {
+ None = 0,
+ IgnoreUndefined = 1 << 0,
+ IgnoreWeak = 1 << 1,
+ IgnoreIndirect = 1 << 2,
+ IgnoreHidden = 1 << 3,
+ IgnoreNonGlobal = 1 << 4
+ };
+
+ static SymbolEnumeratorOptions defaultOptions() {
+ return {Filter::IgnoreUndefined | Filter::IgnoreWeak |
+ Filter::IgnoreIndirect};
+ }
+ uint32_t FilterFlags = Filter::None;
+};
+
+struct SearchConfig {
+ SearchPolicy Policy;
+ SymbolEnumeratorOptions Options;
+
+ SearchConfig()
+ : Policy(SearchPolicy::defaultPlan()), // default plan
+ Options(SymbolEnumeratorOptions::defaultOptions()) {}
+};
+
+/// Scans libraries and resolves Symbols across user and system paths.
+///
+/// Supports symbol enumeration and filtering via SymbolEnumerator, and tracks
+/// symbol resolution results through SymbolQuery. Thread-safe and uses
+/// LibraryScanHelper for efficient path resolution and caching.
+class LibraryResolver {
+ friend class LibraryResolutionDriver;
+
+public:
+ class SymbolEnumerator {
+ public:
+ enum class EnumerateResult { Continue, Stop, Error };
+
+ using OnEachSymbolFn = std::function<EnumerateResult(StringRef Sym)>;
+
+ static bool enumerateSymbols(StringRef Path, OnEachSymbolFn OnEach,
+ const SymbolEnumeratorOptions &Opts);
+ };
+
+ /// Tracks a set of symbols and the libraries where they are resolved.
+ ///
+ /// SymbolQuery is used to keep track of which symbols have been resolved
+ /// to which libraries. It supports concurrent read/write access using a
+ /// shared mutex, allowing multiple readers or a single writer at a time.
+ class SymbolQuery {
+ public:
+ /// Holds the result for a single symbol.
+ struct Result {
+ std::string Name;
+ std::string ResolvedLibPath;
+ };
+
+ private:
+ mutable std::shared_mutex Mtx;
+ StringMap<Result> Results;
+ std::atomic<size_t> ResolvedCount = 0;
+
+ public:
+ explicit SymbolQuery(const std::vector<std::string> &Symbols) {
+ for (const auto &s : Symbols) {
+ if (!Results.contains(s))
+ Results.insert({s, Result{s, ""}});
+ }
+ }
+
+ SmallVector<StringRef> getUnresolvedSymbols() const {
+ SmallVector<StringRef> Unresolved;
+ std::shared_lock<std::shared_mutex> Lock(Mtx);
+ for (const auto &[name, res] : Results) {
+ if (res.ResolvedLibPath.empty())
+ Unresolved.push_back(name);
+ }
+ return Unresolved;
+ }
+
+ void resolve(StringRef Sym, const std::string &LibPath) {
+ std::unique_lock<std::shared_mutex> Lock(Mtx);
+ auto It = Results.find(Sym);
+ if (It != Results.end() && It->second.ResolvedLibPath.empty()) {
+ It->second.ResolvedLibPath = LibPath;
+ ResolvedCount.fetch_add(1, std::memory_order_relaxed);
+ }
+ }
+
+ bool allResolved() const {
+ return ResolvedCount.load(std::memory_order_relaxed) == Results.size();
+ }
+
+ bool hasUnresolved() const {
+ return ResolvedCount.load(std::memory_order_relaxed) < Results.size();
+ }
+
+ std::optional<StringRef> getResolvedLib(StringRef Sym) const {
+ std::shared_lock<std::shared_mutex> Lock(Mtx);
+ auto It = Results.find(Sym);
+ if (It != Results.end() && !It->second.ResolvedLibPath.empty())
+ return StringRef(It->second.ResolvedLibPath);
+ return std::nullopt;
+ }
+
+ bool isResolved(StringRef Sym) const {
+ std::shared_lock<std::shared_mutex> Lock(Mtx);
+ auto It = Results.find(Sym.str());
+ return It != Results.end() && !It->second.ResolvedLibPath.empty();
+ }
+
+ std::vector<const Result *> getAllResults() const {
+ std::shared_lock<std::shared_mutex> Lock(Mtx);
+ std::vector<const Result *> Out;
+ Out.reserve(Results.size());
+ for (const auto &[_, res] : Results)
+ Out.push_back(&res);
+ return Out;
+ }
+ };
+
+ struct Setup {
+ std::vector<std::string> BasePaths;
+ std::shared_ptr<LibraryPathCache> Cache;
+ std::shared_ptr<PathResolver> PResolver;
+
+ size_t ScanBatchSize = 0;
+
+ LibraryScanner::ShouldScanFn ShouldScanCall = [](StringRef) {
+ return true;
+ };
+
+ BloomFilterBuilder FilterBuilder = BloomFilterBuilder();
+
+ static Setup
+ create(std::vector<std::string> BasePaths,
+ std::shared_ptr<LibraryPathCache> existingCache = nullptr,
+ std::shared_ptr<PathResolver> existingResolver = nullptr,
+ LibraryScanner::ShouldScanFn customShouldScan = nullptr) {
+ Setup S;
+ S.BasePaths = std::move(BasePaths);
+
+ S.Cache =
+ existingCache ? existingCache : std::make_shared<LibraryPathCache>();
+
+ S.PResolver = existingResolver ? existingResolver
+ : std::make_shared<PathResolver>(S.Cache);
+
+ if (customShouldScan)
+ S.ShouldScanCall = std::move(customShouldScan);
+
+ return S;
+ }
+ };
+
+ LibraryResolver() = delete;
+ explicit LibraryResolver(const Setup &S);
+ ~LibraryResolver() = default;
+
+ using OnSearchComplete = unique_function<void(SymbolQuery &)>;
+
+ void dump() {
+ int i = 0;
+ LibMgr.forEachLibrary([&](const LibraryInfo &Lib) -> bool {
+ dbgs() << ++i << ". Library Path : " << Lib.getFullPath() << " -> \n\t\t:"
+ << " ({Type : ("
+ << (Lib.getKind() == PathType::User ? "User" : "System")
+ << ") }, { State : "
+ << (Lib.getState() == LibraryManager::LibState::Loaded
+ ? "Loaded"
+ : "Unloaded")
+ << "})\n";
+ return true;
+ });
+ }
+
+ void searchSymbolsInLibraries(std::vector<std::string> &SymList,
+ OnSearchComplete OnComplete,
+ const SearchConfig &Config = SearchConfig());
+
+private:
+ bool scanLibrariesIfNeeded(PathType K, size_t BatchSize = 0);
+ void resolveSymbolsInLibrary(LibraryInfo &Lib, SymbolQuery &Q,
+ const SymbolEnumeratorOptions &Opts);
+ bool
+ symbolExistsInLibrary(const LibraryInfo &Lib, StringRef Sym,
+ std::vector<std::string> *MatchedSymbols = nullptr);
+
+ bool symbolExistsInLibrary(const LibraryInfo &Lib, StringRef SymName,
+ std::vector<std::string> *AllSymbols,
+ const SymbolEnumeratorOptions &Opts);
+
+ std::shared_ptr<LibraryPathCache> LibPathCache;
+ std::shared_ptr<PathResolver> LibPathResolver;
+ LibraryScanHelper ScanHelper;
+ BloomFilterBuilder FB;
+ LibraryManager LibMgr;
+ LibraryScanner::ShouldScanFn ShouldScanCall;
+ size_t scanBatchSize;
+};
+
+using SymbolEnumerator = LibraryResolver::SymbolEnumerator;
+using SymbolQuery = LibraryResolver::SymbolQuery;
+using EnumerateResult = SymbolEnumerator::EnumerateResult;
+
+class LibraryResolutionDriver {
+public:
+ static std::unique_ptr<LibraryResolutionDriver>
+ create(const LibraryResolver::Setup &S);
+
+ void addScanPath(const std::string &Path, PathType Kind);
+ bool markLibraryLoaded(StringRef Path);
+ bool markLibraryUnLoaded(StringRef Path);
+ bool isLibraryLoaded(StringRef Path) const {
+ return LR->LibMgr.isLoaded(Path);
+ }
+
+ void resetAll() {
+ LR->LibMgr.clear();
+ LR->ScanHelper.resetToScan();
+ LR->LibPathCache->clear();
+ }
+
+ void scanAll(size_t BatchSize = 0) {
+ LR->scanLibrariesIfNeeded(PathType::User, BatchSize);
+ LR->scanLibrariesIfNeeded(PathType::System, BatchSize);
+ }
+
+ void scan(PathType PK, size_t BatchSize = 0) {
+ LR->scanLibrariesIfNeeded(PK, BatchSize);
+ }
+
+ void resolveSymbols(std::vector<std::string> Symbols,
+ LibraryResolver::OnSearchComplete OnCompletion,
+ const SearchConfig &Config = SearchConfig());
+
+ ~LibraryResolutionDriver() = default;
+
+private:
+ LibraryResolutionDriver(std::unique_ptr<LibraryResolver> L)
+ : LR(std::move(L)) {}
+
+ std::unique_ptr<LibraryResolver> LR;
+};
+
+} // end namespace orc
+} // end namespace llvm
+
+#endif // LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYRESOLVER_H
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h
new file mode 100644
index 0000000..d1c2013
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h
@@ -0,0 +1,474 @@
+//===- LibraryScanner.h - Scanner for Shared Libraries ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides functionality for scanning dynamic (shared) libraries.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYSCANNER_H
+#define LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYSCANNER_H
+
+#include "llvm/ADT/FunctionExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/StringSaver.h"
+
+#include <atomic>
+#include <mutex>
+#include <queue>
+#include <shared_mutex>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+
+namespace llvm {
+namespace orc {
+
+class LibraryManager;
+
+class LibraryPathCache {
+ friend class PathResolver;
+
+public:
+ LibraryPathCache() = default;
+
+ void clear(bool isRealPathCache = false) {
+ std::unique_lock<std::shared_mutex> lock(Mtx);
+ Seen.clear();
+ if (isRealPathCache) {
+ RealPathCache.clear();
+#ifndef _WIN32
+ ReadlinkCache.clear();
+ LstatCache.clear();
+#endif
+ }
+ }
+
+ void markSeen(const std::string &CanonPath) {
+ std::unique_lock<std::shared_mutex> lock(Mtx);
+ Seen.insert(CanonPath);
+ }
+
+ bool hasSeen(StringRef CanonPath) const {
+ std::shared_lock<std::shared_mutex> lock(Mtx);
+ return Seen.contains(CanonPath);
+ }
+
+ bool hasSeenOrMark(StringRef CanonPath) {
+ std::string s = CanonPath.str();
+ {
+ std::shared_lock<std::shared_mutex> lock(Mtx);
+ if (Seen.contains(s))
+ return true;
+ }
+ {
+ std::unique_lock<std::shared_mutex> lock(Mtx);
+ Seen.insert(s);
+ }
+ return false;
+ }
+
+private:
+ mutable std::shared_mutex Mtx;
+
+ struct PathInfo {
+ std::string canonicalPath;
+ std::error_code ErrnoCode;
+ };
+
+ void insert_realpath(StringRef Path, const PathInfo &Info) {
+ std::unique_lock<std::shared_mutex> lock(Mtx);
+ RealPathCache.insert({Path, Info});
+ }
+
+ std::optional<PathInfo> read_realpath(StringRef Path) const {
+ std::shared_lock<std::shared_mutex> lock(Mtx);
+ auto It = RealPathCache.find(Path);
+ if (It != RealPathCache.end())
+ return It->second;
+
+ return std::nullopt;
+ }
+
+ StringSet<> Seen;
+ StringMap<PathInfo> RealPathCache;
+
+#ifndef _WIN32
+ StringMap<std::string> ReadlinkCache;
+ StringMap<mode_t> LstatCache;
+
+ void insert_link(StringRef Path, const std::string &s) {
+ std::unique_lock<std::shared_mutex> lock(Mtx);
+ ReadlinkCache.insert({Path, s});
+ }
+
+ std::optional<std::string> read_link(StringRef Path) const {
+ std::shared_lock<std::shared_mutex> lock(Mtx);
+ auto It = ReadlinkCache.find(Path);
+ if (It != ReadlinkCache.end())
+ return It->second;
+
+ return std::nullopt;
+ }
+
+ void insert_lstat(StringRef Path, mode_t m) {
+ std::unique_lock<std::shared_mutex> lock(Mtx);
+ LstatCache.insert({Path, m});
+ }
+
+ std::optional<mode_t> read_lstat(StringRef Path) const {
+ std::shared_lock<std::shared_mutex> lock(Mtx);
+ auto It = LstatCache.find(Path);
+ if (It != LstatCache.end())
+ return It->second;
+
+ return std::nullopt;
+ }
+
+#endif
+};
+
+/// Resolves file system paths with optional caching of results.
+///
+/// Supports lstat, readlink, and realpath operations. Can resolve paths
+/// relative to a base and handle symbolic links. Caches results to reduce
+/// repeated system calls when enabled.
+class PathResolver {
+private:
+ std::shared_ptr<LibraryPathCache> LibPathCache;
+
+public:
+ PathResolver(std::shared_ptr<LibraryPathCache> cache)
+ : LibPathCache(std::move(cache)) {}
+
+ std::optional<std::string> resolve(StringRef Path, std::error_code &ec) {
+ return realpathCached(Path, ec);
+ }
+#ifndef _WIN32
+ mode_t lstatCached(StringRef Path);
+ std::optional<std::string> readlinkCached(StringRef Path);
+#endif
+ std::optional<std::string> realpathCached(StringRef Path, std::error_code &ec,
+ StringRef base = "",
+ bool baseIsResolved = false,
+ long symloopLevel = 40);
+};
+
+/// Performs placeholder substitution in dynamic library paths.
+///
+/// Configures known placeholders (like @loader_path) and replaces them
+/// in input paths with their resolved values.
+class DylibSubstitutor {
+public:
+ void configure(StringRef loaderPath);
+
+ std::string substitute(StringRef input) const {
+ for (const auto &[ph, value] : Placeholders) {
+ if (input.starts_with_insensitive(ph))
+ return (Twine(value) + input.drop_front(ph.size())).str();
+ }
+ return input.str();
+ }
+
+private:
+ StringMap<std::string> Placeholders;
+};
+
+/// Validates and normalizes dynamic library paths.
+///
+/// Uses a `PathResolver` to resolve paths to their canonical form and
+/// checks whether they point to valid shared libraries.
+class DylibPathValidator {
+public:
+ DylibPathValidator(PathResolver &PR) : LibPathResolver(PR) {}
+
+ static bool isSharedLibrary(StringRef Path);
+
+ std::optional<std::string> normalize(StringRef Path) const {
+ std::error_code ec;
+ auto real = LibPathResolver.resolve(Path, ec);
+ if (!real || ec)
+ return std::nullopt;
+
+ return real;
+ }
+
+ /// Validate the given path as a shared library.
+ std::optional<std::string> validate(StringRef Path) const {
+ auto realOpt = normalize(Path);
+ if (!realOpt)
+ return std::nullopt;
+
+ if (!isSharedLibrary(*realOpt))
+ return std::nullopt;
+
+ return realOpt;
+ }
+
+private:
+ PathResolver &LibPathResolver;
+};
+
+enum class SearchPathType {
+ RPath,
+ UsrOrSys,
+ RunPath,
+};
+
+struct SearchPathConfig {
+ ArrayRef<StringRef> Paths;
+ SearchPathType type;
+};
+
+class SearchPathResolver {
+public:
+ SearchPathResolver(const SearchPathConfig &Cfg,
+ StringRef PlaceholderPrefix = "")
+ : Kind(Cfg.type), PlaceholderPrefix(PlaceholderPrefix) {
+ for (auto &path : Cfg.Paths)
+ Paths.emplace_back(path.str());
+ }
+
+ std::optional<std::string> resolve(StringRef libStem,
+ const DylibSubstitutor &Subst,
+ DylibPathValidator &Validator) const;
+ SearchPathType searchPathType() const { return Kind; }
+
+private:
+ std::vector<std::string> Paths;
+ SearchPathType Kind;
+ std::string PlaceholderPrefix;
+};
+
+class DylibResolverImpl {
+public:
+ DylibResolverImpl(DylibSubstitutor Substitutor, DylibPathValidator &Validator,
+ std::vector<SearchPathResolver> Resolvers)
+ : Substitutor(std::move(Substitutor)), Validator(Validator),
+ Resolvers(std::move(Resolvers)) {}
+
+ std::optional<std::string> resolve(StringRef Stem,
+ bool VariateLibStem = false) const;
+
+private:
+ std::optional<std::string> tryWithExtensions(StringRef libstem) const;
+
+ DylibSubstitutor Substitutor;
+ DylibPathValidator &Validator;
+ std::vector<SearchPathResolver> Resolvers;
+};
+
+class DylibResolver {
+public:
+ DylibResolver(DylibPathValidator &Validator) : Validator(Validator) {}
+
+ void configure(StringRef loaderPath,
+ ArrayRef<SearchPathConfig> SearchPathCfg) {
+ DylibSubstitutor Substitutor;
+ Substitutor.configure(loaderPath);
+
+ std::vector<SearchPathResolver> Resolvers;
+ for (const auto &cfg : SearchPathCfg) {
+ Resolvers.emplace_back(cfg,
+ cfg.type == SearchPathType::RPath ? "@rpath" : "");
+ }
+
+ impl_ = std::make_unique<DylibResolverImpl>(
+ std::move(Substitutor), Validator, std::move(Resolvers));
+ }
+
+ std::optional<std::string> resolve(StringRef libStem,
+ bool VariateLibStem = false) const {
+ if (!impl_)
+ return std::nullopt;
+ return impl_->resolve(libStem, VariateLibStem);
+ }
+
+ static std::string resolvelinkerFlag(StringRef libStem,
+ StringRef loaderPath) {
+ DylibSubstitutor Substitutor;
+ Substitutor.configure(loaderPath);
+ return Substitutor.substitute(libStem);
+ }
+
+private:
+ DylibPathValidator &Validator;
+ std::unique_ptr<DylibResolverImpl> impl_;
+};
+
+enum class PathType : uint8_t { User, System, Unknown };
+
+enum class ScanState : uint8_t { NotScanned, Scanning, Scanned };
+
+struct LibrarySearchPath {
+ std::string BasePath; // Canonical base directory path
+ PathType Kind; // User or System
+ std::atomic<ScanState> State;
+
+ LibrarySearchPath(std::string Base, PathType K)
+ : BasePath(std::move(Base)), Kind(K), State(ScanState::NotScanned) {}
+};
+
+/// Scans and tracks libraries for symbol resolution.
+///
+/// Maintains a list of library paths to scan, caches scanned units,
+/// and resolves paths canonically for consistent tracking.
+class LibraryScanHelper {
+public:
+ explicit LibraryScanHelper(const std::vector<std::string> &SPaths,
+ std::shared_ptr<LibraryPathCache> LibPathCache,
+ std::shared_ptr<PathResolver> LibPathResolver)
+ : LibPathCache(std::move(LibPathCache)),
+ LibPathResolver(std::move(LibPathResolver)) {
+ DEBUG_WITH_TYPE(
+ "orc", dbgs() << "LibraryScanHelper::LibraryScanHelper: base paths : "
+ << SPaths.size() << "\n";);
+ for (const auto &p : SPaths)
+ addBasePath(p);
+ }
+
+ void
+ addBasePath(const std::string &P,
+ PathType Kind =
+ PathType::Unknown); // Add a canonical directory for scanning
+ std::vector<std::shared_ptr<LibrarySearchPath>>
+ getNextBatch(PathType Kind, size_t batchSize);
+
+ bool leftToScan(PathType K) const;
+ void resetToScan();
+
+ bool isTrackedBasePath(StringRef P) const;
+ std::vector<std::shared_ptr<LibrarySearchPath>> getAllUnits() const;
+
+ SmallVector<StringRef> getSearchPaths() const {
+ SmallVector<StringRef> SearchPaths;
+ for (const auto &[_, SP] : LibSearchPaths)
+ SearchPaths.push_back(SP->BasePath);
+ return SearchPaths;
+ }
+
+ PathResolver &getPathResolver() const { return *LibPathResolver; }
+
+ LibraryPathCache &getCache() const { return *LibPathCache; }
+
+ bool hasSeenOrMark(StringRef P) const {
+ return LibPathCache->hasSeenOrMark(P);
+ }
+
+ std::optional<std::string> resolve(StringRef P, std::error_code &ec) const {
+ return LibPathResolver->resolve(P.str(), ec);
+ }
+
+private:
+ std::string resolveCanonical(StringRef P, std::error_code &ec) const;
+ PathType classifyKind(StringRef P) const;
+
+ mutable std::shared_mutex Mtx;
+ std::shared_ptr<LibraryPathCache> LibPathCache;
+ std::shared_ptr<PathResolver> LibPathResolver;
+
+ StringMap<std::shared_ptr<LibrarySearchPath>>
+ LibSearchPaths; // key: canonical path
+ std::deque<StringRef> UnscannedUsr;
+ std::deque<StringRef> UnscannedSys;
+};
+
+/// Loads an object file and provides access to it.
+///
+/// Owns the underlying `ObjectFile` and ensures it is valid.
+/// Any errors encountered during construction are stored and
+/// returned when attempting to access the file.
+class ObjectFileLoader {
+public:
+ /// Construct an object file loader from the given path.
+ explicit ObjectFileLoader(StringRef Path) {
+ auto ObjOrErr = loadObjectFileWithOwnership(Path);
+ if (ObjOrErr)
+ Obj = std::move(*ObjOrErr);
+ else {
+ consumeError(std::move(Err));
+ Err = ObjOrErr.takeError();
+ }
+ }
+
+ ObjectFileLoader(const ObjectFileLoader &) = delete;
+ ObjectFileLoader &operator=(const ObjectFileLoader &) = delete;
+
+ ObjectFileLoader(ObjectFileLoader &&) = default;
+ ObjectFileLoader &operator=(ObjectFileLoader &&) = default;
+
+ /// Get the loaded object file, or return an error if loading failed.
+ Expected<object::ObjectFile &> getObjectFile() {
+ if (Err)
+ return std::move(Err);
+ return *Obj.getBinary();
+ }
+
+ static bool isArchitectureCompatible(const object::ObjectFile &Obj);
+
+private:
+ object::OwningBinary<object::ObjectFile> Obj;
+ Error Err = Error::success();
+
+ static Expected<object::OwningBinary<object::ObjectFile>>
+ loadObjectFileWithOwnership(StringRef FilePath);
+};
+
+/// Scans libraries, resolves dependencies, and registers them.
+class LibraryScanner {
+public:
+ using ShouldScanFn = std::function<bool(StringRef)>;
+
+ LibraryScanner(
+ LibraryScanHelper &H, LibraryManager &LibMgr,
+ ShouldScanFn ShouldScanCall = [](StringRef path) { return true; })
+ : ScanHelper(H), LibMgr(LibMgr),
+ ShouldScanCall(std::move(ShouldScanCall)) {}
+
+ void scanNext(PathType Kind, size_t batchSize = 1);
+
+ /// Dependency info for a library.
+ struct LibraryDepsInfo {
+ llvm::BumpPtrAllocator Alloc;
+ llvm::StringSaver Saver{Alloc};
+
+ SmallVector<StringRef, 2> rpath;
+ SmallVector<StringRef, 2> runPath;
+ SmallVector<StringRef, 4> deps;
+ bool isPIE = false;
+
+ void addRPath(StringRef s) { rpath.push_back(Saver.save(s)); }
+
+ void addRunPath(StringRef s) { runPath.push_back(Saver.save(s)); }
+
+ void addDep(StringRef s) { deps.push_back(Saver.save(s)); }
+ };
+
+private:
+ LibraryScanHelper &ScanHelper;
+ LibraryManager &LibMgr;
+ ShouldScanFn ShouldScanCall;
+
+ std::optional<std::string> shouldScan(StringRef FilePath);
+ Expected<LibraryDepsInfo> extractDeps(StringRef FilePath);
+
+ void handleLibrary(StringRef P, PathType K, int level = 1);
+
+ void scanBaseDir(std::shared_ptr<LibrarySearchPath> U);
+};
+
+using LibraryDepsInfo = LibraryScanner::LibraryDepsInfo;
+
+} // end namespace orc
+} // end namespace llvm
+
+#endif // LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYSCANNER_H
diff --git a/llvm/include/llvm/IR/ConstantFold.h b/llvm/include/llvm/IR/ConstantFold.h
index f9f2b35..01bb128 100644
--- a/llvm/include/llvm/IR/ConstantFold.h
+++ b/llvm/include/llvm/IR/ConstantFold.h
@@ -26,42 +26,41 @@
#include <optional>
namespace llvm {
- template <typename T> class ArrayRef;
- class Value;
- class Constant;
- class Type;
+template <typename T> class ArrayRef;
+class Value;
+class Constant;
+class Type;
- // Constant fold various types of instruction...
- LLVM_ABI Constant *
- ConstantFoldCastInstruction(unsigned opcode, ///< The opcode of the cast
- Constant *V, ///< The source constant
- Type *DestTy ///< The destination type
- );
- LLVM_ABI Constant *ConstantFoldSelectInstruction(Constant *Cond, Constant *V1,
- Constant *V2);
- LLVM_ABI Constant *ConstantFoldExtractElementInstruction(Constant *Val,
- Constant *Idx);
- LLVM_ABI Constant *ConstantFoldInsertElementInstruction(Constant *Val,
- Constant *Elt,
- Constant *Idx);
- LLVM_ABI Constant *ConstantFoldShuffleVectorInstruction(Constant *V1,
- Constant *V2,
- ArrayRef<int> Mask);
- LLVM_ABI Constant *
- ConstantFoldExtractValueInstruction(Constant *Agg, ArrayRef<unsigned> Idxs);
- LLVM_ABI Constant *
- ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val,
- ArrayRef<unsigned> Idxs);
- LLVM_ABI Constant *ConstantFoldUnaryInstruction(unsigned Opcode, Constant *V);
- LLVM_ABI Constant *ConstantFoldBinaryInstruction(unsigned Opcode,
- Constant *V1, Constant *V2);
- LLVM_ABI Constant *
- ConstantFoldCompareInstruction(CmpInst::Predicate Predicate, Constant *C1,
- Constant *C2);
- LLVM_ABI Constant *
- ConstantFoldGetElementPtr(Type *Ty, Constant *C,
- std::optional<ConstantRange> InRange,
- ArrayRef<Value *> Idxs);
-} // End llvm namespace
+// Constant fold various types of instruction...
+LLVM_ABI Constant *
+ConstantFoldCastInstruction(unsigned opcode, ///< The opcode of the cast
+ Constant *V, ///< The source constant
+ Type *DestTy ///< The destination type
+);
+LLVM_ABI Constant *ConstantFoldSelectInstruction(Constant *Cond, Constant *V1,
+ Constant *V2);
+LLVM_ABI Constant *ConstantFoldExtractElementInstruction(Constant *Val,
+ Constant *Idx);
+LLVM_ABI Constant *ConstantFoldInsertElementInstruction(Constant *Val,
+ Constant *Elt,
+ Constant *Idx);
+LLVM_ABI Constant *ConstantFoldShuffleVectorInstruction(Constant *V1,
+ Constant *V2,
+ ArrayRef<int> Mask);
+LLVM_ABI Constant *ConstantFoldExtractValueInstruction(Constant *Agg,
+ ArrayRef<unsigned> Idxs);
+LLVM_ABI Constant *ConstantFoldInsertValueInstruction(Constant *Agg,
+ Constant *Val,
+ ArrayRef<unsigned> Idxs);
+LLVM_ABI Constant *ConstantFoldUnaryInstruction(unsigned Opcode, Constant *V);
+LLVM_ABI Constant *ConstantFoldBinaryInstruction(unsigned Opcode, Constant *V1,
+ Constant *V2);
+LLVM_ABI Constant *ConstantFoldCompareInstruction(CmpInst::Predicate Predicate,
+ Constant *C1, Constant *C2);
+LLVM_ABI Constant *
+ConstantFoldGetElementPtr(Type *Ty, Constant *C,
+ std::optional<ConstantRange> InRange,
+ ArrayRef<Value *> Idxs);
+} // namespace llvm
#endif
diff --git a/llvm/include/llvm/IR/DataLayout.h b/llvm/include/llvm/IR/DataLayout.h
index 56fc749..5445820 100644
--- a/llvm/include/llvm/IR/DataLayout.h
+++ b/llvm/include/llvm/IR/DataLayout.h
@@ -590,7 +590,7 @@ public:
///
/// This is the amount that alloca reserves for this type. For example,
/// returns 12 or 16 for x86_fp80, depending on alignment.
- TypeSize getTypeAllocSize(Type *Ty) const;
+ LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const;
/// Returns the offset in bits between successive objects of the
/// specified type, including alignment padding; always a multiple of 8.
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 719181a..2710853 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1334,15 +1334,8 @@ let TargetPrefix = "nvvm" in {
//
let IntrProperties = [IntrNoMem] in {
foreach ftz = ["", "_ftz"] in
- def int_nvvm_ex2_approx # ftz # _f : NVVMBuiltin,
- DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty]>;
-
- def int_nvvm_ex2_approx_d : NVVMBuiltin,
- DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty]>;
- def int_nvvm_ex2_approx_f16 :
- DefaultAttrsIntrinsic<[llvm_half_ty], [llvm_half_ty]>;
- def int_nvvm_ex2_approx_f16x2 :
- DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty]>;
+ def int_nvvm_ex2_approx # ftz :
+ DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
foreach ftz = ["", "_ftz"] in
def int_nvvm_lg2_approx # ftz # _f : NVVMBuiltin,
diff --git a/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h b/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h
index 3381e17..ccb77e7 100644
--- a/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h
+++ b/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h
@@ -79,7 +79,7 @@ struct CustomMappingTraits<
}
Args.push_back(Arg);
}
- io.mapRequired(Key.str().c_str(), V[Args]);
+ io.mapRequired(Key, V[Args]);
}
static void output(
IO &io,
@@ -91,7 +91,7 @@ struct CustomMappingTraits<
Key += ',';
Key += llvm::utostr(Arg);
}
- io.mapRequired(Key.c_str(), P.second);
+ io.mapRequired(Key, P.second);
}
}
};
@@ -122,11 +122,11 @@ struct CustomMappingTraits<std::map<uint64_t, WholeProgramDevirtResolution>> {
io.setError("key not an integer");
return;
}
- io.mapRequired(Key.str().c_str(), V[KeyInt]);
+ io.mapRequired(Key, V[KeyInt]);
}
static void output(IO &io, std::map<uint64_t, WholeProgramDevirtResolution> &V) {
for (auto &P : V)
- io.mapRequired(llvm::utostr(P.first).c_str(), P.second);
+ io.mapRequired(llvm::utostr(P.first), P.second);
}
};
@@ -215,7 +215,7 @@ namespace yaml {
template <> struct CustomMappingTraits<GlobalValueSummaryMapTy> {
static void inputOne(IO &io, StringRef Key, GlobalValueSummaryMapTy &V) {
std::vector<GlobalValueSummaryYaml> GVSums;
- io.mapRequired(Key.str().c_str(), GVSums);
+ io.mapRequired(Key, GVSums);
uint64_t KeyInt;
if (Key.getAsInteger(0, KeyInt)) {
io.setError("key not an integer");
@@ -290,7 +290,7 @@ template <> struct CustomMappingTraits<GlobalValueSummaryMapTy> {
}
}
if (!GVSums.empty())
- io.mapRequired(llvm::utostr(P.first).c_str(), GVSums);
+ io.mapRequired(llvm::utostr(P.first), GVSums);
}
}
static void fixAliaseeLinks(GlobalValueSummaryMapTy &V) {
@@ -313,12 +313,12 @@ template <> struct CustomMappingTraits<GlobalValueSummaryMapTy> {
template <> struct CustomMappingTraits<TypeIdSummaryMapTy> {
static void inputOne(IO &io, StringRef Key, TypeIdSummaryMapTy &V) {
TypeIdSummary TId;
- io.mapRequired(Key.str().c_str(), TId);
+ io.mapRequired(Key, TId);
V.insert({GlobalValue::getGUIDAssumingExternalLinkage(Key), {Key, TId}});
}
static void output(IO &io, TypeIdSummaryMapTy &V) {
for (auto &TidIter : V)
- io.mapRequired(TidIter.second.first.str().c_str(), TidIter.second.second);
+ io.mapRequired(TidIter.second.first, TidIter.second.second);
}
};
diff --git a/llvm/include/llvm/Object/SFrameParser.h b/llvm/include/llvm/Object/SFrameParser.h
index 3ce5d70..23298357 100644
--- a/llvm/include/llvm/Object/SFrameParser.h
+++ b/llvm/include/llvm/Object/SFrameParser.h
@@ -90,7 +90,7 @@ public:
uint32_t Idx, uint32_t Size, uint64_t Offset)
: Data(Data), FREType(FREType), Idx(Idx), Size(Size), Offset(Offset) {}
- Error inc();
+ LLVM_ABI Error inc();
const FrameRowEntry &operator*() const { return FRE; }
friend bool operator==(const FallibleFREIterator &LHS,
diff --git a/llvm/include/llvm/ProfileData/MemProfYAML.h b/llvm/include/llvm/ProfileData/MemProfYAML.h
index d66e16d..c55f780 100644
--- a/llvm/include/llvm/ProfileData/MemProfYAML.h
+++ b/llvm/include/llvm/ProfileData/MemProfYAML.h
@@ -141,7 +141,7 @@ template <> struct CustomMappingTraits<memprof::PortableMemInfoBlock> {
#define MIBEntryDef(NameTag, Name, Type) \
if (KeyStr == #Name) { \
uint64_t Value; \
- Io.mapRequired(KeyStr.str().c_str(), Value); \
+ Io.mapRequired(KeyStr, Value); \
MIB.Name = static_cast<Type>(Value); \
MIB.Schema.set(llvm::to_underlying(memprof::Meta::Name)); \
return; \
diff --git a/llvm/include/llvm/Support/ELFAttributeParser.h b/llvm/include/llvm/Support/ELFAttributeParser.h
index 97350ed..c2ad812 100644
--- a/llvm/include/llvm/Support/ELFAttributeParser.h
+++ b/llvm/include/llvm/Support/ELFAttributeParser.h
@@ -17,7 +17,7 @@ namespace llvm {
class ELFAttributeParser {
public:
- virtual ~ELFAttributeParser() {}
+ virtual ~ELFAttributeParser() = default;
virtual Error parse(ArrayRef<uint8_t> Section, llvm::endianness Endian) {
return llvm::Error::success();
diff --git a/llvm/include/llvm/Support/GraphWriter.h b/llvm/include/llvm/Support/GraphWriter.h
index 3bef75c..43d9b0c 100644
--- a/llvm/include/llvm/Support/GraphWriter.h
+++ b/llvm/include/llvm/Support/GraphWriter.h
@@ -128,7 +128,7 @@ public:
DTraits = DOTTraits(SN);
RenderUsingHTML = DTraits.renderNodesUsingHTML();
}
- virtual ~GraphWriterBase() {}
+ virtual ~GraphWriterBase() = default;
void writeGraph(const std::string &Title = "") {
// Output the header for the graph...
@@ -369,7 +369,7 @@ class GraphWriter : public GraphWriterBase<GraphType, GraphWriter<GraphType>> {
public:
GraphWriter(raw_ostream &o, const GraphType &g, bool SN)
: GraphWriterBase<GraphType, GraphWriter<GraphType>>(o, g, SN) {}
- ~GraphWriter() override {}
+ ~GraphWriter() override = default;
};
template <typename GraphType>
diff --git a/llvm/include/llvm/Support/JSON.h b/llvm/include/llvm/Support/JSON.h
index d8c6de4..a973c56 100644
--- a/llvm/include/llvm/Support/JSON.h
+++ b/llvm/include/llvm/Support/JSON.h
@@ -154,7 +154,7 @@ public:
LLVM_ABI const json::Array *getArray(StringRef K) const;
LLVM_ABI json::Array *getArray(StringRef K);
- friend bool operator==(const Object &LHS, const Object &RHS);
+ friend LLVM_ABI bool operator==(const Object &LHS, const Object &RHS);
};
LLVM_ABI bool operator==(const Object &LHS, const Object &RHS);
inline bool operator!=(const Object &LHS, const Object &RHS) {
diff --git a/llvm/include/llvm/Support/SourceMgr.h b/llvm/include/llvm/Support/SourceMgr.h
index 8320006..43f7e27 100644
--- a/llvm/include/llvm/Support/SourceMgr.h
+++ b/llvm/include/llvm/Support/SourceMgr.h
@@ -103,7 +103,7 @@ private:
public:
/// Create new source manager without support for include files.
- SourceMgr();
+ LLVM_ABI SourceMgr();
/// Create new source manager with the capability of finding include files
/// via the provided file system.
explicit SourceMgr(IntrusiveRefCntPtr<vfs::FileSystem> FS);
@@ -111,10 +111,10 @@ public:
SourceMgr &operator=(const SourceMgr &) = delete;
SourceMgr(SourceMgr &&);
SourceMgr &operator=(SourceMgr &&);
- ~SourceMgr();
+ LLVM_ABI ~SourceMgr();
IntrusiveRefCntPtr<vfs::FileSystem> getVirtualFileSystem() const;
- void setVirtualFileSystem(IntrusiveRefCntPtr<vfs::FileSystem> FS);
+ LLVM_ABI void setVirtualFileSystem(IntrusiveRefCntPtr<vfs::FileSystem> FS);
/// Return the include directories of this source manager.
ArrayRef<std::string> getIncludeDirs() const { return IncludeDirectories; }
diff --git a/llvm/include/llvm/Support/VirtualFileSystem.h b/llvm/include/llvm/Support/VirtualFileSystem.h
index c8911a0..dbd5a5c 100644
--- a/llvm/include/llvm/Support/VirtualFileSystem.h
+++ b/llvm/include/llvm/Support/VirtualFileSystem.h
@@ -1116,8 +1116,9 @@ protected:
/// Collect all pairs of <virtual path, real path> entries from the
/// \p VFS. This is used by the module dependency collector to forward
/// the entries into the reproducer output VFS YAML file.
-void collectVFSEntries(RedirectingFileSystem &VFS,
- SmallVectorImpl<YAMLVFSEntry> &CollectedEntries);
+LLVM_ABI void
+collectVFSEntries(RedirectingFileSystem &VFS,
+ SmallVectorImpl<YAMLVFSEntry> &CollectedEntries);
class YAMLVFSWriter {
std::vector<YAMLVFSEntry> Mappings;
diff --git a/llvm/include/llvm/Support/VirtualOutputBackend.h b/llvm/include/llvm/Support/VirtualOutputBackend.h
index 85caa021..78ed4b9b 100644
--- a/llvm/include/llvm/Support/VirtualOutputBackend.h
+++ b/llvm/include/llvm/Support/VirtualOutputBackend.h
@@ -32,7 +32,7 @@ namespace llvm::vfs {
/// If virtual functions are added here, also add them to \a
/// ProxyOutputBackend.
class OutputBackend : public RefCountedBase<OutputBackend> {
- virtual void anchor();
+ LLVM_ABI virtual void anchor();
public:
/// Get a backend that points to the same destination as this one but that
@@ -47,7 +47,7 @@ public:
/// have been customized).
///
/// Thread-safe.
- Expected<OutputFile>
+ LLVM_ABI Expected<OutputFile>
createFile(const Twine &Path,
std::optional<OutputConfig> Config = std::nullopt);
diff --git a/llvm/include/llvm/Support/VirtualOutputBackends.h b/llvm/include/llvm/Support/VirtualOutputBackends.h
index 219bc30..13a9611 100644
--- a/llvm/include/llvm/Support/VirtualOutputBackends.h
+++ b/llvm/include/llvm/Support/VirtualOutputBackends.h
@@ -77,14 +77,14 @@ private:
/// An output backend that creates files on disk, wrapping APIs in sys::fs.
class OnDiskOutputBackend : public OutputBackend {
- void anchor() override;
+ LLVM_ABI void anchor() override;
protected:
IntrusiveRefCntPtr<OutputBackend> cloneImpl() const override {
return clone();
}
- Expected<std::unique_ptr<OutputFileImpl>>
+ LLVM_ABI Expected<std::unique_ptr<OutputFileImpl>>
createFileImpl(StringRef Path, std::optional<OutputConfig> Config) override;
public:
diff --git a/llvm/include/llvm/Support/VirtualOutputError.h b/llvm/include/llvm/Support/VirtualOutputError.h
index 2293ff9..44590a1 100644
--- a/llvm/include/llvm/Support/VirtualOutputError.h
+++ b/llvm/include/llvm/Support/VirtualOutputError.h
@@ -43,7 +43,7 @@ public:
void log(raw_ostream &OS) const override;
// Used by ErrorInfo::classID.
- static char ID;
+ LLVM_ABI static char ID;
OutputError(const Twine &OutputPath, std::error_code EC)
: ErrorInfo<OutputError, ECError>(EC), OutputPath(OutputPath.str()) {
@@ -99,7 +99,7 @@ public:
void log(raw_ostream &OS) const override;
// Used by ErrorInfo::classID.
- static char ID;
+ LLVM_ABI static char ID;
TempFileOutputError(const Twine &TempPath, const Twine &OutputPath,
std::error_code EC)
diff --git a/llvm/include/llvm/Support/VirtualOutputFile.h b/llvm/include/llvm/Support/VirtualOutputFile.h
index dd50437..d53701c 100644
--- a/llvm/include/llvm/Support/VirtualOutputFile.h
+++ b/llvm/include/llvm/Support/VirtualOutputFile.h
@@ -80,13 +80,13 @@ public:
///
/// If there's an open proxy from \a createProxy(), calls \a discard() to
/// clean up temporaries followed by \a report_fatal_error().
- Error keep();
+ LLVM_ABI Error keep();
/// Discard an output, cleaning up any temporary state. Errors if clean-up
/// fails.
///
/// If it has already been closed, calls \a report_fatal_error().
- Error discard();
+ LLVM_ABI Error discard();
/// Discard the output when destroying it if it's still open, sending the
/// result to \a Handler.
@@ -98,7 +98,7 @@ public:
/// producer. Errors if there's already a proxy. The proxy must be deleted
/// before calling \a keep(). The proxy will crash if it's written to after
/// calling \a discard().
- Expected<std::unique_ptr<raw_pwrite_stream>> createProxy();
+ LLVM_ABI Expected<std::unique_ptr<raw_pwrite_stream>> createProxy();
bool hasOpenProxy() const { return OpenProxy; }
@@ -132,7 +132,7 @@ public:
private:
/// Destroy \a Impl. Reports fatal error if the file is open and there's no
/// handler from \a discardOnDestroy().
- void destroy();
+ LLVM_ABI void destroy();
OutputFile &moveFrom(OutputFile &O) {
Path = std::move(O.Path);
Impl = std::move(O.Impl);
diff --git a/llvm/include/llvm/Support/YAMLTraits.h b/llvm/include/llvm/Support/YAMLTraits.h
index 3d36f41..b53b28d 100644
--- a/llvm/include/llvm/Support/YAMLTraits.h
+++ b/llvm/include/llvm/Support/YAMLTraits.h
@@ -1921,12 +1921,12 @@ template <typename T> struct StdMapStringCustomMappingTraitsImpl {
using map_type = std::map<std::string, T>;
static void inputOne(IO &io, StringRef key, map_type &v) {
- io.mapRequired(key.str().c_str(), v[std::string(key)]);
+ io.mapRequired(key, v[std::string(key)]);
}
static void output(IO &io, map_type &v) {
for (auto &p : v)
- io.mapRequired(p.first.c_str(), p.second);
+ io.mapRequired(p.first, p.second);
}
};
diff --git a/llvm/include/llvm/Transforms/IPO/InferFunctionAttrs.h b/llvm/include/llvm/Transforms/IPO/InferFunctionAttrs.h
index 8addf49..272b960 100644
--- a/llvm/include/llvm/Transforms/IPO/InferFunctionAttrs.h
+++ b/llvm/include/llvm/Transforms/IPO/InferFunctionAttrs.h
@@ -23,7 +23,7 @@ class Module;
/// A pass which infers function attributes from the names and signatures of
/// function declarations in a module.
struct InferFunctionAttrsPass : PassInfoMixin<InferFunctionAttrsPass> {
- PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+ LLVM_ABI PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
};
}
diff --git a/llvm/include/llvm/Transforms/Instrumentation/SanitizerCoverage.h b/llvm/include/llvm/Transforms/Instrumentation/SanitizerCoverage.h
index a8a09fb..346e7f0 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/SanitizerCoverage.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/SanitizerCoverage.h
@@ -33,7 +33,7 @@ class FileSystem;
/// appends globals to llvm.compiler.used.
class SanitizerCoveragePass : public PassInfoMixin<SanitizerCoveragePass> {
public:
- explicit SanitizerCoveragePass(
+ LLVM_ABI explicit SanitizerCoveragePass(
SanitizerCoverageOptions Options = SanitizerCoverageOptions(),
IntrusiveRefCntPtr<vfs::FileSystem> VFS = nullptr,
const std::vector<std::string> &AllowlistFiles = {},
diff --git a/llvm/lib/BinaryFormat/MsgPackDocumentYAML.cpp b/llvm/lib/BinaryFormat/MsgPackDocumentYAML.cpp
index 3de3dcc..80b421d 100644
--- a/llvm/lib/BinaryFormat/MsgPackDocumentYAML.cpp
+++ b/llvm/lib/BinaryFormat/MsgPackDocumentYAML.cpp
@@ -209,12 +209,12 @@ template <> struct CustomMappingTraits<MapDocNode> {
static void inputOne(IO &IO, StringRef Key, MapDocNode &M) {
ScalarDocNode KeyObj = M.getDocument()->getNode();
KeyObj.fromString(Key, "");
- IO.mapRequired(Key.str().c_str(), M.getMap()[KeyObj]);
+ IO.mapRequired(Key, M.getMap()[KeyObj]);
}
static void output(IO &IO, MapDocNode &M) {
for (auto I : M.getMap()) {
- IO.mapRequired(I.first.toString().c_str(), I.second);
+ IO.mapRequired(I.first.toString(), I.second);
}
}
};
diff --git a/llvm/lib/CGData/OutlinedHashTreeRecord.cpp b/llvm/lib/CGData/OutlinedHashTreeRecord.cpp
index cc76063..2b6e2f0 100644
--- a/llvm/lib/CGData/OutlinedHashTreeRecord.cpp
+++ b/llvm/lib/CGData/OutlinedHashTreeRecord.cpp
@@ -37,7 +37,7 @@ template <> struct MappingTraits<HashNodeStable> {
template <> struct CustomMappingTraits<IdHashNodeStableMapTy> {
static void inputOne(IO &io, StringRef Key, IdHashNodeStableMapTy &V) {
HashNodeStable NodeStable;
- io.mapRequired(Key.str().c_str(), NodeStable);
+ io.mapRequired(Key, NodeStable);
unsigned Id;
if (Key.getAsInteger(0, Id)) {
io.setError("Id not an integer");
@@ -48,7 +48,7 @@ template <> struct CustomMappingTraits<IdHashNodeStableMapTy> {
static void output(IO &io, IdHashNodeStableMapTy &V) {
for (auto Iter = V.begin(); Iter != V.end(); ++Iter)
- io.mapRequired(utostr(Iter->first).c_str(), Iter->second);
+ io.mapRequired(utostr(Iter->first), Iter->second);
}
};
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index bf1abfe..58983cb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -1172,6 +1172,12 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) {
case ISD::FAKE_USE:
Res = SoftenFloatOp_FAKE_USE(N);
break;
+ case ISD::STACKMAP:
+ Res = SoftenFloatOp_STACKMAP(N, OpNo);
+ break;
+ case ISD::PATCHPOINT:
+ Res = SoftenFloatOp_PATCHPOINT(N, OpNo);
+ break;
}
// If the result is null, the sub-method took care of registering results etc.
@@ -1512,6 +1518,20 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FAKE_USE(SDNode *N) {
N->getOperand(0), Op1);
}
+SDValue DAGTypeLegalizer::SoftenFloatOp_STACKMAP(SDNode *N, unsigned OpNo) {
+ assert(OpNo > 1); // Because the first two arguments are guaranteed legal.
+ SmallVector<SDValue> NewOps(N->ops());
+ NewOps[OpNo] = GetSoftenedFloat(NewOps[OpNo]);
+ return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_PATCHPOINT(SDNode *N, unsigned OpNo) {
+ assert(OpNo >= 7);
+ SmallVector<SDValue> NewOps(N->ops());
+ NewOps[OpNo] = GetSoftenedFloat(NewOps[OpNo]);
+ return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+}
+
//===----------------------------------------------------------------------===//
// Float Result Expansion
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 9656a30..ede522e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -658,6 +658,8 @@ private:
SDValue SoftenFloatOp_ATOMIC_STORE(SDNode *N, unsigned OpNo);
SDValue SoftenFloatOp_FCOPYSIGN(SDNode *N);
SDValue SoftenFloatOp_FAKE_USE(SDNode *N);
+ SDValue SoftenFloatOp_STACKMAP(SDNode *N, unsigned OpNo);
+ SDValue SoftenFloatOp_PATCHPOINT(SDNode *N, unsigned OpNo);
//===--------------------------------------------------------------------===//
// Float Expansion Support: LegalizeFloatTypes.cpp
diff --git a/llvm/lib/CodeGenTypes/LowLevelType.cpp b/llvm/lib/CodeGenTypes/LowLevelType.cpp
index 4785f26..92b7fad 100644
--- a/llvm/lib/CodeGenTypes/LowLevelType.cpp
+++ b/llvm/lib/CodeGenTypes/LowLevelType.cpp
@@ -54,9 +54,3 @@ LLVM_DUMP_METHOD void LLT::dump() const {
dbgs() << '\n';
}
#endif
-
-const constexpr LLT::BitFieldInfo LLT::ScalarSizeFieldInfo;
-const constexpr LLT::BitFieldInfo LLT::PointerSizeFieldInfo;
-const constexpr LLT::BitFieldInfo LLT::PointerAddressSpaceFieldInfo;
-const constexpr LLT::BitFieldInfo LLT::VectorElementsFieldInfo;
-const constexpr LLT::BitFieldInfo LLT::VectorScalableFieldInfo;
diff --git a/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp b/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp
index 6c23ba8..23ab534 100644
--- a/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp
+++ b/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp
@@ -102,7 +102,8 @@ std::optional<CVType> LazyRandomTypeCollection::tryGetType(TypeIndex Index) {
return std::nullopt;
}
- assert(contains(Index));
+ if (!contains(Index))
+ return std::nullopt;
return Records[Index.toArrayIndex()].Type;
}
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt b/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt
index 9275586..ca8192b 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt
@@ -16,9 +16,11 @@ add_llvm_component_library(LLVMOrcTargetProcess
ExecutorSharedMemoryMapperService.cpp
DefaultHostBootstrapValues.cpp
ExecutorResolver.cpp
+ LibraryResolver.cpp
JITLoaderGDB.cpp
JITLoaderPerf.cpp
JITLoaderVTune.cpp
+ LibraryScanner.cpp
OrcRTBootstrap.cpp
RegisterEHFrames.cpp
SimpleExecutorDylibManager.cpp
@@ -36,6 +38,8 @@ add_llvm_component_library(LLVMOrcTargetProcess
LINK_COMPONENTS
${intel_jit_profiling}
+ BinaryFormat
+ Object
OrcShared
Support
TargetParser
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp
new file mode 100644
index 0000000..35da82a
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp
@@ -0,0 +1,370 @@
+//===- LibraryResolver.cpp - Library Resolution of Unresolved Symbols ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements resolution of unresolved symbols against dynamic libraries
+// discovered on disk.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h"
+#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h"
+
+#include "llvm/ADT/StringSet.h"
+
+#include "llvm/BinaryFormat/MachO.h"
+#include "llvm/Object/COFF.h"
+#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Object/MachO.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Error.h"
+
+#include <mutex>
+#include <thread>
+
+#define DEBUG_TYPE "orc-resolver"
+
+namespace llvm::orc {
+
+LibraryResolver::LibraryResolver(const LibraryResolver::Setup &S)
+ : LibPathCache(S.Cache ? S.Cache : std::make_shared<LibraryPathCache>()),
+ LibPathResolver(S.PResolver
+ ? S.PResolver
+ : std::make_shared<PathResolver>(LibPathCache)),
+ ScanHelper(S.BasePaths, LibPathCache, LibPathResolver),
+ FB(S.FilterBuilder), LibMgr(),
+ ShouldScanCall(S.ShouldScanCall ? S.ShouldScanCall
+ : [](StringRef) -> bool { return true; }),
+ scanBatchSize(S.ScanBatchSize) {
+
+ if (ScanHelper.getAllUnits().empty()) {
+ LLVM_DEBUG(dbgs() << "Warning: No base paths provided for scanning.\n");
+ }
+}
+
+std::unique_ptr<LibraryResolutionDriver>
+LibraryResolutionDriver::create(const LibraryResolver::Setup &S) {
+ auto LR = std::make_unique<LibraryResolver>(S);
+ return std::unique_ptr<LibraryResolutionDriver>(
+ new LibraryResolutionDriver(std::move(LR)));
+}
+
+void LibraryResolutionDriver::addScanPath(const std::string &Path, PathType K) {
+ LR->ScanHelper.addBasePath(Path, K);
+}
+
+bool LibraryResolutionDriver::markLibraryLoaded(StringRef Path) {
+ auto Lib = LR->LibMgr.getLibrary(Path);
+ if (!Lib)
+ return false;
+
+ Lib->setState(LibraryManager::LibState::Loaded);
+
+ return true;
+}
+
+bool LibraryResolutionDriver::markLibraryUnLoaded(StringRef Path) {
+ auto Lib = LR->LibMgr.getLibrary(Path);
+ if (!Lib)
+ return false;
+
+ Lib->setState(LibraryManager::LibState::Unloaded);
+
+ return true;
+}
+
+void LibraryResolutionDriver::resolveSymbols(
+ std::vector<std::string> Syms,
+ LibraryResolver::OnSearchComplete OnCompletion,
+ const SearchConfig &Config) {
+ LR->searchSymbolsInLibraries(Syms, std::move(OnCompletion), Config);
+}
+
+static bool shouldIgnoreSymbol(const object::SymbolRef &Sym,
+ uint32_t IgnoreFlags) {
+ Expected<uint32_t> FlagsOrErr = Sym.getFlags();
+ if (!FlagsOrErr) {
+ consumeError(FlagsOrErr.takeError());
+ return true;
+ }
+
+ uint32_t Flags = *FlagsOrErr;
+
+ using Filter = SymbolEnumeratorOptions;
+ if ((IgnoreFlags & Filter::IgnoreUndefined) &&
+ (Flags & object::SymbolRef::SF_Undefined))
+ return true;
+ if ((IgnoreFlags & Filter::IgnoreIndirect) &&
+ (Flags & object::SymbolRef::SF_Indirect))
+ return true;
+ if ((IgnoreFlags & Filter::IgnoreWeak) &&
+ (Flags & object::SymbolRef::SF_Weak))
+ return true;
+
+ return false;
+}
+
+bool SymbolEnumerator::enumerateSymbols(StringRef Path, OnEachSymbolFn OnEach,
+ const SymbolEnumeratorOptions &Opts) {
+ if (Path.empty())
+ return false;
+
+ ObjectFileLoader ObjLoader(Path);
+
+ auto ObjOrErr = ObjLoader.getObjectFile();
+ if (!ObjOrErr) {
+ std::string ErrMsg;
+ handleAllErrors(ObjOrErr.takeError(),
+ [&](const ErrorInfoBase &EIB) { ErrMsg = EIB.message(); });
+ LLVM_DEBUG(dbgs() << "Failed loading object file: " << Path
+ << "\nError: " << ErrMsg << "\n");
+ return false;
+ }
+
+ object::ObjectFile *Obj = &ObjOrErr.get();
+
+ auto processSymbolRange =
+ [&](object::ObjectFile::symbol_iterator_range Range) -> EnumerateResult {
+ for (const auto &Sym : Range) {
+ if (shouldIgnoreSymbol(Sym, Opts.FilterFlags))
+ continue;
+
+ auto NameOrErr = Sym.getName();
+ if (!NameOrErr) {
+ consumeError(NameOrErr.takeError());
+ continue;
+ }
+
+ StringRef Name = *NameOrErr;
+ if (Name.empty())
+ continue;
+
+ EnumerateResult Res = OnEach(Name);
+ if (Res != EnumerateResult::Continue)
+ return Res;
+ }
+ return EnumerateResult::Continue;
+ };
+
+ EnumerateResult Res = processSymbolRange(Obj->symbols());
+ if (Res != EnumerateResult::Continue)
+ return Res == EnumerateResult::Stop;
+
+ if (Obj->isELF()) {
+ const auto *ElfObj = cast<object::ELFObjectFileBase>(Obj);
+ Res = processSymbolRange(ElfObj->getDynamicSymbolIterators());
+ if (Res != EnumerateResult::Continue)
+ return Res == EnumerateResult::Stop;
+ } else if (Obj->isCOFF()) {
+ const auto *CoffObj = cast<object::COFFObjectFile>(Obj);
+ for (auto I = CoffObj->export_directory_begin(),
+ E = CoffObj->export_directory_end();
+ I != E; ++I) {
+ StringRef Name;
+ if (I->getSymbolName(Name))
+ continue;
+ if (Name.empty())
+ continue;
+
+ EnumerateResult Res = OnEach(Name);
+ if (Res != EnumerateResult::Continue)
+ return Res == EnumerateResult::Stop;
+ }
+ } else if (Obj->isMachO()) {
+ }
+
+ return true;
+}
+
+class SymbolSearchContext {
+public:
+ SymbolSearchContext(SymbolQuery &Q) : Q(Q) {}
+
+ bool hasSearched(LibraryInfo *Lib) const { return Searched.count(Lib); }
+
+ void markSearched(LibraryInfo *Lib) { Searched.insert(Lib); }
+
+ inline bool allResolved() const { return Q.allResolved(); }
+
+ SymbolQuery &query() { return Q; }
+
+private:
+ SymbolQuery &Q;
+ DenseSet<LibraryInfo *> Searched;
+};
+
+void LibraryResolver::resolveSymbolsInLibrary(
+ LibraryInfo &Lib, SymbolQuery &UnresolvedSymbols,
+ const SymbolEnumeratorOptions &Opts) {
+ LLVM_DEBUG(dbgs() << "Checking unresolved symbols "
+ << " in library : " << Lib.getFileName() << "\n";);
+ StringSet<> DiscoveredSymbols;
+
+ if (!UnresolvedSymbols.hasUnresolved()) {
+ LLVM_DEBUG(dbgs() << "Skipping library: " << Lib.getFullPath()
+ << " — unresolved symbols exist.\n";);
+ return;
+ }
+
+ bool HasEnumerated = false;
+ auto enumerateSymbolsIfNeeded = [&]() {
+ if (HasEnumerated)
+ return;
+
+ HasEnumerated = true;
+
+ LLVM_DEBUG(dbgs() << "Enumerating symbols in library: " << Lib.getFullPath()
+ << "\n";);
+ SymbolEnumerator::enumerateSymbols(
+ Lib.getFullPath(),
+ [&](StringRef sym) {
+ DiscoveredSymbols.insert(sym);
+ return EnumerateResult::Continue;
+ },
+ Opts);
+
+ if (DiscoveredSymbols.empty()) {
+ LLVM_DEBUG(dbgs() << " No symbols and remove library : "
+ << Lib.getFullPath() << "\n";);
+ LibMgr.removeLibrary(Lib.getFullPath());
+ return;
+ }
+ };
+
+ if (!Lib.hasFilter()) {
+ LLVM_DEBUG(dbgs() << "Building filter for library: " << Lib.getFullPath()
+ << "\n";);
+ enumerateSymbolsIfNeeded();
+ SmallVector<StringRef> SymbolVec;
+ SymbolVec.reserve(DiscoveredSymbols.size());
+ for (const auto &KV : DiscoveredSymbols)
+ SymbolVec.push_back(KV.first());
+
+ Lib.ensureFilterBuilt(FB, SymbolVec);
+ LLVM_DEBUG({
+ dbgs() << "DiscoveredSymbols : " << DiscoveredSymbols.size() << "\n";
+ for (const auto &KV : DiscoveredSymbols)
+ dbgs() << "DiscoveredSymbols : " << KV.first() << "\n";
+ });
+ }
+
+ const auto &Unresolved = UnresolvedSymbols.getUnresolvedSymbols();
+ bool HadAnySym = false;
+ LLVM_DEBUG(dbgs() << "Total unresolved symbols : " << Unresolved.size()
+ << "\n";);
+ for (const auto &Sym : Unresolved) {
+ if (Lib.mayContain(Sym)) {
+ LLVM_DEBUG(dbgs() << "Checking symbol '" << Sym
+ << "' in library: " << Lib.getFullPath() << "\n";);
+ enumerateSymbolsIfNeeded();
+ if (DiscoveredSymbols.count(Sym) > 0) {
+ LLVM_DEBUG(dbgs() << " Resolved symbol: " << Sym
+ << " in library: " << Lib.getFullPath() << "\n";);
+ UnresolvedSymbols.resolve(Sym, Lib.getFullPath());
+ HadAnySym = true;
+ }
+ }
+ }
+
+ using LibraryState = LibraryManager::LibState;
+ if (HadAnySym && Lib.getState() != LibraryState::Loaded)
+ Lib.setState(LibraryState::Queried);
+}
+
+void LibraryResolver::searchSymbolsInLibraries(
+ std::vector<std::string> &SymbolList, OnSearchComplete OnComplete,
+ const SearchConfig &Config) {
+ SymbolQuery Q(SymbolList);
+
+ using LibraryState = LibraryManager::LibState;
+ using LibraryType = PathType;
+ auto tryResolveFrom = [&](LibraryState S, LibraryType K) {
+ LLVM_DEBUG(dbgs() << "Trying resolve from state=" << static_cast<int>(S)
+ << " type=" << static_cast<int>(K) << "\n";);
+
+ SymbolSearchContext Ctx(Q);
+ while (!Ctx.allResolved()) {
+
+ for (auto &Lib : LibMgr.getView(S, K)) {
+ if (Ctx.hasSearched(Lib.get()))
+ continue;
+
+        // TODO: consider searching libraries asynchronously here.
+ resolveSymbolsInLibrary(*Lib, Ctx.query(), Config.Options);
+ Ctx.markSearched(Lib.get());
+
+ if (Ctx.allResolved())
+ return;
+ }
+
+ if (Ctx.allResolved())
+ return;
+
+ if (!scanLibrariesIfNeeded(K, scanBatchSize))
+ break; // no more new libs to scan
+ }
+ };
+
+ for (const auto &[St, Ty] : Config.Policy.Plan) {
+ tryResolveFrom(St, Ty);
+ if (Q.allResolved())
+ break;
+ }
+
+  // Search finished: either all plans were exhausted or every symbol resolved.
+ LLVM_DEBUG({
+ dbgs() << "Search complete.\n";
+ for (const auto &r : Q.getAllResults())
+ dbgs() << "Resolved Symbol:" << r->Name << " -> " << r->ResolvedLibPath
+ << "\n";
+ });
+
+ OnComplete(Q);
+}
+
+bool LibraryResolver::scanLibrariesIfNeeded(PathType PK, size_t BatchSize) {
+ LLVM_DEBUG(dbgs() << "LibraryResolver::scanLibrariesIfNeeded: Scanning for "
+ << (PK == PathType::User ? "User" : "System")
+ << " libraries\n";);
+ if (!ScanHelper.leftToScan(PK))
+ return false;
+
+ LibraryScanner Scanner(ScanHelper, LibMgr, ShouldScanCall);
+ Scanner.scanNext(PK, BatchSize);
+ return true;
+}
+
+bool LibraryResolver::symbolExistsInLibrary(const LibraryInfo &Lib,
+ StringRef SymName,
+ std::vector<std::string> *AllSyms) {
+ SymbolEnumeratorOptions Opts;
+ return symbolExistsInLibrary(Lib, SymName, AllSyms, Opts);
+}
+
+bool LibraryResolver::symbolExistsInLibrary(
+ const LibraryInfo &Lib, StringRef SymName,
+ std::vector<std::string> *AllSyms, const SymbolEnumeratorOptions &Opts) {
+ bool Found = false;
+
+ SymbolEnumerator::enumerateSymbols(
+ Lib.getFullPath(),
+ [&](StringRef Sym) {
+ if (AllSyms)
+ AllSyms->emplace_back(Sym.str());
+
+ if (Sym == SymName) {
+ Found = true;
+ }
+
+ return EnumerateResult::Continue;
+ },
+ Opts);
+
+ return Found;
+}
+
+} // end namespace llvm::orc
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp
new file mode 100644
index 0000000..d93f686
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp
@@ -0,0 +1,1161 @@
+//===- LibraryScanner.cpp - Provide Library Scanning Implementation ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h"
+#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h"
+
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Object/COFF.h"
+#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Object/ELFTypes.h"
+#include "llvm/Object/MachO.h"
+#include "llvm/Object/MachOUniversal.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Program.h"
+#include "llvm/TargetParser/Host.h"
+#include "llvm/TargetParser/Triple.h"
+
+#ifdef LLVM_ON_UNIX
+#include <sys/stat.h>
+#include <unistd.h>
+#endif // LLVM_ON_UNIX
+
+#ifdef __APPLE__
+#include <sys/stat.h>
+#undef LC_LOAD_DYLIB
+#undef LC_RPATH
+#endif // __APPLE__
+
+#define DEBUG_TYPE "orc-scanner"
+
+namespace llvm::orc {
+
+void handleError(Error Err, StringRef context = "") {
+ consumeError(handleErrors(std::move(Err), [&](const ErrorInfoBase &EIB) {
+ dbgs() << "LLVM Error";
+ if (!context.empty())
+ dbgs() << " [" << context << "]";
+ dbgs() << ": " << EIB.message() << "\n";
+ }));
+}
+
+bool ObjectFileLoader::isArchitectureCompatible(const object::ObjectFile &Obj) {
+ Triple HostTriple(sys::getDefaultTargetTriple());
+ Triple ObjTriple = Obj.makeTriple();
+
+ LLVM_DEBUG({
+ dbgs() << "Host triple: " << HostTriple.str()
+ << ", Object triple: " << ObjTriple.str() << "\n";
+ });
+
+ if (ObjTriple.getArch() != Triple::UnknownArch &&
+ HostTriple.getArch() != ObjTriple.getArch())
+ return false;
+
+ if (ObjTriple.getOS() != Triple::UnknownOS &&
+ HostTriple.getOS() != ObjTriple.getOS())
+ return false;
+
+ if (ObjTriple.getEnvironment() != Triple::UnknownEnvironment &&
+ HostTriple.getEnvironment() != Triple::UnknownEnvironment &&
+ HostTriple.getEnvironment() != ObjTriple.getEnvironment())
+ return false;
+
+ return true;
+}
+
+Expected<object::OwningBinary<object::ObjectFile>>
+ObjectFileLoader::loadObjectFileWithOwnership(StringRef FilePath) {
+ LLVM_DEBUG(dbgs() << "ObjectFileLoader: Attempting to open file " << FilePath
+ << "\n";);
+ auto BinOrErr = object::createBinary(FilePath);
+ if (!BinOrErr) {
+ LLVM_DEBUG(dbgs() << "ObjectFileLoader: Failed to open file " << FilePath
+ << "\n";);
+ return BinOrErr.takeError();
+ }
+
+ LLVM_DEBUG(dbgs() << "ObjectFileLoader: Successfully opened file " << FilePath
+ << "\n";);
+
+ auto OwningBin = BinOrErr->takeBinary();
+ object::Binary *Bin = OwningBin.first.get();
+
+ if (Bin->isArchive()) {
+ LLVM_DEBUG(dbgs() << "ObjectFileLoader: File is an archive, not supported: "
+ << FilePath << "\n";);
+ return createStringError(std::errc::invalid_argument,
+ "Archive files are not supported: %s",
+ FilePath.str().c_str());
+ }
+
+#if defined(__APPLE__)
+ if (auto *UB = dyn_cast<object::MachOUniversalBinary>(Bin)) {
+ LLVM_DEBUG(dbgs() << "ObjectFileLoader: Detected Mach-O universal binary: "
+ << FilePath << "\n";);
+ for (auto ObjForArch : UB->objects()) {
+ auto ObjOrErr = ObjForArch.getAsObjectFile();
+ if (!ObjOrErr) {
+ LLVM_DEBUG(
+ dbgs()
+ << "ObjectFileLoader: Skipping invalid architecture slice\n";);
+
+ consumeError(ObjOrErr.takeError());
+ continue;
+ }
+
+ std::unique_ptr<object::ObjectFile> Obj = std::move(ObjOrErr.get());
+ if (isArchitectureCompatible(*Obj)) {
+ LLVM_DEBUG(
+ dbgs() << "ObjectFileLoader: Found compatible object slice\n";);
+
+ return object::OwningBinary<object::ObjectFile>(
+ std::move(Obj), std::move(OwningBin.second));
+
+ } else {
+ LLVM_DEBUG(dbgs() << "ObjectFileLoader: Incompatible architecture "
+ "slice skipped\n";);
+ }
+ }
+ LLVM_DEBUG(dbgs() << "ObjectFileLoader: No compatible slices found in "
+ "universal binary\n";);
+ return createStringError(inconvertibleErrorCode(),
+ "No compatible object found in fat binary: %s",
+ FilePath.str().c_str());
+ }
+#endif
+
+ auto ObjOrErr =
+ object::ObjectFile::createObjectFile(Bin->getMemoryBufferRef());
+ if (!ObjOrErr) {
+ LLVM_DEBUG(dbgs() << "ObjectFileLoader: Failed to create object file\n";);
+ return ObjOrErr.takeError();
+ }
+ LLVM_DEBUG(dbgs() << "ObjectFileLoader: Detected object file\n";);
+
+ std::unique_ptr<object::ObjectFile> Obj = std::move(*ObjOrErr);
+ if (!isArchitectureCompatible(*Obj)) {
+ LLVM_DEBUG(dbgs() << "ObjectFileLoader: Incompatible architecture: "
+ << FilePath << "\n";);
+ return createStringError(inconvertibleErrorCode(),
+ "Incompatible object file: %s",
+ FilePath.str().c_str());
+ }
+
+ LLVM_DEBUG(dbgs() << "ObjectFileLoader: Object file is compatible\n";);
+
+ return object::OwningBinary<object::ObjectFile>(std::move(Obj),
+ std::move(OwningBin.second));
+}
+
+template <class ELFT>
+bool isELFSharedLibrary(const object::ELFFile<ELFT> &ELFObj) {
+ if (ELFObj.getHeader().e_type != ELF::ET_DYN)
+ return false;
+
+ auto PHOrErr = ELFObj.program_headers();
+ if (!PHOrErr) {
+ consumeError(PHOrErr.takeError());
+ return true;
+ }
+
+ for (auto Phdr : *PHOrErr) {
+ if (Phdr.p_type == ELF::PT_INTERP)
+ return false;
+ }
+
+ return true;
+}
+
+bool isSharedLibraryObject(object::ObjectFile &Obj) {
+ if (Obj.isELF()) {
+ if (auto *ELF32LE = dyn_cast<object::ELF32LEObjectFile>(&Obj))
+ return isELFSharedLibrary(ELF32LE->getELFFile());
+ if (auto *ELF64LE = dyn_cast<object::ELF64LEObjectFile>(&Obj))
+ return isELFSharedLibrary(ELF64LE->getELFFile());
+ if (auto *ELF32BE = dyn_cast<object::ELF32BEObjectFile>(&Obj))
+ return isELFSharedLibrary(ELF32BE->getELFFile());
+ if (auto *ELF64BE = dyn_cast<object::ELF64BEObjectFile>(&Obj))
+ return isELFSharedLibrary(ELF64BE->getELFFile());
+ } else if (Obj.isMachO()) {
+ const object::MachOObjectFile *MachO =
+ dyn_cast<object::MachOObjectFile>(&Obj);
+ if (!MachO) {
+ LLVM_DEBUG(dbgs() << "Failed to cast to MachOObjectFile.\n";);
+ return false;
+ }
+ LLVM_DEBUG({
+ bool Result =
+ MachO->getHeader().filetype == MachO::HeaderFileType::MH_DYLIB;
+ dbgs() << "Mach-O filetype: " << MachO->getHeader().filetype
+ << " (MH_DYLIB == " << MachO::HeaderFileType::MH_DYLIB
+ << "), shared: " << Result << "\n";
+ });
+
+ return MachO->getHeader().filetype == MachO::HeaderFileType::MH_DYLIB;
+ } else if (Obj.isCOFF()) {
+ const object::COFFObjectFile *coff = dyn_cast<object::COFFObjectFile>(&Obj);
+ if (!coff)
+ return false;
+ return coff->getCharacteristics() & COFF::IMAGE_FILE_DLL;
+ } else {
+ LLVM_DEBUG(dbgs() << "Binary is not an ObjectFile.\n";);
+ }
+
+ return false;
+}
+
+bool DylibPathValidator::isSharedLibrary(StringRef Path) {
+ LLVM_DEBUG(dbgs() << "Checking if path is a shared library: " << Path
+ << "\n";);
+
+ auto FileType = sys::fs::get_file_type(Path, /*Follow*/ true);
+ if (FileType != sys::fs::file_type::regular_file) {
+ LLVM_DEBUG(dbgs() << "File type is not a regular file for path: " << Path
+ << "\n";);
+ return false;
+ }
+
+ file_magic MagicCode;
+ identify_magic(Path, MagicCode);
+
+ // Skip archives.
+ if (MagicCode == file_magic::archive)
+ return false;
+
+ // Universal binary handling.
+#if defined(__APPLE__)
+ if (MagicCode == file_magic::macho_universal_binary) {
+ ObjectFileLoader ObjLoader(Path);
+ auto ObjOrErr = ObjLoader.getObjectFile();
+ if (!ObjOrErr) {
+ consumeError(ObjOrErr.takeError());
+ return false;
+ }
+ return isSharedLibraryObject(ObjOrErr.get());
+ }
+#endif
+
+ // Object file inspection for PE/COFF, ELF, and Mach-O
+ bool NeedsObjectInspection =
+#if defined(_WIN32)
+ (MagicCode == file_magic::pecoff_executable);
+#elif defined(__APPLE__)
+ (MagicCode == file_magic::macho_fixed_virtual_memory_shared_lib ||
+ MagicCode == file_magic::macho_dynamically_linked_shared_lib ||
+ MagicCode == file_magic::macho_dynamically_linked_shared_lib_stub);
+#elif defined(LLVM_ON_UNIX)
+#ifdef __CYGWIN__
+ (MagicCode == file_magic::pecoff_executable);
+#else
+ (MagicCode == file_magic::elf_shared_object);
+#endif
+#else
+#error "Unsupported platform."
+#endif
+
+ if (NeedsObjectInspection) {
+ ObjectFileLoader ObjLoader(Path);
+ auto ObjOrErr = ObjLoader.getObjectFile();
+ if (!ObjOrErr) {
+ consumeError(ObjOrErr.takeError());
+ return false;
+ }
+ return isSharedLibraryObject(ObjOrErr.get());
+ }
+
+ LLVM_DEBUG(dbgs() << "Path is not identified as a shared library: " << Path
+ << "\n";);
+ return false;
+}
+
+void DylibSubstitutor::configure(StringRef LoaderPath) {
+ SmallString<512> ExecPath(sys::fs::getMainExecutable(nullptr, nullptr));
+ sys::path::remove_filename(ExecPath);
+
+ SmallString<512> LoaderDir;
+ if (LoaderPath.empty()) {
+ LoaderDir = ExecPath;
+ } else {
+ LoaderDir = LoaderPath.str();
+ if (!sys::fs::is_directory(LoaderPath))
+ sys::path::remove_filename(LoaderDir);
+ }
+
+#ifdef __APPLE__
+ Placeholders["@loader_path"] = std::string(LoaderDir);
+ Placeholders["@executable_path"] = std::string(ExecPath);
+#else
+ Placeholders["$origin"] = std::string(LoaderDir);
+#endif
+}
+
+std::optional<std::string>
+SearchPathResolver::resolve(StringRef Stem, const DylibSubstitutor &Subst,
+ DylibPathValidator &Validator) const {
+ for (const auto &SP : Paths) {
+ std::string Base = Subst.substitute(SP);
+
+ SmallString<512> FullPath(Base);
+ if (!PlaceholderPrefix.empty() &&
+ Stem.starts_with_insensitive(PlaceholderPrefix))
+ FullPath.append(Stem.drop_front(PlaceholderPrefix.size()));
+ else
+ sys::path::append(FullPath, Stem);
+
+ LLVM_DEBUG(dbgs() << "SearchPathResolver::resolve FullPath = " << FullPath
+ << "\n";);
+
+ if (auto Valid = Validator.validate(FullPath.str()))
+ return Valid;
+ }
+
+ return std::nullopt;
+}
+
+std::optional<std::string>
+DylibResolverImpl::tryWithExtensions(StringRef LibStem) const {
+ LLVM_DEBUG(dbgs() << "tryWithExtensions: baseName = " << LibStem << "\n";);
+ SmallVector<SmallString<256>, 8> Candidates;
+
+ // Add extensions by platform
+#if defined(__APPLE__)
+ Candidates.emplace_back(LibStem);
+ Candidates.back() += ".dylib";
+#elif defined(_WIN32)
+ Candidates.emplace_back(LibStem);
+ Candidates.back() += ".dll";
+#else
+ Candidates.emplace_back(LibStem);
+ Candidates.back() += ".so";
+#endif
+
+ // Optionally try "lib" prefix if not already there
+ StringRef FileName = sys::path::filename(LibStem);
+ StringRef Base = sys::path::parent_path(LibStem);
+ if (!FileName.starts_with("lib")) {
+ SmallString<256> WithPrefix(Base);
+ if (!WithPrefix.empty())
+ sys::path::append(WithPrefix, ""); // ensure separator if needed
+ WithPrefix += "lib";
+ WithPrefix += FileName;
+
+#if defined(__APPLE__)
+ WithPrefix += ".dylib";
+#elif defined(_WIN32)
+ WithPrefix += ".dll";
+#else
+ WithPrefix += ".so";
+#endif
+
+ Candidates.push_back(std::move(WithPrefix));
+ }
+
+ LLVM_DEBUG({
+ dbgs() << " Candidates to try:\n";
+ for (const auto &C : Candidates)
+ dbgs() << " " << C << "\n";
+ });
+
+ // Try all variants using tryAllPaths
+ for (const auto &Name : Candidates) {
+
+ LLVM_DEBUG(dbgs() << " Trying candidate: " << Name << "\n";);
+
+ for (const auto &R : Resolvers) {
+ if (auto Res = R.resolve(Name, Substitutor, Validator))
+ return Res;
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << " -> No candidate Resolved.\n";);
+
+ return std::nullopt;
+}
+
+std::optional<std::string>
+DylibResolverImpl::resolve(StringRef LibStem, bool VariateLibStem) const {
+ LLVM_DEBUG(dbgs() << "Resolving library stem: " << LibStem << "\n";);
+
+ // If it is an absolute path, don't try iterate over the paths.
+ if (sys::path::is_absolute(LibStem)) {
+ LLVM_DEBUG(dbgs() << " -> Absolute path detected.\n";);
+ return Validator.validate(LibStem);
+ }
+
+ if (!LibStem.starts_with_insensitive("@rpath")) {
+ if (auto norm = Validator.validate(Substitutor.substitute(LibStem))) {
+ LLVM_DEBUG(dbgs() << " -> Resolved after substitution: " << *norm
+ << "\n";);
+
+ return norm;
+ }
+ }
+
+ for (const auto &R : Resolvers) {
+ LLVM_DEBUG(dbgs() << " -> Resolving via search path ... \n";);
+ if (auto Result = R.resolve(LibStem, Substitutor, Validator)) {
+ LLVM_DEBUG(dbgs() << " -> Resolved via search path: " << *Result
+ << "\n";);
+
+ return Result;
+ }
+ }
+
+ // Expand libStem with paths, extensions, etc.
+ // (tryWithExtensions tries platform extensions and a "lib" prefix).
+ if (VariateLibStem) {
+ LLVM_DEBUG(dbgs() << " -> Trying with extensions...\n";);
+
+ if (auto Norm = tryWithExtensions(LibStem)) {
+ LLVM_DEBUG(dbgs() << " -> Resolved via tryWithExtensions: " << *Norm
+ << "\n";);
+
+ return Norm;
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << " -> Could not resolve: " << LibStem << "\n";);
+
+ return std::nullopt;
+}
+
+#ifndef _WIN32
+mode_t PathResolver::lstatCached(StringRef Path) {
+ // If already cached - return cached result
+ if (auto Cache = LibPathCache->read_lstat(Path))
+ return *Cache;
+
+ // Not cached: perform lstat and store
+ struct stat buf{};
+ mode_t st_mode = (lstat(Path.str().c_str(), &buf) == -1) ? 0 : buf.st_mode;
+
+ LibPathCache->insert_lstat(Path, st_mode);
+
+ return st_mode;
+}
+
+std::optional<std::string> PathResolver::readlinkCached(StringRef Path) {
+ // If already cached - return cached result
+ if (auto Cache = LibPathCache->read_link(Path))
+ return Cache;
+
+ // If result not in cache - call system function and cache result
+ char buf[PATH_MAX];
+ ssize_t len;
+ if ((len = readlink(Path.str().c_str(), buf, sizeof(buf))) != -1) {
+ buf[len] = '\0';
+ std::string s(buf);
+ LibPathCache->insert_link(Path, s);
+ return s;
+ }
+ return std::nullopt;
+}
+
+void createComponent(StringRef Path, StringRef BasePath, bool BaseIsResolved,
+ SmallVector<StringRef, 16> &Component) {
+ StringRef Separator = sys::path::get_separator();
+ if (!BaseIsResolved) {
+ if (Path[0] == '~' &&
+ (Path.size() == 1 || sys::path::is_separator(Path[1]))) {
+ static SmallString<128> HomeP;
+ if (HomeP.str().empty())
+ sys::path::home_directory(HomeP);
+ StringRef(HomeP).split(Component, Separator, /*MaxSplit*/ -1,
+ /*KeepEmpty*/ false);
+ } else if (BasePath.empty()) {
+ static SmallString<256> CurrentPath;
+ if (CurrentPath.str().empty())
+ sys::fs::current_path(CurrentPath);
+ StringRef(CurrentPath)
+ .split(Component, Separator, /*MaxSplit*/ -1, /*KeepEmpty*/ false);
+ } else {
+ BasePath.split(Component, Separator, /*MaxSplit*/ -1,
+ /*KeepEmpty*/ false);
+ }
+ }
+
+ Path.split(Component, Separator, /*MaxSplit*/ -1, /*KeepEmpty*/ false);
+}
+
+void normalizePathSegments(SmallVector<StringRef, 16> &PathParts) {
+ SmallVector<StringRef, 16> NormalizedPath;
+ for (auto &Part : PathParts) {
+ if (Part == ".") {
+ continue;
+ } else if (Part == "..") {
+ if (!NormalizedPath.empty() && NormalizedPath.back() != "..") {
+ NormalizedPath.pop_back();
+ } else {
+ NormalizedPath.push_back("..");
+ }
+ } else {
+ NormalizedPath.push_back(Part);
+ }
+ }
+ PathParts.swap(NormalizedPath);
+}
+#endif
+
+std::optional<std::string> PathResolver::realpathCached(StringRef Path,
+ std::error_code &EC,
+ StringRef Base,
+ bool BaseIsResolved,
+ long SymLoopLevel) {
+ EC.clear();
+
+ if (Path.empty()) {
+ EC = std::make_error_code(std::errc::no_such_file_or_directory);
+ LLVM_DEBUG(dbgs() << "PathResolver::realpathCached: Empty path\n";);
+
+ return std::nullopt;
+ }
+
+ if (SymLoopLevel <= 0) {
+ EC = std::make_error_code(std::errc::too_many_symbolic_link_levels);
+ LLVM_DEBUG(
+ dbgs() << "PathResolver::realpathCached: Too many Symlink levels: "
+ << Path << "\n";);
+
+ return std::nullopt;
+ }
+
+ // If already cached - return cached result
+ bool isRelative = sys::path::is_relative(Path);
+ if (!isRelative) {
+ if (auto Cached = LibPathCache->read_realpath(Path)) {
+ EC = Cached->ErrnoCode;
+ if (EC) {
+ LLVM_DEBUG(dbgs() << "PathResolver::realpathCached: Cached (error) for "
+ << Path << "\n";);
+ } else {
+ LLVM_DEBUG(
+ dbgs() << "PathResolver::realpathCached: Cached (success) for "
+ << Path << " => " << Cached->canonicalPath << "\n";);
+ }
+ return Cached->canonicalPath.empty()
+ ? std::nullopt
+ : std::make_optional(Cached->canonicalPath);
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "PathResolver::realpathCached: Resolving path: " << Path
+ << "\n";);
+
+ // If result not in cache - call system function and cache result
+
+ StringRef Separator(sys::path::get_separator());
+ SmallString<256> Resolved(Separator);
+#ifndef _WIN32
+ SmallVector<StringRef, 16> Components;
+
+ if (isRelative) {
+ if (BaseIsResolved) {
+ Resolved.assign(Base);
+ LLVM_DEBUG(dbgs() << " Using Resolved base: " << Base << "\n";);
+ }
+ createComponent(Path, Base, BaseIsResolved, Components);
+ } else {
+ Path.split(Components, Separator, /*MaxSplit*/ -1, /*KeepEmpty*/ false);
+ }
+
+ normalizePathSegments(Components);
+ LLVM_DEBUG({
+ for (auto &C : Components)
+ dbgs() << " " << C << " ";
+
+ dbgs() << "\n";
+ });
+
+ // Walk each path component, resolving "." / ".." and symlinks as we go.
+ for (const auto &Component : Components) {
+ if (Component == ".")
+ continue;
+ if (Component == "..") {
+ // collapse "a/b/../c" to "a/c"
+ size_t S = Resolved.rfind(Separator);
+ if (S != llvm::StringRef::npos)
+ Resolved.resize(S);
+ if (Resolved.empty())
+ Resolved = Separator;
+ continue;
+ }
+
+ size_t oldSize = Resolved.size();
+ sys::path::append(Resolved, Component);
+ const char *ResolvedPath = Resolved.c_str();
+ LLVM_DEBUG(dbgs() << " Processing Component: " << Component << " => "
+ << ResolvedPath << "\n";);
+ mode_t st_mode = lstatCached(ResolvedPath);
+
+ if (S_ISLNK(st_mode)) {
+ LLVM_DEBUG(dbgs() << " Found symlink: " << ResolvedPath << "\n";);
+
+ auto SymlinkOpt = readlinkCached(ResolvedPath);
+ if (!SymlinkOpt) {
+ EC = std::make_error_code(std::errc::no_such_file_or_directory);
+ LibPathCache->insert_realpath(Path, LibraryPathCache::PathInfo{"", EC});
+ LLVM_DEBUG(dbgs() << " Failed to read symlink: " << ResolvedPath
+ << "\n";);
+
+ return std::nullopt;
+ }
+
+ StringRef Symlink = *SymlinkOpt;
+ LLVM_DEBUG(dbgs() << " Symlink points to: " << Symlink << "\n";);
+
+ std::string resolvedBase = "";
+ if (sys::path::is_relative(Symlink)) {
+ Resolved.resize(oldSize);
+ resolvedBase = Resolved.str().str();
+ }
+
+ auto RealSymlink =
+ realpathCached(Symlink, EC, resolvedBase,
+ /*BaseIsResolved=*/true, SymLoopLevel - 1);
+ if (!RealSymlink) {
+ LibPathCache->insert_realpath(Path, LibraryPathCache::PathInfo{"", EC});
+ LLVM_DEBUG(dbgs() << " Failed to resolve symlink target: " << Symlink
+ << "\n";);
+
+ return std::nullopt;
+ }
+
+ Resolved.assign(*RealSymlink);
+ LLVM_DEBUG(dbgs() << " Symlink Resolved to: " << Resolved << "\n";);
+
+ } else if (st_mode == 0) {
+ EC = std::make_error_code(std::errc::no_such_file_or_directory);
+ LibPathCache->insert_realpath(Path, LibraryPathCache::PathInfo{"", EC});
+ LLVM_DEBUG(dbgs() << " Component does not exist: " << ResolvedPath
+ << "\n";);
+
+ return std::nullopt;
+ }
+ }
+#else
+ EC = sys::fs::real_path(Path, Resolved); // Windows fallback
+#endif
+
+ std::string Canonical = Resolved.str().str();
+ {
+ LibPathCache->insert_realpath(Path, LibraryPathCache::PathInfo{
+ Canonical,
+ std::error_code() // success
+ });
+ }
+ LLVM_DEBUG(dbgs() << "PathResolver::realpathCached: Final Resolved: " << Path
+ << " => " << Canonical << "\n";);
+ return Canonical;
+}
+
+void LibraryScanHelper::addBasePath(const std::string &Path, PathType K) {
+ std::error_code EC;
+ std::string Canon = resolveCanonical(Path, EC);
+ if (EC) {
+ LLVM_DEBUG(
+ dbgs()
+ << "LibraryScanHelper::addBasePath: Failed to canonicalize path: "
+ << Path << "\n";);
+ return;
+ }
+ std::unique_lock<std::shared_mutex> Lock(Mtx);
+ if (LibSearchPaths.count(Canon)) {
+ LLVM_DEBUG(dbgs() << "LibraryScanHelper::addBasePath: Already added: "
+ << Canon << "\n";);
+ return;
+ }
+ K = K == PathType::Unknown ? classifyKind(Canon) : K;
+ auto SP = std::make_shared<LibrarySearchPath>(Canon, K);
+ LibSearchPaths[Canon] = SP;
+
+ if (K == PathType::User) {
+ LLVM_DEBUG(dbgs() << "LibraryScanHelper::addBasePath: Added User path: "
+ << Canon << "\n";);
+ UnscannedUsr.push_back(StringRef(SP->BasePath));
+ } else {
+ LLVM_DEBUG(dbgs() << "LibraryScanHelper::addBasePath: Added System path: "
+ << Canon << "\n";);
+ UnscannedSys.push_back(StringRef(SP->BasePath));
+ }
+}
+
+std::vector<std::shared_ptr<LibrarySearchPath>>
+LibraryScanHelper::getNextBatch(PathType K, size_t BatchSize) {
+ std::vector<std::shared_ptr<LibrarySearchPath>> Result;
+ auto &Queue = (K == PathType::User) ? UnscannedUsr : UnscannedSys;
+
+ std::unique_lock<std::shared_mutex> Lock(Mtx);
+
+ while (!Queue.empty() && (BatchSize == 0 || Result.size() < BatchSize)) {
+ StringRef Base = Queue.front();
+ auto It = LibSearchPaths.find(Base);
+ if (It != LibSearchPaths.end()) {
+ auto &SP = It->second;
+ ScanState Expected = ScanState::NotScanned;
+ if (SP->State.compare_exchange_strong(Expected, ScanState::Scanning)) {
+ Result.push_back(SP);
+ }
+ }
+ Queue.pop_front();
+ }
+
+ return Result;
+}
+
+bool LibraryScanHelper::isTrackedBasePath(StringRef Path) const {
+ std::error_code EC;
+ std::string Canon = resolveCanonical(Path, EC);
+ if (EC)
+ return false;
+
+ std::shared_lock<std::shared_mutex> Lock(Mtx);
+ return LibSearchPaths.count(Canon) > 0;
+}
+
+bool LibraryScanHelper::leftToScan(PathType K) const {
+ std::shared_lock<std::shared_mutex> Lock(Mtx);
+ for (const auto &KV : LibSearchPaths) {
+ const auto &SP = KV.second;
+ if (SP->Kind == K && SP->State == ScanState::NotScanned)
+ return true;
+ }
+ return false;
+}
+
+void LibraryScanHelper::resetToScan() {
+ std::shared_lock<std::shared_mutex> Lock(Mtx);
+
+ for (auto &[_, SP] : LibSearchPaths) {
+ ScanState Expected = ScanState::Scanned;
+
+ if (!SP->State.compare_exchange_strong(Expected, ScanState::NotScanned))
+ continue;
+
+ auto &TargetList =
+ (SP->Kind == PathType::User) ? UnscannedUsr : UnscannedSys;
+ TargetList.emplace_back(SP->BasePath);
+ }
+}
+
+std::vector<std::shared_ptr<LibrarySearchPath>>
+LibraryScanHelper::getAllUnits() const {
+ std::shared_lock<std::shared_mutex> Lock(Mtx);
+ std::vector<std::shared_ptr<LibrarySearchPath>> Result;
+ Result.reserve(LibSearchPaths.size());
+ for (const auto &[_, SP] : LibSearchPaths) {
+ Result.push_back(SP);
+ }
+ return Result;
+}
+
+std::string LibraryScanHelper::resolveCanonical(StringRef Path,
+ std::error_code &EC) const {
+ auto Canon = LibPathResolver->resolve(Path, EC);
+ return EC ? Path.str() : *Canon;
+}
+
+PathType LibraryScanHelper::classifyKind(StringRef Path) const {
+ // Detect home directory
+ const char *Home = getenv("HOME");
+ if (Home && Path.find(Home) == 0)
+ return PathType::User;
+
+ static const std::array<std::string, 5> UserPrefixes = {
+ "/usr/local", // often used by users for manual installs
+ "/opt/homebrew", // common on macOS
+ "/opt/local", // MacPorts
+ "/home", // Linux home dirs
+ "/Users", // macOS user dirs
+ };
+
+ for (const auto &Prefix : UserPrefixes) {
+ if (Path.find(Prefix) == 0)
+ return PathType::User;
+ }
+
+ return PathType::System;
+}
+
+Expected<LibraryDepsInfo> parseMachODeps(const object::MachOObjectFile &Obj) {
+ LibraryDepsInfo Libdeps;
+ LLVM_DEBUG(dbgs() << "Parsing Mach-O dependencies...\n";);
+ for (const auto &Command : Obj.load_commands()) {
+ switch (Command.C.cmd) {
+ case MachO::LC_LOAD_DYLIB: {
+ MachO::dylib_command dylibCmd = Obj.getDylibIDLoadCommand(Command);
+ const char *name = Command.Ptr + dylibCmd.dylib.name;
+ Libdeps.addDep(name);
+ LLVM_DEBUG(dbgs() << " Found LC_LOAD_DYLIB: " << name << "\n";);
+ } break;
+ case MachO::LC_LOAD_WEAK_DYLIB:
+ case MachO::LC_REEXPORT_DYLIB:
+ case MachO::LC_LOAD_UPWARD_DYLIB:
+ case MachO::LC_LAZY_LOAD_DYLIB:
+ break;
+ case MachO::LC_RPATH: {
+ // Extract RPATH
+ MachO::rpath_command rpathCmd = Obj.getRpathCommand(Command);
+ const char *rpath = Command.Ptr + rpathCmd.path;
+ LLVM_DEBUG(dbgs() << " Found LC_RPATH: " << rpath << "\n";);
+
+ SmallVector<StringRef, 4> RawPaths;
+ SplitString(StringRef(rpath), RawPaths,
+ sys::EnvPathSeparator == ':' ? ":" : ";");
+
+ for (const auto &raw : RawPaths) {
+ Libdeps.addRPath(raw.str()); // Convert to std::string
+ LLVM_DEBUG(dbgs() << " Parsed RPATH entry: " << raw << "\n";);
+ }
+ break;
+ }
+ }
+ }
+
+ return Expected<LibraryDepsInfo>(std::move(Libdeps));
+}
+
+template <class ELFT>
+static Expected<StringRef> getDynamicStrTab(const object::ELFFile<ELFT> &Elf) {
+ auto DynamicEntriesOrError = Elf.dynamicEntries();
+ if (!DynamicEntriesOrError)
+ return DynamicEntriesOrError.takeError();
+
+ for (const typename ELFT::Dyn &Dyn : *DynamicEntriesOrError) {
+ if (Dyn.d_tag == ELF::DT_STRTAB) {
+ auto MappedAddrOrError = Elf.toMappedAddr(Dyn.getPtr());
+ if (!MappedAddrOrError)
+ return MappedAddrOrError.takeError();
+ return StringRef(reinterpret_cast<const char *>(*MappedAddrOrError));
+ }
+ }
+
+ // If the dynamic segment is not present, we fall back on the sections.
+ auto SectionsOrError = Elf.sections();
+ if (!SectionsOrError)
+ return SectionsOrError.takeError();
+
+ for (const typename ELFT::Shdr &Sec : *SectionsOrError) {
+ if (Sec.sh_type == ELF::SHT_DYNSYM)
+ return Elf.getStringTableForSymtab(Sec);
+ }
+
+ return make_error<StringError>("dynamic string table not found",
+ inconvertibleErrorCode());
+}
+
+template <typename ELFT>
+Expected<LibraryDepsInfo> parseELF(const object::ELFFile<ELFT> &Elf) {
+ LibraryDepsInfo Deps;
+ Expected<StringRef> StrTabOrErr = getDynamicStrTab(Elf);
+ if (!StrTabOrErr)
+ return StrTabOrErr.takeError();
+
+ const char *Data = StrTabOrErr->data();
+
+ auto DynamicEntriesOrError = Elf.dynamicEntries();
+ if (!DynamicEntriesOrError) {
+ return DynamicEntriesOrError.takeError();
+ }
+
+ for (const typename ELFT::Dyn &Dyn : *DynamicEntriesOrError) {
+ switch (Dyn.d_tag) {
+ case ELF::DT_NEEDED:
+ Deps.addDep(Data + Dyn.d_un.d_val);
+ break;
+ case ELF::DT_RPATH: {
+ SmallVector<StringRef, 4> RawPaths;
+ SplitString(Data + Dyn.d_un.d_val, RawPaths,
+ sys::EnvPathSeparator == ':' ? ":" : ";");
+ for (const auto &raw : RawPaths)
+ Deps.addRPath(raw.str());
+ break;
+ }
+ case ELF::DT_RUNPATH: {
+ SmallVector<StringRef, 4> RawPaths;
+ SplitString(Data + Dyn.d_un.d_val, RawPaths,
+ sys::EnvPathSeparator == ':' ? ":" : ";");
+ for (const auto &raw : RawPaths)
+ Deps.addRunPath(raw.str());
+ break;
+ }
+ case ELF::DT_FLAGS_1:
+ // DF_1_PIE set: mark the binary as a position-independent executable.
+ if (Dyn.d_un.d_val & ELF::DF_1_PIE)
+ Deps.isPIE = true;
+ break;
+ // Other tags (DT_NULL, DT_AUXILIARY, DT_FILTER, ...) carry no
+ // dependency or search-path information needed here; ignore them.
+ default:
+ break;
+ }
+ }
+
+ return Expected<LibraryDepsInfo>(std::move(Deps));
+}
+
+Expected<LibraryDepsInfo> parseELFDeps(const object::ELFObjectFileBase &Obj) {
+ using namespace object;
+ LLVM_DEBUG(dbgs() << "parseELFDeps: Detected ELF object\n";);
+ if (const auto *ELF = dyn_cast<ELF32LEObjectFile>(&Obj))
+ return parseELF(ELF->getELFFile());
+ else if (const auto *ELF = dyn_cast<ELF32BEObjectFile>(&Obj))
+ return parseELF(ELF->getELFFile());
+ else if (const auto *ELF = dyn_cast<ELF64LEObjectFile>(&Obj))
+ return parseELF(ELF->getELFFile());
+ else if (const auto *ELF = dyn_cast<ELF64BEObjectFile>(&Obj))
+ return parseELF(ELF->getELFFile());
+
+ LLVM_DEBUG(dbgs() << "parseELFDeps: Unknown ELF format\n";);
+ return createStringError(std::errc::not_supported, "Unknown ELF format");
+}
+
+Expected<LibraryDepsInfo> LibraryScanner::extractDeps(StringRef FilePath) {
+ LLVM_DEBUG(dbgs() << "extractDeps: Attempting to open file " << FilePath
+ << "\n";);
+
+ ObjectFileLoader ObjLoader(FilePath);
+ auto ObjOrErr = ObjLoader.getObjectFile();
+ if (!ObjOrErr) {
+ LLVM_DEBUG(dbgs() << "extractDeps: Failed to open " << FilePath << "\n";);
+ return ObjOrErr.takeError();
+ }
+
+ object::ObjectFile *Obj = &ObjOrErr.get();
+
+ if (auto *elfObj = dyn_cast<object::ELFObjectFileBase>(Obj)) {
+ LLVM_DEBUG(dbgs() << "extractDeps: File " << FilePath
+ << " is an ELF object\n";);
+
+ return parseELFDeps(*elfObj);
+ }
+
+ if (auto *macho = dyn_cast<object::MachOObjectFile>(Obj)) {
+ LLVM_DEBUG(dbgs() << "extractDeps: File " << FilePath
+ << " is a Mach-O object\n";);
+ return parseMachODeps(*macho);
+ }
+
+ if (Obj->isCOFF()) {
+ // TODO: COFF support
+ return LibraryDepsInfo();
+ }
+
+ LLVM_DEBUG(dbgs() << "extractDeps: Unsupported binary format for file "
+ << FilePath << "\n";);
+ return createStringError(inconvertibleErrorCode(),
+ "Unsupported binary format: %s",
+ FilePath.str().c_str());
+}
+
+std::optional<std::string> LibraryScanner::shouldScan(StringRef FilePath) {
+ std::error_code EC;
+
+ LLVM_DEBUG(dbgs() << "[shouldScan] Checking: " << FilePath << "\n";);
+
+ // [1] Check file existence early
+ if (!sys::fs::exists(FilePath)) {
+ LLVM_DEBUG(dbgs() << " -> Skipped: file does not exist.\n";);
+
+ return std::nullopt;
+ }
+
+ // [2] Resolve to canonical path
+ auto CanonicalPathOpt = ScanHelper.resolve(FilePath, EC);
+ if (EC || !CanonicalPathOpt) {
+ LLVM_DEBUG(dbgs() << " -> Skipped: failed to resolve path (EC="
+ << EC.message() << ").\n";);
+
+ return std::nullopt;
+ }
+
+ const std::string &CanonicalPath = *CanonicalPathOpt;
+ LLVM_DEBUG(dbgs() << " -> Canonical path: " << CanonicalPath << "\n");
+
+ // [3] Check if it's a directory — skip directories
+ if (sys::fs::is_directory(CanonicalPath)) {
+ LLVM_DEBUG(dbgs() << " -> Skipped: path is a directory.\n";);
+
+ return std::nullopt;
+ }
+
+ // [4] Skip if it's not a shared library.
+ if (!DylibPathValidator::isSharedLibrary(CanonicalPath)) {
+ LLVM_DEBUG(dbgs() << " -> Skipped: not a shared library.\n";);
+ return std::nullopt;
+ }
+
+ // [5] Skip if we've already seen this path (via cache)
+ if (ScanHelper.hasSeenOrMark(CanonicalPath)) {
+ LLVM_DEBUG(dbgs() << " -> Skipped: already seen.\n";);
+
+ return std::nullopt;
+ }
+
+ // [6] Already tracked in LibraryManager?
+ if (LibMgr.hasLibrary(CanonicalPath)) {
+ LLVM_DEBUG(dbgs() << " -> Skipped: already tracked by LibraryManager.\n";);
+
+ return std::nullopt;
+ }
+
+ // [7] Run user-defined hook (default: always true)
+ if (!ShouldScanCall(CanonicalPath)) {
+ LLVM_DEBUG(dbgs() << " -> Skipped: user-defined hook rejected.\n";);
+
+ return std::nullopt;
+ }
+
+ LLVM_DEBUG(dbgs() << " -> Accepted: ready to scan " << CanonicalPath
+ << "\n";);
+ return CanonicalPath;
+}
+
+void LibraryScanner::handleLibrary(StringRef FilePath, PathType K, int level) {
+ LLVM_DEBUG(dbgs() << "LibraryScanner::handleLibrary: Scanning: " << FilePath
+ << ", level=" << level << "\n";);
+ auto CanonPathOpt = shouldScan(FilePath);
+ if (!CanonPathOpt) {
+ LLVM_DEBUG(dbgs() << " Skipped (shouldScan returned false): " << FilePath
+ << "\n";);
+
+ return;
+ }
+ const std::string CanonicalPath = *CanonPathOpt;
+
+ auto DepsOrErr = extractDeps(CanonicalPath);
+ if (!DepsOrErr) {
+ LLVM_DEBUG(dbgs() << " Failed to extract deps for: " << CanonicalPath
+ << "\n";);
+ handleError(DepsOrErr.takeError());
+ return;
+ }
+
+ LibraryDepsInfo &Deps = *DepsOrErr;
+
+ LLVM_DEBUG({
+ dbgs() << " Found deps : \n";
+ for (const auto &dep : Deps.deps)
+ dbgs() << " : " << dep << "\n";
+ dbgs() << " Found @rpath : " << Deps.rpath.size() << "\n";
+ for (const auto &r : Deps.rpath)
+ dbgs() << " : " << r << "\n";
+ dbgs() << " Found @runpath : \n";
+ for (const auto &r : Deps.runPath)
+ dbgs() << " : " << r << "\n";
+ });
+
+ if (Deps.isPIE && level == 0) {
+ LLVM_DEBUG(dbgs() << " Skipped PIE executable at top level: "
+ << CanonicalPath << "\n";);
+
+ return;
+ }
+
+ bool Added = LibMgr.addLibrary(CanonicalPath, K);
+ if (!Added) {
+ LLVM_DEBUG(dbgs() << " Already added: " << CanonicalPath << "\n";);
+ return;
+ }
+
+ // Heuristic 1: No RPATH/RUNPATH, skip deps
+ if (Deps.rpath.empty() && Deps.runPath.empty()) {
+ LLVM_DEBUG(
+ dbgs() << "LibraryScanner::handleLibrary: Skipping deps (Heuristic1): "
+ << CanonicalPath << "\n";);
+ return;
+ }
+
+ // Heuristic 2: All RPATH and RUNPATH already tracked
+ auto allTracked = [&](const auto &Paths) {
+ LLVM_DEBUG(dbgs() << " Checking : " << Paths.size() << "\n";);
+ return std::all_of(Paths.begin(), Paths.end(), [&](StringRef P) {
+ LLVM_DEBUG(dbgs() << " Checking isTrackedBasePath : " << P << "\n";);
+ return ScanHelper.isTrackedBasePath(
+ DylibResolver::resolvelinkerFlag(P, CanonicalPath));
+ });
+ };
+
+ if (allTracked(Deps.rpath) && allTracked(Deps.runPath)) {
+ LLVM_DEBUG(
+ dbgs() << "LibraryScanner::handleLibrary: Skipping deps (Heuristic2): "
+ << CanonicalPath << "\n";);
+ return;
+ }
+
+ DylibPathValidator Validator(ScanHelper.getPathResolver());
+ DylibResolver Resolver(Validator);
+ Resolver.configure(CanonicalPath,
+ {{Deps.rpath, SearchPathType::RPath},
+ {ScanHelper.getSearchPaths(), SearchPathType::UsrOrSys},
+ {Deps.runPath, SearchPathType::RunPath}});
+ for (StringRef Dep : Deps.deps) {
+ LLVM_DEBUG(dbgs() << " Resolving dep: " << Dep << "\n";);
+ auto DepFullOpt = Resolver.resolve(Dep);
+ if (!DepFullOpt) {
+ LLVM_DEBUG(dbgs() << " Failed to resolve dep: " << Dep << "\n";);
+
+ continue;
+ }
+ LLVM_DEBUG(dbgs() << " Resolved dep to: " << *DepFullOpt << "\n";);
+
+ handleLibrary(*DepFullOpt, K, level + 1);
+ }
+}
+
+void LibraryScanner::scanBaseDir(std::shared_ptr<LibrarySearchPath> SP) {
+ if (!sys::fs::is_directory(SP->BasePath) || SP->BasePath.empty()) {
+ LLVM_DEBUG(
+ dbgs() << "LibraryScanner::scanBaseDir: Invalid or empty basePath: "
+ << SP->BasePath << "\n";);
+ return;
+ }
+
+ LLVM_DEBUG(dbgs() << "LibraryScanner::scanBaseDir: Scanning directory: "
+ << SP->BasePath << "\n";);
+ std::error_code EC;
+
+ SP->State.store(ScanState::Scanning);
+
+ for (sys::fs::directory_iterator It(SP->BasePath, EC), end; It != end && !EC;
+ It.increment(EC)) {
+ auto Entry = *It;
+ if (!Entry.status())
+ continue;
+
+ auto Status = *Entry.status();
+ if (sys::fs::is_regular_file(Status) || sys::fs::is_symlink_file(Status)) {
+ LLVM_DEBUG(dbgs() << " Found file: " << Entry.path() << "\n";);
+ // async support ?
+ handleLibrary(Entry.path(), SP->Kind);
+ }
+ }
+
+ SP->State.store(ScanState::Scanned);
+}
+
+void LibraryScanner::scanNext(PathType K, size_t BatchSize) {
+ LLVM_DEBUG(dbgs() << "LibraryScanner::scanNext: Scanning next batch of size "
+ << BatchSize << " for kind "
+ << (K == PathType::User ? "User" : "System") << "\n";);
+
+ auto SearchPaths = ScanHelper.getNextBatch(K, BatchSize);
+ for (auto &SP : SearchPaths) {
+ LLVM_DEBUG(dbgs() << " Scanning unit with basePath: " << SP->BasePath
+ << "\n";);
+
+ scanBaseDir(SP);
+ }
+}
+
+} // end namespace llvm::orc
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index b838e36..4d4e9f9 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1504,6 +1504,10 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
else if (Name.consume_front("fabs."))
// nvvm.fabs.{f,ftz.f,d}
Expand = Name == "f" || Name == "ftz.f" || Name == "d";
+ else if (Name.consume_front("ex2.approx."))
+ // nvvm.ex2.approx.{f,ftz.f,d,f16x2}
+ Expand =
+ Name == "f" || Name == "ftz.f" || Name == "d" || Name == "f16x2";
else if (Name.consume_front("max.") || Name.consume_front("min."))
// nvvm.{min,max}.{i,ii,ui,ull}
Expand = Name == "s" || Name == "i" || Name == "ll" || Name == "us" ||
@@ -2550,6 +2554,11 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI,
Intrinsic::ID IID = (Name == "fabs.ftz.f") ? Intrinsic::nvvm_fabs_ftz
: Intrinsic::nvvm_fabs;
Rep = Builder.CreateUnaryIntrinsic(IID, CI->getArgOperand(0));
+ } else if (Name.consume_front("ex2.approx.")) {
+ // nvvm.ex2.approx.{f,ftz.f,d,f16x2}
+ Intrinsic::ID IID = Name.starts_with("ftz") ? Intrinsic::nvvm_ex2_approx_ftz
+ : Intrinsic::nvvm_ex2_approx;
+ Rep = Builder.CreateUnaryIntrinsic(IID, CI->getArgOperand(0));
} else if (Name.starts_with("atomic.load.add.f32.p") ||
Name.starts_with("atomic.load.add.f64.p")) {
Value *Ptr = CI->getArgOperand(0);
diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp
index 3462954..3a85770 100644
--- a/llvm/lib/MC/MCParser/MasmParser.cpp
+++ b/llvm/lib/MC/MCParser/MasmParser.cpp
@@ -5323,10 +5323,10 @@ void MasmParser::initializeDirectiveKindMap() {
bool MasmParser::isMacroLikeDirective() {
if (getLexer().is(AsmToken::Identifier)) {
bool IsMacroLike = StringSwitch<bool>(getTok().getIdentifier())
- .CasesLower("repeat", "rept", true)
+ .CasesLower({"repeat", "rept"}, true)
.CaseLower("while", true)
- .CasesLower("for", "irp", true)
- .CasesLower("forc", "irpc", true)
+ .CasesLower({"for", "irp"}, true)
+ .CasesLower({"forc", "irpc"}, true)
.Default(false);
if (IsMacroLike)
return true;
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index e21cf8e..e2645fa 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -269,12 +269,6 @@ bool APFloatBase::isRepresentableBy(const fltSemantics &A,
A.precision <= B.precision;
}
-constexpr RoundingMode APFloatBase::rmNearestTiesToEven;
-constexpr RoundingMode APFloatBase::rmTowardPositive;
-constexpr RoundingMode APFloatBase::rmTowardNegative;
-constexpr RoundingMode APFloatBase::rmTowardZero;
-constexpr RoundingMode APFloatBase::rmNearestTiesToAway;
-
/* A tight upper bound on number of parts required to hold the value
pow(5, power) is
diff --git a/llvm/lib/Support/Windows/Signals.inc b/llvm/lib/Support/Windows/Signals.inc
index 648d6a5..da68994 100644
--- a/llvm/lib/Support/Windows/Signals.inc
+++ b/llvm/lib/Support/Windows/Signals.inc
@@ -421,8 +421,13 @@ bool sys::RemoveFileOnSignal(StringRef Filename, std::string *ErrMsg) {
return true;
}
- if (FilesToRemove == NULL)
+ if (FilesToRemove == NULL) {
FilesToRemove = new std::vector<std::string>;
+ std::atexit([]() {
+ delete FilesToRemove;
+ FilesToRemove = NULL;
+ });
+ }
FilesToRemove->push_back(std::string(Filename));
diff --git a/llvm/lib/Support/raw_ostream.cpp b/llvm/lib/Support/raw_ostream.cpp
index 07b9989..d6f27fb 100644
--- a/llvm/lib/Support/raw_ostream.cpp
+++ b/llvm/lib/Support/raw_ostream.cpp
@@ -61,17 +61,6 @@
using namespace llvm;
-constexpr raw_ostream::Colors raw_ostream::BLACK;
-constexpr raw_ostream::Colors raw_ostream::RED;
-constexpr raw_ostream::Colors raw_ostream::GREEN;
-constexpr raw_ostream::Colors raw_ostream::YELLOW;
-constexpr raw_ostream::Colors raw_ostream::BLUE;
-constexpr raw_ostream::Colors raw_ostream::MAGENTA;
-constexpr raw_ostream::Colors raw_ostream::CYAN;
-constexpr raw_ostream::Colors raw_ostream::WHITE;
-constexpr raw_ostream::Colors raw_ostream::SAVEDCOLOR;
-constexpr raw_ostream::Colors raw_ostream::RESET;
-
raw_ostream::~raw_ostream() {
// raw_ostream's subclasses should take care to flush the buffer
// in their destructors.
diff --git a/llvm/lib/Support/raw_socket_stream.cpp b/llvm/lib/Support/raw_socket_stream.cpp
index 3b510d3..f716317 100644
--- a/llvm/lib/Support/raw_socket_stream.cpp
+++ b/llvm/lib/Support/raw_socket_stream.cpp
@@ -332,7 +332,7 @@ ListeningSocket::~ListeningSocket() {
raw_socket_stream::raw_socket_stream(int SocketFD)
: raw_fd_stream(SocketFD, true) {}
-raw_socket_stream::~raw_socket_stream() {}
+raw_socket_stream::~raw_socket_stream() = default;
Expected<std::unique_ptr<raw_socket_stream>>
raw_socket_stream::createConnectedUnix(StringRef SocketPath) {
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 6dcbced..b7fa899 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1288,18 +1288,38 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
}
void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
+ // On entry to a block with multiple predescessors, there may
+ // be pending SMEM and VMEM events active at the same time.
+ // In such cases, only clear one active event at a time.
+ auto applyPendingXcntGroup = [this](unsigned E) {
+ unsigned LowerBound = getScoreLB(X_CNT);
+ applyWaitcnt(X_CNT, 0);
+ PendingEvents |= (1 << E);
+ setScoreLB(X_CNT, LowerBound);
+ };
+
// Wait on XCNT is redundant if we are already waiting for a load to complete.
// SMEM can return out of order, so only omit XCNT wait if we are waiting till
// zero.
- if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP))
- return applyWaitcnt(X_CNT, 0);
+ if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) {
+ if (hasPendingEvent(VMEM_GROUP))
+ applyPendingXcntGroup(VMEM_GROUP);
+ else
+ applyWaitcnt(X_CNT, 0);
+ return;
+ }
// If we have pending store we cannot optimize XCnt because we do not wait for
// stores. VMEM loads retun in order, so if we only have loads XCnt is
// decremented to the same number as LOADCnt.
if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
- !hasPendingEvent(STORE_CNT))
- return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
+ !hasPendingEvent(STORE_CNT)) {
+ if (hasPendingEvent(SMEM_GROUP))
+ applyPendingXcntGroup(SMEM_GROUP);
+ else
+ applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
+ return;
+ }
applyWaitcnt(X_CNT, Wait.XCnt);
}
diff --git a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp
index e5b4f6e..ab4ee55 100644
--- a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp
+++ b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp
@@ -889,7 +889,7 @@ CSKYTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
.Cases("{t9}", "{bsp}", CSKY::R25)
.Case("{r26}", CSKY::R26)
.Case("{r27}", CSKY::R27)
- .Cases("{gb}", "{rgb}", "{rdb}", CSKY::R28)
+ .Cases({"{gb}", "{rgb}", "{rdb}"}, CSKY::R28)
.Cases("{tb}", "{rtb}", CSKY::R29)
.Case("{svbr}", CSKY::R30)
.Case("{tls}", CSKY::R31)
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp
index fe83dc6..51bafe4 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp
@@ -49,7 +49,7 @@ public:
M68kAsmBackend(const Target &T, const MCSubtargetInfo &STI)
: MCAsmBackend(llvm::endianness::big),
Allows32BitBranch(llvm::StringSwitch<bool>(STI.getCPU())
- .CasesLower("m68020", "m68030", "m68040", true)
+ .CasesLower({"m68020", "m68030", "m68040"}, true)
.Default(false)) {}
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &,
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index e8758aa..50827bd 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1562,12 +1562,17 @@ def : Pat<(int_nvvm_saturate_d f64:$a), (CVT_f64_f64 $a, CvtSAT)>;
// Exp2 Log2
//
-def : Pat<(int_nvvm_ex2_approx_ftz_f f32:$a), (EX2_APPROX_f32 $a, FTZ)>;
-def : Pat<(int_nvvm_ex2_approx_f f32:$a), (EX2_APPROX_f32 $a, NoFTZ)>;
+def : Pat<(f32 (int_nvvm_ex2_approx_ftz f32:$a)), (EX2_APPROX_f32 $a, FTZ)>;
+def : Pat<(f32 (int_nvvm_ex2_approx f32:$a)), (EX2_APPROX_f32 $a, NoFTZ)>;
let Predicates = [hasPTX<70>, hasSM<75>] in {
- def : Pat<(int_nvvm_ex2_approx_f16 f16:$a), (EX2_APPROX_f16 $a)>;
- def : Pat<(int_nvvm_ex2_approx_f16x2 v2f16:$a), (EX2_APPROX_f16x2 $a)>;
+ def : Pat<(f16 (int_nvvm_ex2_approx f16:$a)), (EX2_APPROX_f16 $a)>;
+ def : Pat<(v2f16 (int_nvvm_ex2_approx v2f16:$a)), (EX2_APPROX_f16x2 $a)>;
+}
+
+let Predicates = [hasPTX<78>, hasSM<90>] in {
+ def : Pat<(bf16 (int_nvvm_ex2_approx_ftz bf16:$a)), (EX2_APPROX_bf16 $a)>;
+ def : Pat<(v2bf16 (int_nvvm_ex2_approx_ftz v2bf16:$a)), (EX2_APPROX_bf16x2 $a)>;
}
def LG2_APPROX_f32 :
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index 729c077..64593e6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -318,7 +318,7 @@ static Instruction *convertNvvmIntrinsicToLlvm(InstCombiner &IC,
// answer. These include:
//
// - nvvm_cos_approx_{f,ftz_f}
- // - nvvm_ex2_approx_{d,f,ftz_f}
+ // - nvvm_ex2_approx(_ftz)
// - nvvm_lg2_approx_{d,f,ftz_f}
// - nvvm_sin_approx_{f,ftz_f}
// - nvvm_sqrt_approx_{f,ftz_f}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
index ff4d6469..ee575e3 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
@@ -207,8 +207,7 @@ template <> struct MappingTraits<WebAssemblyFunctionInfo> {
template <> struct CustomMappingTraits<BBNumberMap> {
static void inputOne(IO &YamlIO, StringRef Key,
BBNumberMap &SrcToUnwindDest) {
- YamlIO.mapRequired(Key.str().c_str(),
- SrcToUnwindDest[std::atoi(Key.str().c_str())]);
+ YamlIO.mapRequired(Key, SrcToUnwindDest[std::atoi(Key.str().c_str())]);
}
static void output(IO &YamlIO, BBNumberMap &SrcToUnwindDest) {
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 007074c..133406b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -22861,6 +22861,13 @@ static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
if (!OpVT.isScalarInteger() || OpSize < 128)
return SDValue();
+ // Don't do this if we're not supposed to use the FPU.
+ bool NoImplicitFloatOps =
+ DAG.getMachineFunction().getFunction().hasFnAttribute(
+ Attribute::NoImplicitFloat);
+ if (Subtarget.useSoftFloat() || NoImplicitFloatOps)
+ return SDValue();
+
// Ignore a comparison with zero because that gets special treatment in
// EmitTest(). But make an exception for the special case of a pair of
// logically-combined vector-sized operands compared to zero. This pattern may
@@ -22883,13 +22890,9 @@ static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
// Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
// Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
// Otherwise use PCMPEQ (plus AND) and mask testing.
- bool NoImplicitFloatOps =
- DAG.getMachineFunction().getFunction().hasFnAttribute(
- Attribute::NoImplicitFloat);
- if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
- ((OpSize == 128 && Subtarget.hasSSE2()) ||
- (OpSize == 256 && Subtarget.hasAVX()) ||
- (OpSize == 512 && Subtarget.useAVX512Regs()))) {
+ if ((OpSize == 128 && Subtarget.hasSSE2()) ||
+ (OpSize == 256 && Subtarget.hasAVX()) ||
+ (OpSize == 512 && Subtarget.useAVX512Regs())) {
bool HasPT = Subtarget.hasSSE41();
// PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
@@ -53344,105 +53347,6 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-// Look for a RMW operation that only touches one bit of a larger than legal
-// type and fold it to a BTC/BTR/BTS or bit insertion pattern acting on a single
-// i32 sub value.
-static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- using namespace SDPatternMatch;
-
- // Only handle normal stores and its chain was a matching normal load.
- auto *Ld = dyn_cast<LoadSDNode>(St->getChain());
- if (!ISD::isNormalStore(St) || !St->isSimple() || !Ld ||
- !ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
- Ld->getBasePtr() != St->getBasePtr() ||
- Ld->getOffset() != St->getOffset())
- return SDValue();
-
- SDValue LoadVal(Ld, 0);
- SDValue StoredVal = St->getValue();
- EVT VT = StoredVal.getValueType();
-
- // Only narrow larger than legal scalar integers.
- if (!VT.isScalarInteger() ||
- VT.getSizeInBits() <= (Subtarget.is64Bit() ? 64 : 32))
- return SDValue();
-
- // BTR: X & ~(1 << ShAmt)
- // BTS: X | (1 << ShAmt)
- // BTC: X ^ (1 << ShAmt)
- //
- // BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt)
- SDValue InsertBit, ShAmt;
- if (!StoredVal.hasOneUse() ||
- !(sd_match(StoredVal, m_And(m_Specific(LoadVal),
- m_Not(m_Shl(m_One(), m_Value(ShAmt))))) ||
- sd_match(StoredVal,
- m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
- sd_match(StoredVal,
- m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
- sd_match(StoredVal,
- m_Or(m_And(m_Specific(LoadVal),
- m_Not(m_Shl(m_One(), m_Value(ShAmt)))),
- m_Shl(m_Value(InsertBit), m_Deferred(ShAmt))))))
- return SDValue();
-
- // Ensure the shift amount is in bounds.
- KnownBits KnownAmt = DAG.computeKnownBits(ShAmt);
- if (KnownAmt.getMaxValue().uge(VT.getSizeInBits()))
- return SDValue();
-
- // If we're inserting a bit then it must be the LSB.
- if (InsertBit) {
- KnownBits KnownInsert = DAG.computeKnownBits(InsertBit);
- if (KnownInsert.countMinLeadingZeros() < (VT.getSizeInBits() - 1))
- return SDValue();
- }
-
- // Split the shift into an alignment shift that moves the active i32 block to
- // the bottom bits for truncation and a modulo shift that can act on the i32.
- EVT AmtVT = ShAmt.getValueType();
- SDValue AlignAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt,
- DAG.getSignedConstant(-32LL, DL, AmtVT));
- SDValue ModuloAmt =
- DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, DAG.getConstant(31, DL, AmtVT));
- ModuloAmt = DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8);
-
- // Compute the byte offset for the i32 block that is changed by the RMW.
- // combineTruncate will adjust the load for us in a similar way.
- EVT PtrVT = St->getBasePtr().getValueType();
- SDValue PtrBitOfs = DAG.getZExtOrTrunc(AlignAmt, DL, PtrVT);
- SDValue PtrByteOfs = DAG.getNode(ISD::SRL, DL, PtrVT, PtrBitOfs,
- DAG.getShiftAmountConstant(3, PtrVT, DL));
- SDValue NewPtr = DAG.getMemBasePlusOffset(St->getBasePtr(), PtrByteOfs, DL,
- SDNodeFlags::NoUnsignedWrap);
-
- // Reconstruct the BTC/BTR/BTS pattern for the i32 block and store.
- SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt);
- X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
-
- SDValue Mask = DAG.getNode(ISD::SHL, DL, MVT::i32,
- DAG.getConstant(1, DL, MVT::i32), ModuloAmt);
-
- SDValue Res;
- if (InsertBit) {
- SDValue BitMask =
- DAG.getNode(ISD::SHL, DL, MVT::i32,
- DAG.getZExtOrTrunc(InsertBit, DL, MVT::i32), ModuloAmt);
- Res =
- DAG.getNode(ISD::AND, DL, MVT::i32, X, DAG.getNOT(DL, Mask, MVT::i32));
- Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, BitMask);
- } else {
- if (StoredVal.getOpcode() == ISD::AND)
- Mask = DAG.getNOT(DL, Mask, MVT::i32);
- Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask);
- }
-
- return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(),
- Align(), St->getMemOperand()->getFlags());
-}
-
static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -53669,9 +53573,6 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
}
}
- if (SDValue R = narrowBitOpRMW(St, dl, DAG, Subtarget))
- return R;
-
// Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC)
// store(cmov(x, load(p), CC), p) to cstore(x, p, InvertCC)
if ((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
@@ -54604,9 +54505,8 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
// truncation, see if we can convert the shift into a pointer offset instead.
// Limit this to normal (non-ext) scalar integer loads.
if (SrcVT.isScalarInteger() && Src.getOpcode() == ISD::SRL &&
- Src.hasOneUse() && ISD::isNormalLoad(Src.getOperand(0).getNode()) &&
- (Src.getOperand(0).hasOneUse() ||
- !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, SrcVT))) {
+ Src.hasOneUse() && Src.getOperand(0).hasOneUse() &&
+ ISD::isNormalLoad(Src.getOperand(0).getNode())) {
auto *Ld = cast<LoadSDNode>(Src.getOperand(0));
if (Ld->isSimple() && VT.isByteSized() &&
isPowerOf2_64(VT.getSizeInBits())) {
@@ -56406,7 +56306,6 @@ static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC,
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
- using namespace SDPatternMatch;
const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
const SDValue LHS = N->getOperand(0);
const SDValue RHS = N->getOperand(1);
@@ -56465,37 +56364,6 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
- // If we're performing a bit test on a larger than legal type, attempt
- // to (aligned) shift down the value to the bottom 32-bits and then
- // perform the bittest on the i32 value.
- // ICMP_ZERO(AND(X,SHL(1,IDX)))
- // --> ICMP_ZERO(AND(TRUNC(SRL(X,AND(IDX,-32))),SHL(1,AND(IDX,31))))
- if (isNullConstant(RHS) &&
- OpVT.getScalarSizeInBits() > (Subtarget.is64Bit() ? 64 : 32)) {
- SDValue X, ShAmt;
- if (sd_match(LHS, m_OneUse(m_And(m_Value(X),
- m_Shl(m_One(), m_Value(ShAmt)))))) {
- // Only attempt this if the shift amount is known to be in bounds.
- KnownBits KnownAmt = DAG.computeKnownBits(ShAmt);
- if (KnownAmt.getMaxValue().ult(OpVT.getScalarSizeInBits())) {
- EVT AmtVT = ShAmt.getValueType();
- SDValue AlignAmt =
- DAG.getNode(ISD::AND, DL, AmtVT, ShAmt,
- DAG.getSignedConstant(-32LL, DL, AmtVT));
- SDValue ModuloAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt,
- DAG.getConstant(31, DL, AmtVT));
- SDValue Mask = DAG.getNode(
- ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32),
- DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8));
- X = DAG.getNode(ISD::SRL, DL, OpVT, X, AlignAmt);
- X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
- X = DAG.getNode(ISD::AND, DL, MVT::i32, X, Mask);
- return DAG.getSetCC(DL, VT, X, DAG.getConstant(0, DL, MVT::i32),
- CC);
- }
- }
- }
-
// cmpeq(trunc(x),C) --> cmpeq(x,C)
// cmpne(trunc(x),C) --> cmpne(x,C)
// iff x upper bits are zero.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 3fed003..5298728 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -280,8 +280,7 @@ public:
VPValue *createElementCount(Type *Ty, ElementCount EC) {
VPlan &Plan = *getInsertBlock()->getPlan();
- VPValue *RuntimeEC =
- Plan.getOrAddLiveIn(ConstantInt::get(Ty, EC.getKnownMinValue()));
+ VPValue *RuntimeEC = Plan.getConstantInt(Ty, EC.getKnownMinValue());
if (EC.isScalable()) {
VPValue *VScale = createNaryOp(VPInstruction::VScale, {}, Ty);
RuntimeEC = EC.getKnownMinValue() == 1
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 25bf49d..e5c3f17 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7752,8 +7752,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
if (CM.isPredicatedInst(I)) {
SmallVector<VPValue *> Ops(Operands);
VPValue *Mask = getBlockInMask(Builder.getInsertBlock());
- VPValue *One =
- Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
+ VPValue *One = Plan.getConstantInt(I->getType(), 1u);
auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
Ops[1] = SafeRHS;
return new VPWidenRecipe(*I, Ops);
@@ -7806,11 +7805,10 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
}
case Instruction::ExtractValue: {
SmallVector<VPValue *> NewOps(Operands);
- Type *I32Ty = IntegerType::getInt32Ty(I->getContext());
auto *EVI = cast<ExtractValueInst>(I);
assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index");
unsigned Idx = EVI->getIndices()[0];
- NewOps.push_back(Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, Idx, false)));
+ NewOps.push_back(Plan.getConstantInt(32, Idx));
return new VPWidenRecipe(*I, NewOps);
}
};
@@ -8179,8 +8177,7 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
"Expected an ADD or SUB operation for predicated partial "
"reductions (because the neutral element in the mask is zero)!");
Cond = getBlockInMask(Builder.getInsertBlock());
- VPValue *Zero =
- Plan.getOrAddLiveIn(ConstantInt::get(Reduction->getType(), 0));
+ VPValue *Zero = Plan.getConstantInt(Reduction->getType(), 0);
BinOp = Builder.createSelect(Cond, BinOp, Zero, Reduction->getDebugLoc());
}
return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond,
@@ -8643,7 +8640,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
} else if (PhiR->isInLoop() && Kind == RecurKind::AddChainWithSubs &&
CurrentLinkI->getOpcode() == Instruction::Sub) {
Type *PhiTy = PhiR->getUnderlyingValue()->getType();
- auto *Zero = Plan->getOrAddLiveIn(ConstantInt::get(PhiTy, 0));
+ auto *Zero = Plan->getConstantInt(PhiTy, 0);
VPWidenRecipe *Sub = new VPWidenRecipe(
Instruction::Sub, {Zero, CurrentLink->getOperand(1)}, {},
VPIRMetadata(), CurrentLinkI->getDebugLoc());
@@ -8857,8 +8854,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
ToDelete.push_back(Select);
// Convert the reduction phi to operate on bools.
- PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
- OrigLoop->getHeader()->getContext())));
+ PhiR->setOperand(0, Plan->getFalse());
continue;
}
@@ -8880,9 +8876,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
unsigned ScaleFactor =
RecipeBuilder.getScalingForReduction(RdxDesc.getLoopExitInstr())
.value_or(1);
- Type *I32Ty = IntegerType::getInt32Ty(PhiTy->getContext());
- auto *ScaleFactorVPV =
- Plan->getOrAddLiveIn(ConstantInt::get(I32Ty, ScaleFactor));
+ auto *ScaleFactorVPV = Plan->getConstantInt(32, ScaleFactor);
VPValue *StartV = PHBuilder.createNaryOp(
VPInstruction::ReductionStartVector,
{PhiR->getStartValue(), Iden, ScaleFactorVPV},
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 1504acf..08c9c15 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -4393,15 +4393,25 @@ public:
}
/// Return a VPValue wrapping i1 true.
- VPValue *getTrue() {
- LLVMContext &Ctx = getContext();
- return getOrAddLiveIn(ConstantInt::getTrue(Ctx));
- }
+ VPValue *getTrue() { return getConstantInt(1, 1); }
/// Return a VPValue wrapping i1 false.
- VPValue *getFalse() {
- LLVMContext &Ctx = getContext();
- return getOrAddLiveIn(ConstantInt::getFalse(Ctx));
+ VPValue *getFalse() { return getConstantInt(1, 0); }
+
+ /// Return a VPValue wrapping a ConstantInt with the given type and value.
+ VPValue *getConstantInt(Type *Ty, uint64_t Val, bool IsSigned = false) {
+ return getOrAddLiveIn(ConstantInt::get(Ty, Val, IsSigned));
+ }
+
+ /// Return a VPValue wrapping a ConstantInt with the given bitwidth and value.
+ VPValue *getConstantInt(unsigned BitWidth, uint64_t Val,
+ bool IsSigned = false) {
+ return getConstantInt(APInt(BitWidth, Val, IsSigned));
+ }
+
+ /// Return a VPValue wrapping a ConstantInt with the given APInt value.
+ VPValue *getConstantInt(const APInt &Val) {
+ return getOrAddLiveIn(ConstantInt::get(getContext(), Val));
}
/// Return the live-in VPValue for \p V, if there is one or nullptr otherwise.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 65688a3..1a66d20 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -612,8 +612,7 @@ void VPlanTransforms::addMiddleCheck(VPlan &Plan,
if (!RequiresScalarEpilogueCheck)
Cmp = Plan.getFalse();
else if (TailFolded)
- Cmp = Plan.getOrAddLiveIn(
- ConstantInt::getTrue(IntegerType::getInt1Ty(Plan.getContext())));
+ Cmp = Plan.getTrue();
else
Cmp = Builder.createICmp(CmpInst::ICMP_EQ, Plan.getTripCount(),
&Plan.getVectorTripCount(), LatchDL, "cmp.n");
@@ -712,8 +711,8 @@ void VPlanTransforms::addMinimumIterationCheck(
// additional overflow check is required before entering the vector loop.
// Get the maximum unsigned value for the type.
- VPValue *MaxUIntTripCount = Plan.getOrAddLiveIn(ConstantInt::get(
- TripCountTy, cast<IntegerType>(TripCountTy)->getMask()));
+ VPValue *MaxUIntTripCount =
+ Plan.getConstantInt(cast<IntegerType>(TripCountTy)->getMask());
VPValue *DistanceToMax = Builder.createNaryOp(
Instruction::Sub, {MaxUIntTripCount, TripCountVPV},
DebugLoc::getUnknown());
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d491d56..6a8231b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -699,8 +699,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) {
continue;
const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
- VPValue *StartV =
- Plan.getOrAddLiveIn(ConstantInt::get(ID.getStep()->getType(), 0));
+ VPValue *StartV = Plan.getConstantInt(ID.getStep()->getType(), 0);
VPValue *StepV = PtrIV->getOperand(1);
VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
@@ -836,7 +835,7 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
// changed it means the exit is using the incremented value, so we need to
// add the step.
if (Incoming != WideIV) {
- VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(CanonicalIVType, 1));
+ VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
}
@@ -882,7 +881,7 @@ static VPValue *optimizeLatchExitInductionUser(
return B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape");
if (ScalarTy->isPointerTy()) {
Type *StepTy = TypeInfo.inferScalarType(Step);
- auto *Zero = Plan.getOrAddLiveIn(ConstantInt::get(StepTy, 0));
+ auto *Zero = Plan.getConstantInt(StepTy, 0);
return B.createPtrAdd(EndValue,
B.createNaryOp(Instruction::Sub, {Zero, Step}),
DebugLoc::getUnknown(), "ind.escape");
@@ -1574,9 +1573,9 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
continue;
// Update IV operands and comparison bound to use new narrower type.
- auto *NewStart = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 0));
+ auto *NewStart = Plan.getConstantInt(NewIVTy, 0);
WideIV->setStartValue(NewStart);
- auto *NewStep = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 1));
+ auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
WideIV->setStepValue(NewStep);
auto *NewBTC = new VPWidenCastRecipe(
@@ -1695,8 +1694,7 @@ static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF,
// When using wide lane masks, the return type of the get.active.lane.mask
// intrinsic is VF x UF (last operand).
- VPValue *ALMMultiplier =
- Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::getInt64Ty(Ctx), UF));
+ VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
EntryALM->setOperand(2, ALMMultiplier);
LoopALM->setOperand(2, ALMMultiplier);
@@ -2403,7 +2401,7 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
// Create the active lane mask instruction in the VPlan preheader.
VPValue *ALMMultiplier =
- Plan.getOrAddLiveIn(ConstantInt::get(TopRegion->getCanonicalIVType(), 1));
+ Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
{EntryIncrement, TC, ALMMultiplier}, DL,
"active.lane.mask.entry");
@@ -2790,8 +2788,7 @@ void VPlanTransforms::addExplicitVectorLength(
if (MaxSafeElements) {
// Support for MaxSafeDist for correct loop emission.
- VPValue *AVLSafe =
- Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, *MaxSafeElements));
+ VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
"safe_avl");
@@ -2904,9 +2901,8 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
VPBuilder Builder(LatchExitingBr);
- VPValue *Cmp =
- Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
- Plan.getOrAddLiveIn(ConstantInt::getNullValue(AVLTy)));
+ VPValue *Cmp = Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
+ Plan.getConstantInt(AVLTy, 0));
Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp);
LatchExitingBr->eraseFromParent();
}
@@ -2930,8 +2926,7 @@ void VPlanTransforms::replaceSymbolicStrides(
// Only handle constant strides for now.
continue;
- auto *CI =
- Plan.getOrAddLiveIn(ConstantInt::get(Stride->getType(), *StrideConst));
+ auto *CI = Plan.getConstantInt(*StrideConst);
if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
@@ -2946,7 +2941,7 @@ void VPlanTransforms::replaceSymbolicStrides(
unsigned BW = U->getType()->getScalarSizeInBits();
APInt C =
isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
- VPValue *CI = Plan.getOrAddLiveIn(ConstantInt::get(U->getType(), C));
+ VPValue *CI = Plan.getConstantInt(C);
StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
}
RewriteMap[StrideV] = PSE.getSCEV(StrideV);
@@ -3125,8 +3120,7 @@ void VPlanTransforms::createInterleaveGroups(
DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
IG->getIndex(IRInsertPos),
/*IsSigned=*/true);
- VPValue *OffsetVPV =
- Plan.getOrAddLiveIn(ConstantInt::get(Plan.getContext(), -Offset));
+ VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
VPBuilder B(InsertPos);
Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
}
@@ -3867,8 +3861,7 @@ void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
VPBuilder Builder(VectorPH, VectorPH->begin());
auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
auto *TCMO = Builder.createNaryOp(
- Instruction::Sub,
- {Plan.getTripCount(), Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))},
+ Instruction::Sub, {Plan.getTripCount(), Plan.getConstantInt(TCTy, 1)},
DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
BTC->replaceAllUsesWith(TCMO);
}
@@ -3993,9 +3986,8 @@ void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
if (TailByMasking) {
TC = Builder.createNaryOp(
Instruction::Add,
- {TC, Builder.createNaryOp(
- Instruction::Sub,
- {Step, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))})},
+ {TC, Builder.createNaryOp(Instruction::Sub,
+ {Step, Plan.getConstantInt(TCTy, 1)})},
DebugLoc::getCompilerGenerated(), "n.rnd.up");
}
@@ -4017,8 +4009,8 @@ void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
if (RequiresScalarEpilogue) {
assert(!TailByMasking &&
"requiring scalar epilogue is not supported with fail folding");
- VPValue *IsZero = Builder.createICmp(
- CmpInst::ICMP_EQ, R, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 0)));
+ VPValue *IsZero =
+ Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getConstantInt(TCTy, 0));
R = Builder.createSelect(IsZero, Step, R);
}
@@ -4056,7 +4048,7 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
}
VF.replaceAllUsesWith(RuntimeVF);
- VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF()));
+ VPValue *UF = Plan.getConstantInt(TCTy, Plan.getUF());
VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF});
VFxUF.replaceAllUsesWith(MulByUF);
}
@@ -4346,7 +4338,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
} else {
Inc->setOperand(1, UF);
Plan.getVF().replaceAllUsesWith(
- Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
+ Plan.getConstantInt(CanIV->getScalarType(), 1));
}
removeDeadRecipes(Plan);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index f15113c..d6a0028 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -68,9 +68,9 @@ class UnrollState {
void unrollWidenInductionByUF(VPWidenInductionRecipe *IV,
VPBasicBlock::iterator InsertPtForPhi);
- VPValue *getConstantVPV(unsigned Part) {
+ VPValue *getConstantInt(unsigned Part) {
Type *CanIVIntTy = Plan.getVectorLoopRegion()->getCanonicalIVType();
- return Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, Part));
+ return Plan.getConstantInt(CanIVIntTy, Part);
}
public:
@@ -137,7 +137,7 @@ void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) {
for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) {
remapOperands(&PartIR, Part);
if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(&PartIR)) {
- ScalarIVSteps->addOperand(getConstantVPV(Part));
+ ScalarIVSteps->addOperand(getConstantInt(Part));
}
addRecipeForPart(&Part0R, &PartIR, Part);
@@ -249,7 +249,7 @@ void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
for (unsigned Part = 1; Part != UF; ++Part)
VPV2Parts[VPI][Part - 1] = StartV;
}
- Copy->addOperand(getConstantVPV(Part));
+ Copy->addOperand(getConstantInt(Part));
} else {
assert(isa<VPActiveLaneMaskPHIRecipe>(R) &&
"unexpected header phi recipe not needing unrolled part");
@@ -318,7 +318,7 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
VPVectorPointerRecipe, VPVectorEndPointerRecipe>(Copy) ||
match(Copy,
m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>()))
- Copy->addOperand(getConstantVPV(Part));
+ Copy->addOperand(getConstantInt(Part));
if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe>(R))
Copy->setOperand(0, R.getOperand(0));
@@ -474,8 +474,7 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
if (LaneDefs != Def2LaneDefs.end())
return LaneDefs->second[Lane.getKnownLane()];
- VPValue *Idx =
- Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane()));
+ VPValue *Idx = Plan.getConstantInt(IdxTy, Lane.getKnownLane());
return Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
}
@@ -509,8 +508,7 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
cast<VPInstruction>(Op)->getOperand(Lane.getKnownLane()));
continue;
}
- VPValue *Idx =
- Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane()));
+ VPValue *Idx = Plan.getConstantInt(IdxTy, Lane.getKnownLane());
VPValue *Ext = Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
NewOps.push_back(Ext);
}
diff --git a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
index 362586a..4fc506f 100644
--- a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
+++ b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
@@ -87,6 +87,11 @@ declare void @llvm.nvvm.barrier(i32, i32)
declare void @llvm.nvvm.barrier.sync(i32)
declare void @llvm.nvvm.barrier.sync.cnt(i32, i32)
+declare float @llvm.nvvm.ex2.approx.f(float)
+declare double @llvm.nvvm.ex2.approx.d(double)
+declare <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half>)
+declare float @llvm.nvvm.ex2.approx.ftz.f(float)
+
; CHECK-LABEL: @simple_upgrade
define void @simple_upgrade(i32 %a, i64 %b, i16 %c) {
; CHECK: call i32 @llvm.bitreverse.i32(i32 %a)
@@ -355,3 +360,15 @@ define void @cta_barriers(i32 %x, i32 %y) {
call void @llvm.nvvm.barrier.sync.cnt(i32 %x, i32 %y)
ret void
}
+
+define void @nvvm_ex2_approx(float %a, double %b, half %c, <2 x half> %d) {
+; CHECK: call float @llvm.nvvm.ex2.approx.f32(float %a)
+; CHECK: call double @llvm.nvvm.ex2.approx.f64(double %b)
+; CHECK: call <2 x half> @llvm.nvvm.ex2.approx.v2f16(<2 x half> %d)
+; CHECK: call float @llvm.nvvm.ex2.approx.ftz.f32(float %a)
+ %r1 = call float @llvm.nvvm.ex2.approx.f(float %a)
+ %r2 = call double @llvm.nvvm.ex2.approx.d(double %b)
+ %r3 = call <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half> %d)
+ %r4 = call float @llvm.nvvm.ex2.approx.ftz.f(float %a)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
index 02d0e52..6facdfd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
@@ -104,109 +104,110 @@ define amdgpu_cs <4 x i32> @abs_sgpr_v4i32(<4 x i32> inreg %arg) {
ret <4 x i32> %res
}
-define amdgpu_cs i16 @abs_vgpr_i16(i16 %arg) {
+define i16 @abs_vgpr_i16(i16 %arg) {
; GFX6-LABEL: abs_vgpr_i16:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v0
; GFX6-NEXT: v_max_i32_e32 v0, v0, v1
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_i16:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_sub_u16_e32 v1, 0, v0
; GFX8-NEXT: v_max_i16_e32 v0, v0, v1
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_i16:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_sub_nc_u16 v1, 0, v0
; GFX10-NEXT: v_max_i16 v0, v0, v1
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_i16:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_sub_nc_u16 v1, 0, v0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_max_i16 v0, v0, v1
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call i16 @llvm.abs.i16(i16 %arg, i1 false)
ret i16 %res
}
-define amdgpu_cs i32 @abs_vgpr_i32(i32 %arg) {
+define i32 @abs_vgpr_i32(i32 %arg) {
; GFX6-LABEL: abs_vgpr_i32:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v0
; GFX6-NEXT: v_max_i32_e32 v0, v0, v1
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_i32:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 0, v0
; GFX8-NEXT: v_max_i32_e32 v0, v0, v1
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_i32:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0
; GFX10-NEXT: v_max_i32_e32 v0, v0, v1
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_sub_nc_u32_e32 v1, 0, v0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_max_i32_e32 v0, v0, v1
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call i32 @llvm.abs.i32(i32 %arg, i1 false)
ret i32 %res
}
-define amdgpu_cs i64 @abs_vgpr_i64(i64 %arg) {
+define i64 @abs_vgpr_i64(i64 %arg) {
; GFX6-LABEL: abs_vgpr_i64:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
; GFX6-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX6-NEXT: v_xor_b32_e32 v1, v1, v2
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_i64:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
; GFX8-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX8-NEXT: v_xor_b32_e32 v1, v1, v2
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_i64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
; GFX10-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX10-NEXT: v_xor_b32_e32 v1, v1, v2
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_i64:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_mov_b32_e32 v3, v2
@@ -214,17 +215,15 @@ define amdgpu_cs i64 @abs_vgpr_i64(i64 %arg) {
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX1250-NEXT: v_xor_b32_e32 v1, v1, v2
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call i64 @llvm.abs.i64(i64 %arg, i1 false)
ret i64 %res
}
-define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
+define <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
; GFX6-LABEL: abs_vgpr_v4i32:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v0
; GFX6-NEXT: v_max_i32_e32 v0, v0, v4
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v1
@@ -233,14 +232,11 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
; GFX6-NEXT: v_max_i32_e32 v2, v2, v4
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
; GFX6-NEXT: v_max_i32_e32 v3, v3, v4
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: v_readfirstlane_b32 s2, v2
-; GFX6-NEXT: v_readfirstlane_b32 s3, v3
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_v4i32:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v0
; GFX8-NEXT: v_max_i32_e32 v0, v0, v4
; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v1
@@ -249,14 +245,11 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
; GFX8-NEXT: v_max_i32_e32 v2, v2, v4
; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v3
; GFX8-NEXT: v_max_i32_e32 v3, v3, v4
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
-; GFX8-NEXT: v_readfirstlane_b32 s2, v2
-; GFX8-NEXT: v_readfirstlane_b32 s3, v3
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_v4i32:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_sub_nc_u32_e32 v4, 0, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v5, 0, v1
; GFX10-NEXT: v_sub_nc_u32_e32 v6, 0, v2
@@ -265,14 +258,12 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
; GFX10-NEXT: v_max_i32_e32 v1, v1, v5
; GFX10-NEXT: v_max_i32_e32 v2, v2, v6
; GFX10-NEXT: v_max_i32_e32 v3, v3, v7
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: v_readfirstlane_b32 s2, v2
-; GFX10-NEXT: v_readfirstlane_b32 s3, v3
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_v4i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_sub_nc_u32 v4, 0, v0 :: v_dual_sub_nc_u32 v5, 0, v1
; GFX1250-NEXT: v_dual_sub_nc_u32 v6, 0, v2 :: v_dual_sub_nc_u32 v7, 0, v3
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -281,13 +272,7 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_max_i32_e32 v2, v2, v6
; GFX1250-NEXT: v_max_i32_e32 v3, v3, v7
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_readfirstlane_b32 s2, v2
-; GFX1250-NEXT: v_readfirstlane_b32 s3, v3
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %arg, i1 false)
ret <4 x i32> %res
}
@@ -304,44 +289,43 @@ define amdgpu_cs <2 x i8> @abs_sgpr_v2i8(<2 x i8> inreg %arg) {
ret <2 x i8> %res
}
-define amdgpu_cs <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) {
+define <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) {
; GFX6-LABEL: abs_vgpr_v2i8:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v0
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX6-NEXT: v_max_i32_e32 v0, v0, v2
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v1
; GFX6-NEXT: v_max_i32_e32 v1, v1, v2
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_v2i8:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: v_sub_u16_sdwa v3, v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_sub_u16_sdwa v2, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_max_i16_sdwa v0, sext(v0), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_max_i16_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_v2i8:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX10-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX10-NEXT: v_sub_nc_u16 v2, 0, v0
; GFX10-NEXT: v_sub_nc_u16 v3, 0, v1
; GFX10-NEXT: v_max_i16 v0, v0, v2
; GFX10-NEXT: v_max_i16 v1, v1, v3
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_v2i8:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -350,10 +334,7 @@ define amdgpu_cs <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) {
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_max_i16 v0, v0, v2
; GFX1250-NEXT: v_max_i16 v1, v1, v3
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call <2 x i8> @llvm.abs.v2i8(<2 x i8> %arg, i1 false)
ret <2 x i8> %res
}
@@ -372,9 +353,10 @@ define amdgpu_cs <3 x i8> @abs_sgpr_v3i8(<3 x i8> inreg %arg) {
ret <3 x i8> %res
}
-define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
+define <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
; GFX6-LABEL: abs_vgpr_v3i8:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8
@@ -384,13 +366,11 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v2
; GFX6-NEXT: v_max_i32_e32 v2, v2, v3
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: v_readfirstlane_b32 s2, v2
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_v3i8:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: v_sub_u16_sdwa v4, v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_max_i16_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -398,13 +378,11 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
; GFX8-NEXT: v_sub_u16_sdwa v3, v3, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_max_i16_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_max_i16_sdwa v2, sext(v2), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
-; GFX8-NEXT: v_readfirstlane_b32 s2, v2
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_v3i8:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX10-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX10-NEXT: v_bfe_i32 v2, v2, 0, 8
@@ -414,13 +392,12 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
; GFX10-NEXT: v_max_i16 v0, v0, v3
; GFX10-NEXT: v_max_i16 v1, v1, v4
; GFX10-NEXT: v_max_i16 v2, v2, v5
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: v_readfirstlane_b32 s2, v2
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_v3i8:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX1250-NEXT: v_bfe_i32 v2, v2, 0, 8
@@ -433,12 +410,7 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_max_i16 v1, v1, v4
; GFX1250-NEXT: v_max_i16 v2, v2, v5
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX1250-NEXT: v_readfirstlane_b32 s2, v2
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call <3 x i8> @llvm.abs.v3i8(<3 x i8> %arg, i1 false)
ret <3 x i8> %res
}
@@ -485,44 +457,44 @@ define amdgpu_cs <2 x i16> @abs_sgpr_v2i16(<2 x i16> inreg %arg) {
ret <2 x i16> %res
}
-define amdgpu_cs <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) {
+define <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) {
; GFX6-LABEL: abs_vgpr_v2i16:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v0
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: v_max_i32_e32 v0, v0, v2
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v1
; GFX6-NEXT: v_max_i32_e32 v1, v1, v2
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_v2i16:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: v_sub_u16_e32 v1, 0, v0
; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_max_i16_e32 v1, v0, v1
; GFX8-NEXT: v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_v2i16:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v1, 0, v0
; GFX10-NEXT: v_pk_max_i16 v0, v0, v1
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_v2i16:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_pk_sub_i16 v1, 0, v0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_pk_max_i16 v0, v0, v1
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %arg, i1 false)
ret <2 x i16> %res
}
@@ -576,9 +548,10 @@ define amdgpu_cs <3 x i16> @abs_sgpr_v3i16(<3 x i16> inreg %arg) {
ret <3 x i16> %res
}
-define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) {
+define <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) {
; GFX6-LABEL: abs_vgpr_v3i16:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
@@ -588,13 +561,11 @@ define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) {
; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v2
; GFX6-NEXT: v_max_i32_e32 v2, v2, v3
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: v_readfirstlane_b32 s2, v2
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_v3i16:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: v_sub_u16_e32 v2, 0, v0
; GFX8-NEXT: v_sub_u16_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -603,31 +574,27 @@ define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) {
; GFX8-NEXT: v_max_i16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
; GFX8-NEXT: v_max_i16_e32 v1, v1, v4
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_v3i16:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v2, 0, v0
; GFX10-NEXT: v_sub_nc_u16 v3, 0, v1
; GFX10-NEXT: v_pk_max_i16 v0, v0, v2
; GFX10-NEXT: v_max_i16 v1, v1, v3
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_v3i16:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_pk_sub_i16 v2, 0, v0
; GFX1250-NEXT: v_sub_nc_u16 v3, 0, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_pk_max_i16 v0, v0, v2
; GFX1250-NEXT: v_max_i16 v1, v1, v3
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call <3 x i16> @llvm.abs.v3i16(<3 x i16> %arg, i1 false)
ret <3 x i16> %res
}
diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
index 1b8e126..a1381ec 100644
--- a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
@@ -945,7 +945,6 @@ body: |
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
...
-# FIXME: Missing S_WAIT_XCNT before overwriting vgpr0.
---
name: wait_kmcnt_with_outstanding_vmem_2
tracksRegLiveness: true
@@ -971,6 +970,7 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: S_WAIT_KMCNT 0
; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
+ ; GCN-NEXT: S_WAIT_XCNT 0
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
bb.0:
liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
@@ -986,6 +986,180 @@ body: |
...
---
+name: wait_kmcnt_and_wait_loadcnt
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ ; GCN-LABEL: name: wait_kmcnt_and_wait_loadcnt
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: liveins: $sgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_WAIT_KMCNT 0
+ ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
+ ; GCN-NEXT: S_WAIT_LOADCNT 0
+ ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+ bb.0:
+ liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+ $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ S_CBRANCH_SCC1 %bb.2, implicit $scc
+ bb.1:
+ liveins: $vgpr0_vgpr1, $sgpr2
+ $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ bb.2:
+ liveins: $sgpr2
+ $sgpr2 = S_MOV_B32 $sgpr2
+ $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+...
+
+---
+name: implicit_handling_of_pending_vmem_group
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ ; GCN-LABEL: name: implicit_handling_of_pending_vmem_group
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_WAIT_KMCNT 0
+ ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
+ ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: S_WAIT_XCNT 0
+ ; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr0
+ bb.0:
+ liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+ $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ S_CBRANCH_SCC1 %bb.2, implicit $scc
+ bb.1:
+ liveins: $vgpr0_vgpr1, $sgpr2
+ $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ bb.2:
+ liveins: $sgpr0_sgpr1, $sgpr2
+ $sgpr2 = S_MOV_B32 $sgpr2
+ $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ $sgpr0 = S_MOV_B32 $sgpr0
+...
+
+---
+name: pending_vmem_event_between_block
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ ; GCN-LABEL: name: pending_vmem_event_between_block
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_WAIT_KMCNT 0
+ ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
+ ; GCN-NEXT: S_WAIT_XCNT 1
+ ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: S_WAIT_XCNT 0
+ ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr0
+ bb.0:
+ liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+ $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ S_CBRANCH_SCC1 %bb.2, implicit $scc
+ bb.1:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2
+ $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ $vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec
+ bb.2:
+ liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2
+ $sgpr2 = S_MOV_B32 $sgpr2
+ $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+ $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+ $sgpr0 = S_MOV_B32 $sgpr0
+...
+
+---
+name: flushing_vmem_cnt_on_block_entry
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ ; GCN-LABEL: name: flushing_vmem_cnt_on_block_entry
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_WAIT_XCNT 0
+ ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr0
+ bb.0:
+ liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+ $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ S_CBRANCH_SCC1 %bb.2, implicit $scc
+ bb.1:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2
+ $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ $vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec
+ bb.2:
+ liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2
+ $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+ $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+ $sgpr0 = S_MOV_B32 $sgpr0
+...
+
+---
name: wait_loadcnt_with_outstanding_smem
tracksRegLiveness: true
machineFunctionInfo:
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll b/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll
index ba2118f..b3155c9 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll
@@ -106,6 +106,69 @@ define void @ctlz_v4i64(ptr %src, ptr %dst) nounwind {
ret void
}
+define void @not_ctlz_v32i8(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v32i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xvld $xr0, $a0, 0
+; CHECK-NEXT: xvxori.b $xr0, $xr0, 255
+; CHECK-NEXT: xvclz.b $xr0, $xr0
+; CHECK-NEXT: xvst $xr0, $a1, 0
+; CHECK-NEXT: ret
+ %v = load <32 x i8>, ptr %src
+ %neg = xor <32 x i8> %v, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %res = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %neg, i1 false)
+ store <32 x i8> %res, ptr %dst
+ ret void
+}
+
+define void @not_ctlz_v16i16(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xvld $xr0, $a0, 0
+; CHECK-NEXT: xvrepli.b $xr1, -1
+; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT: xvclz.h $xr0, $xr0
+; CHECK-NEXT: xvst $xr0, $a1, 0
+; CHECK-NEXT: ret
+ %v = load <16 x i16>, ptr %src
+ %neg = xor <16 x i16> %v, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+ %res = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %neg, i1 false)
+ store <16 x i16> %res, ptr %dst
+ ret void
+}
+
+define void @not_ctlz_v8i32(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xvld $xr0, $a0, 0
+; CHECK-NEXT: xvrepli.b $xr1, -1
+; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT: xvclz.w $xr0, $xr0
+; CHECK-NEXT: xvst $xr0, $a1, 0
+; CHECK-NEXT: ret
+ %v = load <8 x i32>, ptr %src
+ %neg = xor <8 x i32> %v, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %res = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %neg, i1 false)
+ store <8 x i32> %res, ptr %dst
+ ret void
+}
+
+define void @not_ctlz_v4i64(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v4i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xvld $xr0, $a0, 0
+; CHECK-NEXT: xvrepli.b $xr1, -1
+; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT: xvclz.d $xr0, $xr0
+; CHECK-NEXT: xvst $xr0, $a1, 0
+; CHECK-NEXT: ret
+ %v = load <4 x i64>, ptr %src
+ %neg = xor <4 x i64> %v, <i64 -1, i64 -1, i64 -1, i64 -1>
+ %res = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %neg, i1 false)
+ store <4 x i64> %res, ptr %dst
+ ret void
+}
+
declare <32 x i8> @llvm.ctpop.v32i8(<32 x i8>)
declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>)
declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>)
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll b/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll
index a9a38e8..6ac7d51 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll
@@ -106,6 +106,69 @@ define void @ctlz_v2i64(ptr %src, ptr %dst) nounwind {
ret void
}
+define void @not_ctlz_v16i8(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vld $vr0, $a0, 0
+; CHECK-NEXT: vxori.b $vr0, $vr0, 255
+; CHECK-NEXT: vclz.b $vr0, $vr0
+; CHECK-NEXT: vst $vr0, $a1, 0
+; CHECK-NEXT: ret
+ %v = load <16 x i8>, ptr %src
+ %neg = xor <16 x i8> %v, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %res = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %neg, i1 false)
+ store <16 x i8> %res, ptr %dst
+ ret void
+}
+
+define void @not_ctlz_v8i16(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vld $vr0, $a0, 0
+; CHECK-NEXT: vrepli.b $vr1, -1
+; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1
+; CHECK-NEXT: vclz.h $vr0, $vr0
+; CHECK-NEXT: vst $vr0, $a1, 0
+; CHECK-NEXT: ret
+ %v = load <8 x i16>, ptr %src
+ %neg = xor <8 x i16> %v, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+ %res = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %neg, i1 false)
+ store <8 x i16> %res, ptr %dst
+ ret void
+}
+
+define void @not_ctlz_v4i32(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vld $vr0, $a0, 0
+; CHECK-NEXT: vrepli.b $vr1, -1
+; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1
+; CHECK-NEXT: vclz.w $vr0, $vr0
+; CHECK-NEXT: vst $vr0, $a1, 0
+; CHECK-NEXT: ret
+ %v = load <4 x i32>, ptr %src
+ %neg = xor <4 x i32> %v, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %neg, i1 false)
+ store <4 x i32> %res, ptr %dst
+ ret void
+}
+
+define void @not_ctlz_v2i64(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vld $vr0, $a0, 0
+; CHECK-NEXT: vrepli.b $vr1, -1
+; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1
+; CHECK-NEXT: vclz.d $vr0, $vr0
+; CHECK-NEXT: vst $vr0, $a1, 0
+; CHECK-NEXT: ret
+ %v = load <2 x i64>, ptr %src
+ %neg = xor <2 x i64> %v, <i64 -1, i64 -1>
+ %res = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %neg, i1 false)
+ store <2 x i64> %res, ptr %dst
+ ret void
+}
+
declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>)
declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>)
declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>)
diff --git a/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll b/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll
new file mode 100644
index 0000000..9a806a1
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll
@@ -0,0 +1,758 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx --verify-machineinstrs < %s \
+; RUN: | FileCheck --check-prefix=LA32 %s
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx --verify-machineinstrs < %s \
+; RUN: | FileCheck --check-prefix=LA64 %s
+
+%struct.S = type { i64, i64, i8 }
+%struct.F = type { float, double, float }
+%struct.V = type { <4 x i32>, <4 x i32>, <16 x i16> }
+
+define void @sink_fold_i64(i64 %k, i64 %n, ptr %a) nounwind {
+; LA32-LABEL: sink_fold_i64:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: addi.w $sp, $sp, -48
+; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s5, $sp, 16 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s6, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT: move $s0, $a3
+; LA32-NEXT: move $s1, $a2
+; LA32-NEXT: slli.w $a1, $a0, 4
+; LA32-NEXT: alsl.w $a0, $a0, $a1, 3
+; LA32-NEXT: add.w $a0, $a4, $a0
+; LA32-NEXT: sltui $a1, $a3, 1
+; LA32-NEXT: slti $a2, $a3, 0
+; LA32-NEXT: masknez $a2, $a2, $a1
+; LA32-NEXT: sltui $a3, $s1, 1
+; LA32-NEXT: maskeqz $a1, $a3, $a1
+; LA32-NEXT: or $a1, $a1, $a2
+; LA32-NEXT: addi.w $s2, $a0, 8
+; LA32-NEXT: bnez $a1, .LBB0_3
+; LA32-NEXT: # %bb.1: # %for.body.preheader
+; LA32-NEXT: move $fp, $a4
+; LA32-NEXT: move $s4, $zero
+; LA32-NEXT: move $s5, $zero
+; LA32-NEXT: move $s3, $zero
+; LA32-NEXT: move $s6, $zero
+; LA32-NEXT: .p2align 4, , 16
+; LA32-NEXT: .LBB0_2: # %for.body
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: move $a0, $fp
+; LA32-NEXT: bl f
+; LA32-NEXT: ld.w $a0, $s2, 4
+; LA32-NEXT: ld.w $a1, $s2, 0
+; LA32-NEXT: add.w $a0, $a0, $s6
+; LA32-NEXT: add.w $s3, $a1, $s3
+; LA32-NEXT: sltu $a1, $s3, $a1
+; LA32-NEXT: addi.w $s4, $s4, 1
+; LA32-NEXT: sltui $a2, $s4, 1
+; LA32-NEXT: add.w $s5, $s5, $a2
+; LA32-NEXT: xor $a2, $s4, $s1
+; LA32-NEXT: xor $a3, $s5, $s0
+; LA32-NEXT: or $a2, $a2, $a3
+; LA32-NEXT: add.w $s6, $a0, $a1
+; LA32-NEXT: bnez $a2, .LBB0_2
+; LA32-NEXT: b .LBB0_4
+; LA32-NEXT: .LBB0_3:
+; LA32-NEXT: move $s3, $zero
+; LA32-NEXT: move $s6, $zero
+; LA32-NEXT: .LBB0_4: # %for.cond.cleanup
+; LA32-NEXT: st.w $s3, $s2, 0
+; LA32-NEXT: st.w $s6, $s2, 4
+; LA32-NEXT: ld.w $s6, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s5, $sp, 16 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 48
+; LA32-NEXT: ret
+;
+; LA64-LABEL: sink_fold_i64:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: addi.d $sp, $sp, -48
+; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s2, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT: move $s0, $a1
+; LA64-NEXT: slli.d $a1, $a0, 4
+; LA64-NEXT: alsl.d $a0, $a0, $a1, 3
+; LA64-NEXT: add.d $a0, $a2, $a0
+; LA64-NEXT: addi.d $s1, $a0, 8
+; LA64-NEXT: blez $s0, .LBB0_3
+; LA64-NEXT: # %bb.1: # %for.body.preheader
+; LA64-NEXT: move $fp, $a2
+; LA64-NEXT: move $s2, $zero
+; LA64-NEXT: .p2align 4, , 16
+; LA64-NEXT: .LBB0_2: # %for.body
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: move $a0, $fp
+; LA64-NEXT: pcaddu18i $ra, %call36(f)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: ld.d $a0, $s1, 0
+; LA64-NEXT: addi.d $s0, $s0, -1
+; LA64-NEXT: add.d $s2, $a0, $s2
+; LA64-NEXT: bnez $s0, .LBB0_2
+; LA64-NEXT: b .LBB0_4
+; LA64-NEXT: .LBB0_3:
+; LA64-NEXT: move $s2, $zero
+; LA64-NEXT: .LBB0_4: # %for.cond.cleanup
+; LA64-NEXT: st.d $s2, $s1, 0
+; LA64-NEXT: ld.d $s2, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 48
+; LA64-NEXT: ret
+entry:
+ %y = getelementptr inbounds %struct.S, ptr %a, i64 %k, i32 1
+ %cmp4 = icmp sgt i64 %n, 0
+ br i1 %cmp4, label %for.body, label %for.cond.cleanup
+
+for.body: ; preds = %entry, %for.body
+ %i.06 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %s.05 = phi i64 [ 0, %entry ], [ %add, %for.body ]
+ call void @f(ptr %a)
+ %0 = load i64, ptr %y
+ %add = add nsw i64 %0, %s.05
+ %inc = add nuw nsw i64 %i.06, 1
+ %exitcond.not = icmp eq i64 %inc, %n
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %s.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
+ store i64 %s.0.lcssa, ptr %y
+ ret void
+}
+
+define void @sink_fold_f32(i64 %k, i64 %n, ptr %a) nounwind {
+; LA32-LABEL: sink_fold_f32:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: addi.w $sp, $sp, -48
+; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill
+; LA32-NEXT: fst.d $fs0, $sp, 8 # 8-byte Folded Spill
+; LA32-NEXT: move $s0, $a3
+; LA32-NEXT: move $s1, $a2
+; LA32-NEXT: slli.w $a1, $a0, 4
+; LA32-NEXT: alsl.w $a0, $a0, $a1, 3
+; LA32-NEXT: add.w $a0, $a4, $a0
+; LA32-NEXT: sltui $a1, $a3, 1
+; LA32-NEXT: slti $a2, $a3, 0
+; LA32-NEXT: masknez $a2, $a2, $a1
+; LA32-NEXT: sltui $a3, $s1, 1
+; LA32-NEXT: maskeqz $a1, $a3, $a1
+; LA32-NEXT: or $a1, $a1, $a2
+; LA32-NEXT: addi.w $s2, $a0, 16
+; LA32-NEXT: bnez $a1, .LBB1_3
+; LA32-NEXT: # %bb.1: # %for.body.preheader
+; LA32-NEXT: move $fp, $a4
+; LA32-NEXT: move $s3, $zero
+; LA32-NEXT: move $s4, $zero
+; LA32-NEXT: movgr2fr.w $fs0, $zero
+; LA32-NEXT: .p2align 4, , 16
+; LA32-NEXT: .LBB1_2: # %for.body
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: move $a0, $fp
+; LA32-NEXT: bl f
+; LA32-NEXT: fld.s $fa0, $s2, 0
+; LA32-NEXT: addi.w $s3, $s3, 1
+; LA32-NEXT: sltui $a0, $s3, 1
+; LA32-NEXT: add.w $s4, $s4, $a0
+; LA32-NEXT: xor $a0, $s3, $s1
+; LA32-NEXT: xor $a1, $s4, $s0
+; LA32-NEXT: or $a0, $a0, $a1
+; LA32-NEXT: fadd.s $fs0, $fa0, $fs0
+; LA32-NEXT: bnez $a0, .LBB1_2
+; LA32-NEXT: b .LBB1_4
+; LA32-NEXT: .LBB1_3:
+; LA32-NEXT: movgr2fr.w $fs0, $zero
+; LA32-NEXT: .LBB1_4: # %for.cond.cleanup
+; LA32-NEXT: fst.s $fs0, $s2, 0
+; LA32-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload
+; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 48
+; LA32-NEXT: ret
+;
+; LA64-LABEL: sink_fold_f32:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: addi.d $sp, $sp, -48
+; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill
+; LA64-NEXT: fst.d $fs0, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT: move $s0, $a1
+; LA64-NEXT: slli.d $a1, $a0, 4
+; LA64-NEXT: alsl.d $a0, $a0, $a1, 3
+; LA64-NEXT: add.d $a0, $a2, $a0
+; LA64-NEXT: addi.d $s1, $a0, 16
+; LA64-NEXT: blez $s0, .LBB1_3
+; LA64-NEXT: # %bb.1: # %for.body.preheader
+; LA64-NEXT: move $fp, $a2
+; LA64-NEXT: movgr2fr.w $fs0, $zero
+; LA64-NEXT: .p2align 4, , 16
+; LA64-NEXT: .LBB1_2: # %for.body
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: move $a0, $fp
+; LA64-NEXT: pcaddu18i $ra, %call36(f)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: fld.s $fa0, $s1, 0
+; LA64-NEXT: addi.d $s0, $s0, -1
+; LA64-NEXT: fadd.s $fs0, $fa0, $fs0
+; LA64-NEXT: bnez $s0, .LBB1_2
+; LA64-NEXT: b .LBB1_4
+; LA64-NEXT: .LBB1_3:
+; LA64-NEXT: movgr2fr.w $fs0, $zero
+; LA64-NEXT: .LBB1_4: # %for.cond.cleanup
+; LA64-NEXT: fst.s $fs0, $s1, 0
+; LA64-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 48
+; LA64-NEXT: ret
+entry:
+ %y = getelementptr inbounds %struct.F, ptr %a, i64 %k, i32 2
+ %cmp4 = icmp sgt i64 %n, 0
+ br i1 %cmp4, label %for.body, label %for.cond.cleanup
+
+for.body: ; preds = %entry, %for.body
+ %i.06 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %s.05 = phi float [ 0.0, %entry ], [ %add, %for.body ]
+ call void @f(ptr %a)
+ %0 = load float, ptr %y
+ %add = fadd float %0, %s.05
+ %inc = add nuw nsw i64 %i.06, 1
+ %exitcond.not = icmp eq i64 %inc, %n
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %s.0.lcssa = phi float [ 0.0, %entry ], [ %add, %for.body ]
+ store float %s.0.lcssa, ptr %y
+ ret void
+}
+
+define void @sink_fold_v4i32(i64 %k, i64 %n, ptr %a) nounwind {
+; LA32-LABEL: sink_fold_v4i32:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: addi.w $sp, $sp, -48
+; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill
+; LA32-NEXT: move $s0, $a3
+; LA32-NEXT: move $s1, $a2
+; LA32-NEXT: slli.w $a0, $a0, 6
+; LA32-NEXT: add.w $a0, $a4, $a0
+; LA32-NEXT: sltui $a1, $a3, 1
+; LA32-NEXT: slti $a2, $a3, 0
+; LA32-NEXT: masknez $a2, $a2, $a1
+; LA32-NEXT: sltui $a3, $s1, 1
+; LA32-NEXT: maskeqz $a1, $a3, $a1
+; LA32-NEXT: or $a1, $a1, $a2
+; LA32-NEXT: addi.w $s2, $a0, 16
+; LA32-NEXT: bnez $a1, .LBB2_3
+; LA32-NEXT: # %bb.1: # %for.body.preheader
+; LA32-NEXT: move $fp, $a4
+; LA32-NEXT: move $s3, $zero
+; LA32-NEXT: move $s4, $zero
+; LA32-NEXT: vrepli.b $vr0, 0
+; LA32-NEXT: .p2align 4, , 16
+; LA32-NEXT: .LBB2_2: # %for.body
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
+; LA32-NEXT: move $a0, $fp
+; LA32-NEXT: bl f
+; LA32-NEXT: vld $vr0, $s2, 0
+; LA32-NEXT: addi.w $s3, $s3, 1
+; LA32-NEXT: sltui $a0, $s3, 1
+; LA32-NEXT: add.w $s4, $s4, $a0
+; LA32-NEXT: xor $a0, $s3, $s1
+; LA32-NEXT: xor $a1, $s4, $s0
+; LA32-NEXT: or $a0, $a0, $a1
+; LA32-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload
+; LA32-NEXT: vadd.w $vr1, $vr0, $vr1
+; LA32-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill
+; LA32-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
+; LA32-NEXT: bnez $a0, .LBB2_2
+; LA32-NEXT: b .LBB2_4
+; LA32-NEXT: .LBB2_3:
+; LA32-NEXT: vrepli.b $vr0, 0
+; LA32-NEXT: .LBB2_4: # %for.cond.cleanup
+; LA32-NEXT: vst $vr0, $s2, 0
+; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 48
+; LA32-NEXT: ret
+;
+; LA64-LABEL: sink_fold_v4i32:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: addi.d $sp, $sp, -48
+; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill
+; LA64-NEXT: slli.d $a0, $a0, 6
+; LA64-NEXT: add.d $a0, $a2, $a0
+; LA64-NEXT: addi.d $s1, $a0, 16
+; LA64-NEXT: blez $a1, .LBB2_3
+; LA64-NEXT: # %bb.1: # %for.body.preheader
+; LA64-NEXT: move $fp, $a2
+; LA64-NEXT: move $s0, $a1
+; LA64-NEXT: vrepli.b $vr0, 0
+; LA64-NEXT: .p2align 4, , 16
+; LA64-NEXT: .LBB2_2: # %for.body
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
+; LA64-NEXT: move $a0, $fp
+; LA64-NEXT: pcaddu18i $ra, %call36(f)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: vld $vr0, $s1, 0
+; LA64-NEXT: addi.d $s0, $s0, -1
+; LA64-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload
+; LA64-NEXT: vadd.w $vr1, $vr0, $vr1
+; LA64-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill
+; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
+; LA64-NEXT: bnez $s0, .LBB2_2
+; LA64-NEXT: b .LBB2_4
+; LA64-NEXT: .LBB2_3:
+; LA64-NEXT: vrepli.b $vr0, 0
+; LA64-NEXT: .LBB2_4: # %for.cond.cleanup
+; LA64-NEXT: vst $vr0, $s1, 0
+; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 48
+; LA64-NEXT: ret
+entry:
+ %y = getelementptr inbounds %struct.V, ptr %a, i64 %k, i32 1
+ %cmp = icmp sgt i64 %n, 0
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.body: ; preds = %entry, %for.body
+ %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %sum.0 = phi <4 x i32> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+ call void @f(ptr %a)
+ %v = load <4 x i32>, ptr %y
+ %addv = add <4 x i32> %v, %sum.0
+ %inc = add nuw nsw i64 %i.0, 1
+ %exitcond = icmp eq i64 %inc, %n
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %sum.lcssa = phi <4 x i32> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+ store <4 x i32> %sum.lcssa, ptr %y
+ ret void
+}
+
+define void @sink_fold_v16i16(i64 %k, i64 %n, ptr %a) nounwind {
+; LA32-LABEL: sink_fold_v16i16:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: addi.w $sp, $sp, -80
+; LA32-NEXT: st.w $ra, $sp, 76 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 72 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s0, $sp, 68 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s1, $sp, 64 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s2, $sp, 60 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s3, $sp, 56 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s4, $sp, 52 # 4-byte Folded Spill
+; LA32-NEXT: move $s0, $a3
+; LA32-NEXT: move $s1, $a2
+; LA32-NEXT: slli.w $a0, $a0, 6
+; LA32-NEXT: add.w $a0, $a4, $a0
+; LA32-NEXT: sltui $a1, $a3, 1
+; LA32-NEXT: slti $a2, $a3, 0
+; LA32-NEXT: masknez $a2, $a2, $a1
+; LA32-NEXT: sltui $a3, $s1, 1
+; LA32-NEXT: maskeqz $a1, $a3, $a1
+; LA32-NEXT: or $a1, $a1, $a2
+; LA32-NEXT: addi.w $s2, $a0, 32
+; LA32-NEXT: bnez $a1, .LBB3_3
+; LA32-NEXT: # %bb.1: # %for.body.preheader
+; LA32-NEXT: move $fp, $a4
+; LA32-NEXT: move $s3, $zero
+; LA32-NEXT: move $s4, $zero
+; LA32-NEXT: xvrepli.b $xr0, 0
+; LA32-NEXT: .p2align 4, , 16
+; LA32-NEXT: .LBB3_2: # %for.body
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
+; LA32-NEXT: move $a0, $fp
+; LA32-NEXT: bl f
+; LA32-NEXT: xvld $xr0, $s2, 0
+; LA32-NEXT: addi.w $s3, $s3, 1
+; LA32-NEXT: sltui $a0, $s3, 1
+; LA32-NEXT: add.w $s4, $s4, $a0
+; LA32-NEXT: xor $a0, $s3, $s1
+; LA32-NEXT: xor $a1, $s4, $s0
+; LA32-NEXT: or $a0, $a0, $a1
+; LA32-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload
+; LA32-NEXT: xvadd.h $xr1, $xr0, $xr1
+; LA32-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill
+; LA32-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
+; LA32-NEXT: bnez $a0, .LBB3_2
+; LA32-NEXT: b .LBB3_4
+; LA32-NEXT: .LBB3_3:
+; LA32-NEXT: xvrepli.b $xr0, 0
+; LA32-NEXT: .LBB3_4: # %for.cond.cleanup
+; LA32-NEXT: xvst $xr0, $s2, 0
+; LA32-NEXT: ld.w $s4, $sp, 52 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s3, $sp, 56 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s2, $sp, 60 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s1, $sp, 64 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s0, $sp, 68 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $fp, $sp, 72 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 76 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 80
+; LA32-NEXT: ret
+;
+; LA64-LABEL: sink_fold_v16i16:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: addi.d $sp, $sp, -80
+; LA64-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill
+; LA64-NEXT: slli.d $a0, $a0, 6
+; LA64-NEXT: add.d $a0, $a2, $a0
+; LA64-NEXT: addi.d $s1, $a0, 32
+; LA64-NEXT: blez $a1, .LBB3_3
+; LA64-NEXT: # %bb.1: # %for.body.preheader
+; LA64-NEXT: move $fp, $a2
+; LA64-NEXT: move $s0, $a1
+; LA64-NEXT: xvrepli.b $xr0, 0
+; LA64-NEXT: .p2align 4, , 16
+; LA64-NEXT: .LBB3_2: # %for.body
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
+; LA64-NEXT: move $a0, $fp
+; LA64-NEXT: pcaddu18i $ra, %call36(f)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: xvld $xr0, $s1, 0
+; LA64-NEXT: addi.d $s0, $s0, -1
+; LA64-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload
+; LA64-NEXT: xvadd.h $xr1, $xr0, $xr1
+; LA64-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill
+; LA64-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
+; LA64-NEXT: bnez $s0, .LBB3_2
+; LA64-NEXT: b .LBB3_4
+; LA64-NEXT: .LBB3_3:
+; LA64-NEXT: xvrepli.b $xr0, 0
+; LA64-NEXT: .LBB3_4: # %for.cond.cleanup
+; LA64-NEXT: xvst $xr0, $s1, 0
+; LA64-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 80
+; LA64-NEXT: ret
+entry:
+ %y = getelementptr inbounds %struct.V, ptr %a, i64 %k, i32 2
+ %cmp = icmp sgt i64 %n, 0
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.body: ; preds = %entry, %for.body
+ %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %sum.0 = phi <16 x i16> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+ call void @f(ptr %a)
+ %v = load <16 x i16>, ptr %y
+ %addv = add <16 x i16> %v, %sum.0
+ %inc = add nuw nsw i64 %i.0, 1
+ %exitcond = icmp eq i64 %inc, %n
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %sum.lcssa = phi <16 x i16> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+ store <16 x i16> %sum.lcssa, ptr %y
+ ret void
+}
+
+define void @sink_fold_extracti8(i64 %k, i64 %n, ptr %a) nounwind {
+; LA32-LABEL: sink_fold_extracti8:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: addi.w $sp, $sp, -48
+; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill
+; LA32-NEXT: move $s0, $a3
+; LA32-NEXT: move $s1, $a2
+; LA32-NEXT: slli.w $a1, $a0, 4
+; LA32-NEXT: alsl.w $a0, $a0, $a1, 3
+; LA32-NEXT: add.w $a0, $a4, $a0
+; LA32-NEXT: sltui $a1, $a3, 1
+; LA32-NEXT: slti $a2, $a3, 0
+; LA32-NEXT: masknez $a2, $a2, $a1
+; LA32-NEXT: sltui $a3, $s1, 1
+; LA32-NEXT: maskeqz $a1, $a3, $a1
+; LA32-NEXT: or $a1, $a1, $a2
+; LA32-NEXT: addi.w $s2, $a0, 16
+; LA32-NEXT: bnez $a1, .LBB4_3
+; LA32-NEXT: # %bb.1: # %for.body.preheader
+; LA32-NEXT: move $fp, $a4
+; LA32-NEXT: move $s3, $zero
+; LA32-NEXT: move $s4, $zero
+; LA32-NEXT: vrepli.b $vr0, 0
+; LA32-NEXT: .p2align 4, , 16
+; LA32-NEXT: .LBB4_2: # %for.body
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
+; LA32-NEXT: move $a0, $fp
+; LA32-NEXT: bl f
+; LA32-NEXT: vldrepl.b $vr0, $s2, 0
+; LA32-NEXT: addi.w $s3, $s3, 1
+; LA32-NEXT: sltui $a0, $s3, 1
+; LA32-NEXT: add.w $s4, $s4, $a0
+; LA32-NEXT: xor $a0, $s3, $s1
+; LA32-NEXT: xor $a1, $s4, $s0
+; LA32-NEXT: or $a0, $a0, $a1
+; LA32-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload
+; LA32-NEXT: vadd.b $vr1, $vr0, $vr1
+; LA32-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill
+; LA32-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
+; LA32-NEXT: bnez $a0, .LBB4_2
+; LA32-NEXT: b .LBB4_4
+; LA32-NEXT: .LBB4_3:
+; LA32-NEXT: vrepli.b $vr0, 0
+; LA32-NEXT: .LBB4_4: # %for.cond.cleanup
+; LA32-NEXT: vstelm.b $vr0, $s2, 0, 1
+; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 48
+; LA32-NEXT: ret
+;
+; LA64-LABEL: sink_fold_extracti8:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: addi.d $sp, $sp, -48
+; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill
+; LA64-NEXT: move $s0, $a1
+; LA64-NEXT: slli.d $a1, $a0, 4
+; LA64-NEXT: alsl.d $a0, $a0, $a1, 3
+; LA64-NEXT: add.d $a0, $a2, $a0
+; LA64-NEXT: addi.d $s1, $a0, 16
+; LA64-NEXT: blez $s0, .LBB4_3
+; LA64-NEXT: # %bb.1: # %for.body.preheader
+; LA64-NEXT: move $fp, $a2
+; LA64-NEXT: vrepli.b $vr0, 0
+; LA64-NEXT: .p2align 4, , 16
+; LA64-NEXT: .LBB4_2: # %for.body
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
+; LA64-NEXT: move $a0, $fp
+; LA64-NEXT: pcaddu18i $ra, %call36(f)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: vldrepl.b $vr0, $s1, 0
+; LA64-NEXT: addi.d $s0, $s0, -1
+; LA64-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload
+; LA64-NEXT: vadd.b $vr1, $vr0, $vr1
+; LA64-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill
+; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
+; LA64-NEXT: bnez $s0, .LBB4_2
+; LA64-NEXT: b .LBB4_4
+; LA64-NEXT: .LBB4_3:
+; LA64-NEXT: vrepli.b $vr0, 0
+; LA64-NEXT: .LBB4_4: # %for.cond.cleanup
+; LA64-NEXT: vstelm.b $vr0, $s1, 0, 1
+; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 48
+; LA64-NEXT: ret
+entry:
+ %y = getelementptr inbounds %struct.S, ptr %a, i64 %k, i32 2
+ %cmp = icmp sgt i64 %n, 0
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.body: ; preds = %entry, %for.body
+ %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %sum.0 = phi <16 x i8> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+ call void @f(ptr %a)
+ %e = load i8, ptr %y
+ %ins0 = insertelement <16 x i8> poison, i8 %e, i32 0
+ %v = shufflevector <16 x i8> %ins0, <16 x i8> poison, <16 x i32> zeroinitializer
+ %addv = add <16 x i8> %v, %sum.0
+ %inc = add nuw nsw i64 %i.0, 1
+ %exitcond = icmp eq i64 %inc, %n
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %sum.lcssa = phi <16 x i8> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+ %res = extractelement <16 x i8> %sum.lcssa, i32 1
+ store i8 %res, ptr %y
+ ret void
+}
+
+define void @sink_fold_extractf64(i64 %k, i64 %n, ptr %a) nounwind {
+; LA32-LABEL: sink_fold_extractf64:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: addi.w $sp, $sp, -80
+; LA32-NEXT: st.w $ra, $sp, 76 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 72 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s0, $sp, 68 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s1, $sp, 64 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s2, $sp, 60 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s3, $sp, 56 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s4, $sp, 52 # 4-byte Folded Spill
+; LA32-NEXT: move $s0, $a3
+; LA32-NEXT: move $s1, $a2
+; LA32-NEXT: slli.w $a1, $a0, 4
+; LA32-NEXT: alsl.w $a0, $a0, $a1, 3
+; LA32-NEXT: add.w $a0, $a4, $a0
+; LA32-NEXT: sltui $a1, $a3, 1
+; LA32-NEXT: slti $a2, $a3, 0
+; LA32-NEXT: masknez $a2, $a2, $a1
+; LA32-NEXT: sltui $a3, $s1, 1
+; LA32-NEXT: maskeqz $a1, $a3, $a1
+; LA32-NEXT: or $a1, $a1, $a2
+; LA32-NEXT: addi.w $s2, $a0, 8
+; LA32-NEXT: bnez $a1, .LBB5_3
+; LA32-NEXT: # %bb.1: # %for.body.preheader
+; LA32-NEXT: move $fp, $a4
+; LA32-NEXT: move $s3, $zero
+; LA32-NEXT: move $s4, $zero
+; LA32-NEXT: xvrepli.b $xr0, 0
+; LA32-NEXT: .p2align 4, , 16
+; LA32-NEXT: .LBB5_2: # %for.body
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
+; LA32-NEXT: move $a0, $fp
+; LA32-NEXT: bl f
+; LA32-NEXT: xvldrepl.d $xr0, $s2, 0
+; LA32-NEXT: addi.w $s3, $s3, 1
+; LA32-NEXT: sltui $a0, $s3, 1
+; LA32-NEXT: add.w $s4, $s4, $a0
+; LA32-NEXT: xor $a0, $s3, $s1
+; LA32-NEXT: xor $a1, $s4, $s0
+; LA32-NEXT: or $a0, $a0, $a1
+; LA32-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload
+; LA32-NEXT: xvfadd.d $xr1, $xr0, $xr1
+; LA32-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill
+; LA32-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
+; LA32-NEXT: bnez $a0, .LBB5_2
+; LA32-NEXT: b .LBB5_4
+; LA32-NEXT: .LBB5_3:
+; LA32-NEXT: xvrepli.b $xr0, 0
+; LA32-NEXT: .LBB5_4: # %for.cond.cleanup
+; LA32-NEXT: xvstelm.d $xr0, $s2, 0, 1
+; LA32-NEXT: ld.w $s4, $sp, 52 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s3, $sp, 56 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s2, $sp, 60 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s1, $sp, 64 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s0, $sp, 68 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $fp, $sp, 72 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 76 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 80
+; LA32-NEXT: ret
+;
+; LA64-LABEL: sink_fold_extractf64:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: addi.d $sp, $sp, -80
+; LA64-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill
+; LA64-NEXT: move $s0, $a1
+; LA64-NEXT: slli.d $a1, $a0, 4
+; LA64-NEXT: alsl.d $a0, $a0, $a1, 3
+; LA64-NEXT: add.d $a0, $a2, $a0
+; LA64-NEXT: addi.d $s1, $a0, 8
+; LA64-NEXT: blez $s0, .LBB5_3
+; LA64-NEXT: # %bb.1: # %for.body.preheader
+; LA64-NEXT: move $fp, $a2
+; LA64-NEXT: xvrepli.b $xr0, 0
+; LA64-NEXT: .p2align 4, , 16
+; LA64-NEXT: .LBB5_2: # %for.body
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
+; LA64-NEXT: move $a0, $fp
+; LA64-NEXT: pcaddu18i $ra, %call36(f)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: xvldrepl.d $xr0, $s1, 0
+; LA64-NEXT: addi.d $s0, $s0, -1
+; LA64-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload
+; LA64-NEXT: xvfadd.d $xr1, $xr0, $xr1
+; LA64-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill
+; LA64-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
+; LA64-NEXT: bnez $s0, .LBB5_2
+; LA64-NEXT: b .LBB5_4
+; LA64-NEXT: .LBB5_3:
+; LA64-NEXT: xvrepli.b $xr0, 0
+; LA64-NEXT: .LBB5_4: # %for.cond.cleanup
+; LA64-NEXT: xvstelm.d $xr0, $s1, 0, 1
+; LA64-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 80
+; LA64-NEXT: ret
+entry:
+ %y = getelementptr inbounds %struct.F, ptr %a, i64 %k, i32 1
+ %cmp = icmp sgt i64 %n, 0
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.body: ; preds = %entry, %for.body
+ %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %sum.0 = phi <4 x double> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+ call void @f(ptr %a)
+ %e = load double, ptr %y
+ %ins0 = insertelement <4 x double> poison, double %e, i32 0
+ %v = shufflevector <4 x double> %ins0, <4 x double> poison, <4 x i32> zeroinitializer
+ %addv = fadd <4 x double> %v, %sum.0
+ %inc = add nuw nsw i64 %i.0, 1
+ %exitcond = icmp eq i64 %inc, %n
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %sum.lcssa = phi <4 x double> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+ %res = extractelement <4 x double> %sum.lcssa, i32 1
+ store double %res, ptr %y
+ ret void
+}
+
+declare void @f(ptr)
diff --git a/llvm/test/CodeGen/NVPTX/f16-ex2.ll b/llvm/test/CodeGen/NVPTX/f16-ex2.ll
index ee79f9d..af3fe67 100644
--- a/llvm/test/CodeGen/NVPTX/f16-ex2.ll
+++ b/llvm/test/CodeGen/NVPTX/f16-ex2.ll
@@ -1,12 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mcpu=sm_75 -mattr=+ptx70 | FileCheck --check-prefixes=CHECK-FP16 %s
-; RUN: %if ptxas-sm_75 && ptxas-isa-7.0 %{ llc < %s -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %}
+; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK-FP16 %s
+; RUN: %if ptxas-sm_90 && ptxas-isa-7.8 %{ llc < %s -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
target triple = "nvptx64-nvidia-cuda"
declare half @llvm.nvvm.ex2.approx.f16(half)
-declare <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half>)
+declare <2 x half> @llvm.nvvm.ex2.approx.v2f16(<2 x half>)
+declare bfloat @llvm.nvvm.ex2.approx.ftz.bf16(bfloat)
+declare <2 x bfloat> @llvm.nvvm.ex2.approx.ftz.v2bf16(<2 x bfloat>)
-; CHECK-LABEL: ex2_half
define half @ex2_half(half %0) {
; CHECK-FP16-LABEL: ex2_half(
; CHECK-FP16: {
@@ -21,7 +22,6 @@ define half @ex2_half(half %0) {
ret half %res
}
-; CHECK-LABEL: ex2_2xhalf
define <2 x half> @ex2_2xhalf(<2 x half> %0) {
; CHECK-FP16-LABEL: ex2_2xhalf(
; CHECK-FP16: {
@@ -32,6 +32,34 @@ define <2 x half> @ex2_2xhalf(<2 x half> %0) {
; CHECK-FP16-NEXT: ex2.approx.f16x2 %r2, %r1;
; CHECK-FP16-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-FP16-NEXT: ret;
- %res = call <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half> %0)
+ %res = call <2 x half> @llvm.nvvm.ex2.approx.v2f16(<2 x half> %0)
ret <2 x half> %res
}
+
+define bfloat @ex2_bfloat(bfloat %0) {
+; CHECK-FP16-LABEL: ex2_bfloat(
+; CHECK-FP16: {
+; CHECK-FP16-NEXT: .reg .b16 %rs<3>;
+; CHECK-FP16-EMPTY:
+; CHECK-FP16-NEXT: // %bb.0:
+; CHECK-FP16-NEXT: ld.param.b16 %rs1, [ex2_bfloat_param_0];
+; CHECK-FP16-NEXT: ex2.approx.ftz.bf16 %rs2, %rs1;
+; CHECK-FP16-NEXT: st.param.b16 [func_retval0], %rs2;
+; CHECK-FP16-NEXT: ret;
+ %res = call bfloat @llvm.nvvm.ex2.approx.ftz.bf16(bfloat %0)
+ ret bfloat %res
+}
+
+define <2 x bfloat> @ex2_2xbfloat(<2 x bfloat> %0) {
+; CHECK-FP16-LABEL: ex2_2xbfloat(
+; CHECK-FP16: {
+; CHECK-FP16-NEXT: .reg .b32 %r<3>;
+; CHECK-FP16-EMPTY:
+; CHECK-FP16-NEXT: // %bb.0:
+; CHECK-FP16-NEXT: ld.param.b32 %r1, [ex2_2xbfloat_param_0];
+; CHECK-FP16-NEXT: ex2.approx.ftz.bf16x2 %r2, %r1;
+; CHECK-FP16-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-FP16-NEXT: ret;
+ %res = call <2 x bfloat> @llvm.nvvm.ex2.approx.ftz.v2bf16(<2 x bfloat> %0)
+ ret <2 x bfloat> %res
+}
diff --git a/llvm/test/CodeGen/NVPTX/f32-ex2.ll b/llvm/test/CodeGen/NVPTX/f32-ex2.ll
index 796d80d..97b9d35 100644
--- a/llvm/test/CodeGen/NVPTX/f32-ex2.ll
+++ b/llvm/test/CodeGen/NVPTX/f32-ex2.ll
@@ -3,7 +3,8 @@
; RUN: %if ptxas-sm_50 && ptxas-isa-3.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_50 -mattr=+ptx32 | %ptxas-verify -arch=sm_50 %}
target triple = "nvptx-nvidia-cuda"
-declare float @llvm.nvvm.ex2.approx.f(float)
+declare float @llvm.nvvm.ex2.approx.f32(float)
+declare float @llvm.nvvm.ex2.approx.ftz.f32(float)
; CHECK-LABEL: ex2_float
define float @ex2_float(float %0) {
@@ -16,7 +17,7 @@ define float @ex2_float(float %0) {
; CHECK-NEXT: ex2.approx.f32 %r2, %r1;
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
- %res = call float @llvm.nvvm.ex2.approx.f(float %0)
+ %res = call float @llvm.nvvm.ex2.approx.f32(float %0)
ret float %res
}
@@ -31,6 +32,6 @@ define float @ex2_float_ftz(float %0) {
; CHECK-NEXT: ex2.approx.ftz.f32 %r2, %r1;
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
- %res = call float @llvm.nvvm.ex2.approx.ftz.f(float %0)
+ %res = call float @llvm.nvvm.ex2.approx.ftz.f32(float %0)
ret float %res
}
diff --git a/llvm/test/CodeGen/RISCV/rv64-stackmap.ll b/llvm/test/CodeGen/RISCV/rv64-stackmap.ll
index 9aefa90..c50a0fb3 100644
--- a/llvm/test/CodeGen/RISCV/rv64-stackmap.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-stackmap.ll
@@ -7,11 +7,11 @@
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .half 0
; Num Functions
-; CHECK-NEXT: .word 12
+; CHECK-NEXT: .word 13
; Num LargeConstants
-; CHECK-NEXT: .word 2
+; CHECK-NEXT: .word 3
; Num Callsites
-; CHECK-NEXT: .word 16
+; CHECK-NEXT: .word 17
; Functions and stack size
; CHECK-NEXT: .quad constantargs
@@ -50,10 +50,14 @@
; CHECK-NEXT: .quad needsStackRealignment
; CHECK-NEXT: .quad -1
; CHECK-NEXT: .quad 1
+; CHECK-NEXT: .quad floats
+; CHECK-NEXT: .quad 32
+; CHECK-NEXT: .quad 1
; Num LargeConstants
; CHECK-NEXT: .quad 4294967295
; CHECK-NEXT: .quad 4294967296
+; CHECK-NEXT: .quad 4609434218613702656
; Constant arguments
;
@@ -379,6 +383,104 @@ define void @needsStackRealignment() {
}
declare void @escape_values(...)
+; CHECK-LABEL: .word .L{{.*}}-floats
+; CHECK-NEXT: .half 0
+; Num Locations
+; CHECK-NEXT: .half 12
+; Loc 0: constant float as constant integer
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 1: constant double as large constant integer
+; CHECK-NEXT: .byte 5
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 2: constant half as constant integer
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 3: constant bfloat as constant integer
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 4: float value in X register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 10
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 5: double value in X register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 11
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 6: half value in X register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 12
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 7: bfloat value in X register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 13
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 8: float on stack
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 2
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 9: double on stack
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 2
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 10: half on stack
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 2
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 11: bfloat on stack
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 2
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+define void @floats(float %f, double %g, half %h, bfloat %i) {
+ %ff = alloca float
+ %gg = alloca double
+ %hh = alloca half
+ %ii = alloca bfloat
+ call void (i64, i32, ...) @llvm.experimental.stackmap(i64 888, i32 0, float 1.25,
+ double 1.5, half 1.5, bfloat 1.5, float %f, double %g, half %h, bfloat %i, ptr %ff, ptr %gg, ptr %hh, ptr %ii)
+ ret void
+}
+
declare void @llvm.experimental.stackmap(i64, i32, ...)
declare void @llvm.experimental.patchpoint.void(i64, i32, ptr, i32, ...)
declare i64 @llvm.experimental.patchpoint.i64(i64, i32, ptr, i32, ...)
diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index dffe900..8007d9d 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -203,14 +203,24 @@ define i1 @init_eq_i32(ptr %word, i32 %position, i1 zeroext %value) nounwind {
define i1 @test_ne_i64(ptr %word, i32 %position) nounwind {
; X86-LABEL: test_ne_i64:
; X86: # %bb.0:
+; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: andl $32, %edx
-; X86-NEXT: shrl $3, %edx
-; X86-NEXT: movl (%eax,%edx), %eax
-; X86-NEXT: btl %ecx, %eax
-; X86-NEXT: setb %al
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $1, %edx
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: testb $32, %cl
+; X86-NEXT: je .LBB5_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: .LBB5_2:
+; X86-NEXT: andl 4(%eax), %esi
+; X86-NEXT: andl (%eax), %edx
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: setne %al
+; X86-NEXT: popl %esi
; X86-NEXT: retl
;
; X64-LABEL: test_ne_i64:
@@ -232,20 +242,38 @@ define i1 @test_ne_i64(ptr %word, i32 %position) nounwind {
define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind {
; X86-LABEL: complement_ne_i64:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: andl $32, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: movl (%ecx,%esi), %edi
-; X86-NEXT: btl %edx, %edi
-; X86-NEXT: setb %al
-; X86-NEXT: btcl %edx, %edi
-; X86-NEXT: movl %edi, (%ecx,%esi)
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $1, %eax
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: testb $32, %cl
+; X86-NEXT: je .LBB6_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: .LBB6_2:
+; X86-NEXT: movl (%edx), %ecx
+; X86-NEXT: movl 4(%edx), %edi
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: andl %esi, %ebx
+; X86-NEXT: movl %ecx, %ebp
+; X86-NEXT: andl %eax, %ebp
+; X86-NEXT: xorl %esi, %edi
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: orl %ebx, %ebp
+; X86-NEXT: setne %al
+; X86-NEXT: movl %ecx, (%edx)
+; X86-NEXT: movl %edi, 4(%edx)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: complement_ne_i64:
@@ -272,20 +300,40 @@ define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind {
define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind {
; X86-LABEL: reset_eq_i64:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: andl $32, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: movl (%ecx,%esi), %edi
-; X86-NEXT: btl %edx, %edi
-; X86-NEXT: setae %al
-; X86-NEXT: btrl %edx, %edi
-; X86-NEXT: movl %edi, (%ecx,%esi)
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $1, %esi
+; X86-NEXT: xorl %edi, %edi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: shll %cl, %esi
+; X86-NEXT: testb $32, %cl
+; X86-NEXT: je .LBB7_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: movl %esi, %edi
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: .LBB7_2:
+; X86-NEXT: movl (%edx), %eax
+; X86-NEXT: movl 4(%edx), %ecx
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: andl %edi, %ebx
+; X86-NEXT: notl %edi
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: andl %esi, %ebp
+; X86-NEXT: notl %esi
+; X86-NEXT: andl %ecx, %edi
+; X86-NEXT: andl %eax, %esi
+; X86-NEXT: orl %ebx, %ebp
+; X86-NEXT: sete %al
+; X86-NEXT: movl %esi, (%edx)
+; X86-NEXT: movl %edi, 4(%edx)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: reset_eq_i64:
@@ -313,20 +361,38 @@ define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind {
define i1 @set_ne_i64(ptr %word, i32 %position) nounwind {
; X86-LABEL: set_ne_i64:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: andl $32, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: movl (%ecx,%esi), %edi
-; X86-NEXT: btl %edx, %edi
-; X86-NEXT: setb %al
-; X86-NEXT: btsl %edx, %edi
-; X86-NEXT: movl %edi, (%ecx,%esi)
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $1, %eax
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: testb $32, %cl
+; X86-NEXT: je .LBB8_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: .LBB8_2:
+; X86-NEXT: movl (%edx), %ecx
+; X86-NEXT: movl 4(%edx), %edi
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: andl %esi, %ebx
+; X86-NEXT: movl %ecx, %ebp
+; X86-NEXT: andl %eax, %ebp
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: orl %ebx, %ebp
+; X86-NEXT: setne %al
+; X86-NEXT: movl %ecx, (%edx)
+; X86-NEXT: movl %edi, 4(%edx)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: set_ne_i64:
@@ -353,26 +419,52 @@ define i1 @set_ne_i64(ptr %word, i32 %position) nounwind {
define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-LABEL: init_eq_i64:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $1, %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: xorl %edi, %edi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: shll %cl, %esi
+; X86-NEXT: testb $32, %cl
+; X86-NEXT: je .LBB9_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: .LBB9_2:
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: notl %ebp
+; X86-NEXT: je .LBB9_4
+; X86-NEXT: # %bb.3:
+; X86-NEXT: movl %esi, %edi
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: .LBB9_4:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: andl $32, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: movl (%edx,%esi), %edi
-; X86-NEXT: btl %ecx, %edi
-; X86-NEXT: setae %al
-; X86-NEXT: btrl %ecx, %edi
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %ebx
+; X86-NEXT: movl 4(%ecx), %ecx
+; X86-NEXT: andl %ecx, %edx
+; X86-NEXT: andl %ecx, %ebx
; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, (%edx,%esi)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl (%edi), %ecx
+; X86-NEXT: andl %ecx, %eax
+; X86-NEXT: andl %ecx, %ebp
+; X86-NEXT: orl %esi, %ebp
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl %ebp, (%edi)
+; X86-NEXT: movl %ebx, 4(%edi)
+; X86-NEXT: sete %al
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; SSE-LABEL: init_eq_i64:
@@ -424,25 +516,101 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind {
define i1 @test_ne_i128(ptr %word, i32 %position) nounwind {
; X86-LABEL: test_ne_i128:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: andl $96, %edx
-; X86-NEXT: shrl $3, %edx
-; X86-NEXT: movl (%eax,%edx), %eax
-; X86-NEXT: btl %ecx, %eax
-; X86-NEXT: setb %al
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $48, %esp
+; X86-NEXT: movzbl 12(%ebp), %ecx
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, (%esp)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %esi
+; X86-NEXT: movl 24(%esp,%esi), %edi
+; X86-NEXT: movl 28(%esp,%esi), %eax
+; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl 16(%esp,%esi), %edx
+; X86-NEXT: movl 20(%esp,%esi), %esi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl 8(%ebp), %ebx
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: andl 8(%ebx), %edi
+; X86-NEXT: andl (%ebx), %edx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: andl 12(%ebx), %eax
+; X86-NEXT: andl 4(%ebx), %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: setne %al
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; X64-LABEL: test_ne_i128:
-; X64: # %bb.0:
-; X64-NEXT: movl %esi, %eax
-; X64-NEXT: andl $96, %eax
-; X64-NEXT: shrl $3, %eax
-; X64-NEXT: movl (%rdi,%rax), %eax
-; X64-NEXT: btl %esi, %eax
-; X64-NEXT: setb %al
-; X64-NEXT: retq
+; SSE-LABEL: test_ne_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: xorl %edx, %edx
+; SSE-NEXT: shldq %cl, %rax, %rdx
+; SSE-NEXT: xorl %esi, %esi
+; SSE-NEXT: shlq %cl, %rax
+; SSE-NEXT: testb $64, %cl
+; SSE-NEXT: cmovneq %rax, %rdx
+; SSE-NEXT: cmovneq %rsi, %rax
+; SSE-NEXT: andq 8(%rdi), %rdx
+; SSE-NEXT: andq (%rdi), %rax
+; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: setne %al
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test_ne_i128:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: movl $1, %edx
+; AVX2-NEXT: xorl %esi, %esi
+; AVX2-NEXT: shldq %cl, %rdx, %rsi
+; AVX2-NEXT: shlxq %rcx, %rdx, %rdx
+; AVX2-NEXT: testb $64, %cl
+; AVX2-NEXT: cmovneq %rdx, %rsi
+; AVX2-NEXT: cmovneq %rax, %rdx
+; AVX2-NEXT: andq 8(%rdi), %rsi
+; AVX2-NEXT: andq (%rdi), %rdx
+; AVX2-NEXT: orq %rsi, %rdx
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_ne_i128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: movl %esi, %ecx
+; AVX512-NEXT: movl $1, %eax
+; AVX512-NEXT: xorl %edx, %edx
+; AVX512-NEXT: shldq %cl, %rax, %rdx
+; AVX512-NEXT: xorl %esi, %esi
+; AVX512-NEXT: shlxq %rcx, %rax, %rax
+; AVX512-NEXT: testb $64, %cl
+; AVX512-NEXT: cmovneq %rax, %rdx
+; AVX512-NEXT: cmovneq %rsi, %rax
+; AVX512-NEXT: andq 8(%rdi), %rdx
+; AVX512-NEXT: andq (%rdi), %rax
+; AVX512-NEXT: orq %rdx, %rax
+; AVX512-NEXT: setne %al
+; AVX512-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
@@ -455,33 +623,124 @@ define i1 @test_ne_i128(ptr %word, i32 %position) nounwind {
define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind {
; X86-LABEL: complement_ne_i128:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: andl $96, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: movl (%ecx,%esi), %edi
-; X86-NEXT: btl %edx, %edi
-; X86-NEXT: setb %al
-; X86-NEXT: btcl %edx, %edi
-; X86-NEXT: movl %edi, (%ecx,%esi)
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $80, %esp
+; X86-NEXT: movzbl 12(%ebp), %ecx
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl 56(%esp,%eax), %esi
+; X86-NEXT: movl 60(%esp,%eax), %edx
+; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%esp,%eax), %edi
+; X86-NEXT: movl 52(%esp,%eax), %ebx
+; X86-NEXT: shldl %cl, %ebx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %ebx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl 8(%eax), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: movl (%ecx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: andl %edi, %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl 12(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 4(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: movl %esi, 12(%eax)
+; X86-NEXT: movl %edi, (%eax)
+; X86-NEXT: movl %ebx, 4(%eax)
+; X86-NEXT: setne %al
+; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; X64-LABEL: complement_ne_i128:
-; X64: # %bb.0:
-; X64-NEXT: movl %esi, %ecx
-; X64-NEXT: andl $96, %ecx
-; X64-NEXT: shrl $3, %ecx
-; X64-NEXT: movl (%rdi,%rcx), %edx
-; X64-NEXT: btl %esi, %edx
-; X64-NEXT: setb %al
-; X64-NEXT: btcl %esi, %edx
-; X64-NEXT: movl %edx, (%rdi,%rcx)
-; X64-NEXT: retq
+; SSE-LABEL: complement_ne_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: movl $1, %edx
+; SSE-NEXT: xorl %esi, %esi
+; SSE-NEXT: shldq %cl, %rdx, %rsi
+; SSE-NEXT: shlq %cl, %rdx
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: testb $64, %cl
+; SSE-NEXT: cmovneq %rdx, %rsi
+; SSE-NEXT: cmovneq %rax, %rdx
+; SSE-NEXT: movq (%rdi), %rax
+; SSE-NEXT: movq 8(%rdi), %rcx
+; SSE-NEXT: movq %rcx, %r8
+; SSE-NEXT: andq %rsi, %r8
+; SSE-NEXT: movq %rax, %r9
+; SSE-NEXT: andq %rdx, %r9
+; SSE-NEXT: xorq %rcx, %rsi
+; SSE-NEXT: xorq %rax, %rdx
+; SSE-NEXT: orq %r8, %r9
+; SSE-NEXT: setne %al
+; SSE-NEXT: movq %rdx, (%rdi)
+; SSE-NEXT: movq %rsi, 8(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: complement_ne_i128:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %esi, %ecx
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movl $1, %edx
+; AVX-NEXT: xorl %esi, %esi
+; AVX-NEXT: shldq %cl, %rdx, %rsi
+; AVX-NEXT: shlxq %rcx, %rdx, %rdx
+; AVX-NEXT: testb $64, %cl
+; AVX-NEXT: cmovneq %rdx, %rsi
+; AVX-NEXT: cmovneq %rax, %rdx
+; AVX-NEXT: movq (%rdi), %rax
+; AVX-NEXT: movq 8(%rdi), %rcx
+; AVX-NEXT: movq %rcx, %r8
+; AVX-NEXT: andq %rsi, %r8
+; AVX-NEXT: movq %rax, %r9
+; AVX-NEXT: andq %rdx, %r9
+; AVX-NEXT: xorq %rcx, %rsi
+; AVX-NEXT: xorq %rax, %rdx
+; AVX-NEXT: orq %r8, %r9
+; AVX-NEXT: setne %al
+; AVX-NEXT: movq %rdx, (%rdi)
+; AVX-NEXT: movq %rsi, 8(%rdi)
+; AVX-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
@@ -496,33 +755,124 @@ define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind {
define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind {
; X86-LABEL: reset_eq_i128:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: andl $96, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: movl (%ecx,%esi), %edi
-; X86-NEXT: btl %edx, %edi
-; X86-NEXT: setae %al
-; X86-NEXT: btrl %edx, %edi
-; X86-NEXT: movl %edi, (%ecx,%esi)
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $80, %esp
+; X86-NEXT: movzbl 12(%ebp), %ecx
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl 56(%esp,%eax), %edx
+; X86-NEXT: movl 60(%esp,%eax), %esi
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%esp,%eax), %esi
+; X86-NEXT: movl 52(%esp,%eax), %edi
+; X86-NEXT: shldl %cl, %edi, %edx
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: movl 8(%ebp), %ebx
+; X86-NEXT: shll %cl, %esi
+; X86-NEXT: movl 8(%ebx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: movl (%ebx), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%ebx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: movl 4(%ebx), %ebx
+; X86-NEXT: andl %ebx, %edi
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: notl %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: notl %ecx
+; X86-NEXT: andl %ebx, %ecx
+; X86-NEXT: notl %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl 8(%ebp), %edi
+; X86-NEXT: movl %edx, 8(%edi)
+; X86-NEXT: movl %eax, 12(%edi)
+; X86-NEXT: movl %esi, (%edi)
+; X86-NEXT: movl %ecx, 4(%edi)
+; X86-NEXT: sete %al
+; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; X64-LABEL: reset_eq_i128:
-; X64: # %bb.0:
-; X64-NEXT: movl %esi, %ecx
-; X64-NEXT: andl $96, %ecx
-; X64-NEXT: shrl $3, %ecx
-; X64-NEXT: movl (%rdi,%rcx), %edx
-; X64-NEXT: btl %esi, %edx
-; X64-NEXT: setae %al
-; X64-NEXT: btrl %esi, %edx
-; X64-NEXT: movl %edx, (%rdi,%rcx)
-; X64-NEXT: retq
+; SSE-LABEL: reset_eq_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: movl $1, %edx
+; SSE-NEXT: xorl %esi, %esi
+; SSE-NEXT: shldq %cl, %rdx, %rsi
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: shlq %cl, %rdx
+; SSE-NEXT: testb $64, %cl
+; SSE-NEXT: cmovneq %rdx, %rsi
+; SSE-NEXT: cmovneq %rax, %rdx
+; SSE-NEXT: movq (%rdi), %rax
+; SSE-NEXT: movq 8(%rdi), %rcx
+; SSE-NEXT: movq %rcx, %r8
+; SSE-NEXT: andq %rsi, %r8
+; SSE-NEXT: notq %rsi
+; SSE-NEXT: movq %rax, %r9
+; SSE-NEXT: andq %rdx, %r9
+; SSE-NEXT: notq %rdx
+; SSE-NEXT: andq %rcx, %rsi
+; SSE-NEXT: andq %rax, %rdx
+; SSE-NEXT: orq %r8, %r9
+; SSE-NEXT: sete %al
+; SSE-NEXT: movq %rdx, (%rdi)
+; SSE-NEXT: movq %rsi, 8(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: reset_eq_i128:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %esi, %ecx
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movl $1, %edx
+; AVX-NEXT: xorl %esi, %esi
+; AVX-NEXT: shldq %cl, %rdx, %rsi
+; AVX-NEXT: shlxq %rcx, %rdx, %rdx
+; AVX-NEXT: testb $64, %cl
+; AVX-NEXT: cmovneq %rdx, %rsi
+; AVX-NEXT: cmovneq %rax, %rdx
+; AVX-NEXT: movq (%rdi), %rax
+; AVX-NEXT: movq 8(%rdi), %rcx
+; AVX-NEXT: andnq %rcx, %rsi, %r8
+; AVX-NEXT: andq %rsi, %rcx
+; AVX-NEXT: andnq %rax, %rdx, %rsi
+; AVX-NEXT: andq %rdx, %rax
+; AVX-NEXT: orq %rcx, %rax
+; AVX-NEXT: sete %al
+; AVX-NEXT: movq %rsi, (%rdi)
+; AVX-NEXT: movq %r8, 8(%rdi)
+; AVX-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
@@ -538,33 +888,124 @@ define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind {
define i1 @set_ne_i128(ptr %word, i32 %position) nounwind {
; X86-LABEL: set_ne_i128:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: andl $96, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: movl (%ecx,%esi), %edi
-; X86-NEXT: btl %edx, %edi
-; X86-NEXT: setb %al
-; X86-NEXT: btsl %edx, %edi
-; X86-NEXT: movl %edi, (%ecx,%esi)
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $80, %esp
+; X86-NEXT: movzbl 12(%ebp), %ecx
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl 56(%esp,%eax), %esi
+; X86-NEXT: movl 60(%esp,%eax), %edx
+; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%esp,%eax), %edi
+; X86-NEXT: movl 52(%esp,%eax), %ebx
+; X86-NEXT: shldl %cl, %ebx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %ebx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl 8(%eax), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: movl (%ecx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: andl %edi, %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl 12(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 4(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: movl %esi, 12(%eax)
+; X86-NEXT: movl %edi, (%eax)
+; X86-NEXT: movl %ebx, 4(%eax)
+; X86-NEXT: setne %al
+; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; X64-LABEL: set_ne_i128:
-; X64: # %bb.0:
-; X64-NEXT: movl %esi, %ecx
-; X64-NEXT: andl $96, %ecx
-; X64-NEXT: shrl $3, %ecx
-; X64-NEXT: movl (%rdi,%rcx), %edx
-; X64-NEXT: btl %esi, %edx
-; X64-NEXT: setb %al
-; X64-NEXT: btsl %esi, %edx
-; X64-NEXT: movl %edx, (%rdi,%rcx)
-; X64-NEXT: retq
+; SSE-LABEL: set_ne_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: movl $1, %edx
+; SSE-NEXT: xorl %esi, %esi
+; SSE-NEXT: shldq %cl, %rdx, %rsi
+; SSE-NEXT: shlq %cl, %rdx
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: testb $64, %cl
+; SSE-NEXT: cmovneq %rdx, %rsi
+; SSE-NEXT: cmovneq %rax, %rdx
+; SSE-NEXT: movq (%rdi), %rax
+; SSE-NEXT: movq 8(%rdi), %rcx
+; SSE-NEXT: movq %rcx, %r8
+; SSE-NEXT: andq %rsi, %r8
+; SSE-NEXT: movq %rax, %r9
+; SSE-NEXT: andq %rdx, %r9
+; SSE-NEXT: orq %rcx, %rsi
+; SSE-NEXT: orq %rax, %rdx
+; SSE-NEXT: orq %r8, %r9
+; SSE-NEXT: setne %al
+; SSE-NEXT: movq %rdx, (%rdi)
+; SSE-NEXT: movq %rsi, 8(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: set_ne_i128:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %esi, %ecx
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movl $1, %edx
+; AVX-NEXT: xorl %esi, %esi
+; AVX-NEXT: shldq %cl, %rdx, %rsi
+; AVX-NEXT: shlxq %rcx, %rdx, %rdx
+; AVX-NEXT: testb $64, %cl
+; AVX-NEXT: cmovneq %rdx, %rsi
+; AVX-NEXT: cmovneq %rax, %rdx
+; AVX-NEXT: movq (%rdi), %rax
+; AVX-NEXT: movq 8(%rdi), %rcx
+; AVX-NEXT: movq %rcx, %r8
+; AVX-NEXT: andq %rsi, %r8
+; AVX-NEXT: movq %rax, %r9
+; AVX-NEXT: andq %rdx, %r9
+; AVX-NEXT: orq %rcx, %rsi
+; AVX-NEXT: orq %rax, %rdx
+; AVX-NEXT: orq %r8, %r9
+; AVX-NEXT: setne %al
+; AVX-NEXT: movq %rdx, (%rdi)
+; AVX-NEXT: movq %rsi, 8(%rdi)
+; AVX-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
@@ -579,55 +1020,218 @@ define i1 @set_ne_i128(ptr %word, i32 %position) nounwind {
define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-LABEL: init_eq_i128:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: andl $96, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: movl (%edx,%esi), %edi
-; X86-NEXT: btl %ecx, %edi
-; X86-NEXT: setae %al
-; X86-NEXT: btrl %ecx, %edi
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %ebx
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $128, %esp
+; X86-NEXT: movzbl 12(%ebp), %ecx
+; X86-NEXT: movzbl 16(%ebp), %eax
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: shrb $3, %dl
+; X86-NEXT: andb $12, %dl
+; X86-NEXT: negb %dl
+; X86-NEXT: movsbl %dl, %esi
+; X86-NEXT: movl 64(%esp,%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 68(%esp,%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 72(%esp,%esi), %ebx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movl 76(%esp,%esi), %edi
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: shldl %cl, %ebx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 8(%ebp), %esi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%esi), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %eax
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%esi), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: movl 8(%ebp), %ecx
+; X86-NEXT: movl 12(%ecx), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl 4(%ecx), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ecx, %ebx
; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, (%edx,%esi)
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: notl %ecx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl 100(%esp,%ecx), %edi
+; X86-NEXT: movl 104(%esp,%ecx), %ecx
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movzbl 12(%ebp), %ecx
+; X86-NEXT: shldl %cl, %edi, %ebx
+; X86-NEXT: orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: notl %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl 108(%esp,%ebx), %ebx
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: notl %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl 96(%esp,%ebx), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %ebx
+; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl %cl, %ebx, %edi
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl 8(%ebp), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 8(%ecx)
+; X86-NEXT: movl %esi, 12(%ecx)
+; X86-NEXT: movl %eax, (%ecx)
+; X86-NEXT: movl %edx, 4(%ecx)
+; X86-NEXT: sete %al
+; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; SSE-LABEL: init_eq_i128:
; SSE: # %bb.0:
; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: andl $96, %esi
-; SSE-NEXT: shrl $3, %esi
-; SSE-NEXT: movl (%rdi,%rsi), %r8d
-; SSE-NEXT: btl %ecx, %r8d
-; SSE-NEXT: setae %al
-; SSE-NEXT: shll %cl, %edx
-; SSE-NEXT: btrl %ecx, %r8d
-; SSE-NEXT: orl %r8d, %edx
-; SSE-NEXT: movl %edx, (%rdi,%rsi)
+; SSE-NEXT: movl $1, %esi
+; SSE-NEXT: xorl %r8d, %r8d
+; SSE-NEXT: shldq %cl, %rsi, %r8
+; SSE-NEXT: shlq %cl, %rsi
+; SSE-NEXT: movl %edx, %eax
+; SSE-NEXT: xorl %edx, %edx
+; SSE-NEXT: shldq %cl, %rax, %rdx
+; SSE-NEXT: shlq %cl, %rax
+; SSE-NEXT: xorl %r9d, %r9d
+; SSE-NEXT: testb $64, %cl
+; SSE-NEXT: cmovneq %rsi, %r8
+; SSE-NEXT: cmovneq %r9, %rsi
+; SSE-NEXT: cmovneq %rax, %rdx
+; SSE-NEXT: cmovneq %r9, %rax
+; SSE-NEXT: movq (%rdi), %rcx
+; SSE-NEXT: movq 8(%rdi), %r9
+; SSE-NEXT: movq %r9, %r10
+; SSE-NEXT: andq %r8, %r10
+; SSE-NEXT: notq %r8
+; SSE-NEXT: movq %rcx, %r11
+; SSE-NEXT: andq %rsi, %r11
+; SSE-NEXT: notq %rsi
+; SSE-NEXT: andq %r9, %r8
+; SSE-NEXT: orq %rdx, %r8
+; SSE-NEXT: andq %rcx, %rsi
+; SSE-NEXT: orq %rax, %rsi
+; SSE-NEXT: orq %r10, %r11
+; SSE-NEXT: sete %al
+; SSE-NEXT: movq %rsi, (%rdi)
+; SSE-NEXT: movq %r8, 8(%rdi)
; SSE-NEXT: retq
;
-; AVX-LABEL: init_eq_i128:
-; AVX: # %bb.0:
-; AVX-NEXT: movl %esi, %ecx
-; AVX-NEXT: andl $96, %ecx
-; AVX-NEXT: shrl $3, %ecx
-; AVX-NEXT: movl (%rdi,%rcx), %r8d
-; AVX-NEXT: btl %esi, %r8d
-; AVX-NEXT: setae %al
-; AVX-NEXT: btrl %esi, %r8d
-; AVX-NEXT: shlxl %esi, %edx, %edx
-; AVX-NEXT: orl %r8d, %edx
-; AVX-NEXT: movl %edx, (%rdi,%rcx)
-; AVX-NEXT: retq
+; AVX2-LABEL: init_eq_i128:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: movl $1, %esi
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: shldq %cl, %rsi, %rax
+; AVX2-NEXT: xorl %r8d, %r8d
+; AVX2-NEXT: movl %edx, %edx
+; AVX2-NEXT: xorl %r9d, %r9d
+; AVX2-NEXT: shldq %cl, %rdx, %r9
+; AVX2-NEXT: shlxq %rcx, %rsi, %rsi
+; AVX2-NEXT: testb $64, %cl
+; AVX2-NEXT: cmovneq %rsi, %rax
+; AVX2-NEXT: cmovneq %r8, %rsi
+; AVX2-NEXT: shlxq %rcx, %rdx, %rcx
+; AVX2-NEXT: cmovneq %rcx, %r9
+; AVX2-NEXT: cmovneq %r8, %rcx
+; AVX2-NEXT: movq (%rdi), %rdx
+; AVX2-NEXT: movq 8(%rdi), %r8
+; AVX2-NEXT: andnq %r8, %rax, %r10
+; AVX2-NEXT: andq %rax, %r8
+; AVX2-NEXT: andnq %rdx, %rsi, %r11
+; AVX2-NEXT: andq %rsi, %rdx
+; AVX2-NEXT: orq %r9, %r10
+; AVX2-NEXT: orq %rcx, %r11
+; AVX2-NEXT: orq %r8, %rdx
+; AVX2-NEXT: sete %al
+; AVX2-NEXT: movq %r11, (%rdi)
+; AVX2-NEXT: movq %r10, 8(%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: init_eq_i128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: movl %esi, %ecx
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movl $1, %esi
+; AVX512-NEXT: xorl %r8d, %r8d
+; AVX512-NEXT: shldq %cl, %rsi, %r8
+; AVX512-NEXT: shlxq %rcx, %rsi, %rsi
+; AVX512-NEXT: movl %edx, %edx
+; AVX512-NEXT: xorl %r9d, %r9d
+; AVX512-NEXT: shldq %cl, %rdx, %r9
+; AVX512-NEXT: testb $64, %cl
+; AVX512-NEXT: cmovneq %rsi, %r8
+; AVX512-NEXT: cmovneq %rax, %rsi
+; AVX512-NEXT: shlxq %rcx, %rdx, %rcx
+; AVX512-NEXT: cmovneq %rcx, %r9
+; AVX512-NEXT: cmovneq %rax, %rcx
+; AVX512-NEXT: movq (%rdi), %rax
+; AVX512-NEXT: movq 8(%rdi), %rdx
+; AVX512-NEXT: andnq %rdx, %r8, %r10
+; AVX512-NEXT: andq %r8, %rdx
+; AVX512-NEXT: andnq %rax, %rsi, %r8
+; AVX512-NEXT: andq %rsi, %rax
+; AVX512-NEXT: orq %r9, %r10
+; AVX512-NEXT: orq %rcx, %r8
+; AVX512-NEXT: orq %rdx, %rax
+; AVX512-NEXT: sete %al
+; AVX512-NEXT: movq %r8, (%rdi)
+; AVX512-NEXT: movq %r10, 8(%rdi)
+; AVX512-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
@@ -648,25 +1252,344 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {
define i1 @test_ne_i512(ptr %word, i32 %position) nounwind {
; X86-LABEL: test_ne_i512:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: shrl $3, %edx
-; X86-NEXT: andl $60, %edx
-; X86-NEXT: movl (%eax,%edx), %eax
-; X86-NEXT: btl %ecx, %eax
-; X86-NEXT: setb %al
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $224, %esp
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrl $3, %eax
+; X86-NEXT: andl $60, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: leal {{[0-9]+}}(%esp), %edx
+; X86-NEXT: subl %eax, %edx
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 24(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $31, %ecx
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 56(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 60(%edx), %eax
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%edx), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%edx), %edi
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%edx), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl 52(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl 4(%edx), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl 8(%ebp), %ebx
+; X86-NEXT: andl 40(%ebx), %eax
+; X86-NEXT: andl 8(%ebx), %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 56(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 24(%ebx), %edi
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: andl 44(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 12(%ebx), %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 60(%edi), %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 28(%edi), %eax
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%edx), %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: negl %edx
+; X86-NEXT: movl 192(%esp,%edx), %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl %cl, %ebx, %edx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl 8(%ebp), %ebx
+; X86-NEXT: andl 32(%ebx), %ecx
+; X86-NEXT: andl (%ebx), %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: andl 16(%ebx), %edi
+; X86-NEXT: andl 48(%ebx), %edx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 36(%ebx), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 4(%ebx), %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 20(%ebx), %ecx
+; X86-NEXT: andl 52(%ebx), %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: setne %al
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; X64-LABEL: test_ne_i512:
-; X64: # %bb.0:
-; X64-NEXT: movl %esi, %eax
-; X64-NEXT: shrl $3, %eax
-; X64-NEXT: andl $60, %eax
-; X64-NEXT: movl (%rdi,%rax), %eax
-; X64-NEXT: btl %esi, %eax
-; X64-NEXT: setb %al
-; X64-NEXT: retq
+; SSE-LABEL: test_ne_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %r15
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: andl $63, %ecx
+; SSE-NEXT: shrl $3, %esi
+; SSE-NEXT: andl $56, %esi
+; SSE-NEXT: negl %esi
+; SSE-NEXT: movslq %esi, %rbx
+; SSE-NEXT: movq -48(%rsp,%rbx), %rdx
+; SSE-NEXT: movq -40(%rsp,%rbx), %r14
+; SSE-NEXT: movq %r14, %rax
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq -16(%rsp,%rbx), %r11
+; SSE-NEXT: movq -8(%rsp,%rbx), %r10
+; SSE-NEXT: shldq %cl, %r11, %r10
+; SSE-NEXT: movq -32(%rsp,%rbx), %r9
+; SSE-NEXT: movq -24(%rsp,%rbx), %r15
+; SSE-NEXT: movq %r15, %r8
+; SSE-NEXT: shldq %cl, %r9, %r8
+; SSE-NEXT: movq -56(%rsp,%rbx), %rsi
+; SSE-NEXT: shldq %cl, %rsi, %rdx
+; SSE-NEXT: shldq %cl, %r15, %r11
+; SSE-NEXT: shldq %cl, %r14, %r9
+; SSE-NEXT: movq -64(%rsp,%rbx), %rbx
+; SSE-NEXT: shldq %cl, %rbx, %rsi
+; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT: shlq %cl, %rbx
+; SSE-NEXT: andq 32(%rdi), %r9
+; SSE-NEXT: andq 48(%rdi), %r11
+; SSE-NEXT: andq 16(%rdi), %rdx
+; SSE-NEXT: orq %r11, %rdx
+; SSE-NEXT: andq 40(%rdi), %r8
+; SSE-NEXT: andq 56(%rdi), %r10
+; SSE-NEXT: andq 24(%rdi), %rax
+; SSE-NEXT: orq %r10, %rax
+; SSE-NEXT: andq (%rdi), %rbx
+; SSE-NEXT: orq %r9, %rbx
+; SSE-NEXT: orq %rdx, %rbx
+; SSE-NEXT: andq 8(%rdi), %rsi
+; SSE-NEXT: orq %r8, %rsi
+; SSE-NEXT: orq %rax, %rsi
+; SSE-NEXT: orq %rbx, %rsi
+; SSE-NEXT: setne %al
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %r15
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test_ne_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: shrl $3, %esi
+; AVX2-NEXT: andl $56, %esi
+; AVX2-NEXT: negl %esi
+; AVX2-NEXT: movslq %esi, %rsi
+; AVX2-NEXT: movq -48(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq -40(%rsp,%rsi), %rbx
+; AVX2-NEXT: movq %rbx, %rax
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq -16(%rsp,%rsi), %r11
+; AVX2-NEXT: movq -8(%rsp,%rsi), %r10
+; AVX2-NEXT: shldq %cl, %r11, %r10
+; AVX2-NEXT: movq -32(%rsp,%rsi), %r9
+; AVX2-NEXT: movq -24(%rsp,%rsi), %r14
+; AVX2-NEXT: movq %r14, %r8
+; AVX2-NEXT: shldq %cl, %r9, %r8
+; AVX2-NEXT: movq -64(%rsp,%rsi), %r15
+; AVX2-NEXT: movq -56(%rsp,%rsi), %rsi
+; AVX2-NEXT: shldq %cl, %rsi, %rdx
+; AVX2-NEXT: shldq %cl, %r14, %r11
+; AVX2-NEXT: shldq %cl, %rbx, %r9
+; AVX2-NEXT: shldq %cl, %r15, %rsi
+; AVX2-NEXT: shlxq %rcx, %r15, %rcx
+; AVX2-NEXT: andq 32(%rdi), %r9
+; AVX2-NEXT: andq 48(%rdi), %r11
+; AVX2-NEXT: andq 16(%rdi), %rdx
+; AVX2-NEXT: andq 40(%rdi), %r8
+; AVX2-NEXT: andq 56(%rdi), %r10
+; AVX2-NEXT: andq 24(%rdi), %rax
+; AVX2-NEXT: orq %r11, %rdx
+; AVX2-NEXT: orq %r10, %rax
+; AVX2-NEXT: andq (%rdi), %rcx
+; AVX2-NEXT: orq %r9, %rcx
+; AVX2-NEXT: orq %rdx, %rcx
+; AVX2-NEXT: andq 8(%rdi), %rsi
+; AVX2-NEXT: orq %r8, %rsi
+; AVX2-NEXT: orq %rax, %rsi
+; AVX2-NEXT: orq %rcx, %rsi
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_ne_i512:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %r15
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movl %esi, %ecx
+; AVX512-NEXT: andl $63, %ecx
+; AVX512-NEXT: shrl $3, %esi
+; AVX512-NEXT: andl $56, %esi
+; AVX512-NEXT: negl %esi
+; AVX512-NEXT: movslq %esi, %rbx
+; AVX512-NEXT: movq -48(%rsp,%rbx), %rdx
+; AVX512-NEXT: movq -40(%rsp,%rbx), %r14
+; AVX512-NEXT: movq %r14, %rax
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq -16(%rsp,%rbx), %r11
+; AVX512-NEXT: movq -8(%rsp,%rbx), %r10
+; AVX512-NEXT: shldq %cl, %r11, %r10
+; AVX512-NEXT: movq -32(%rsp,%rbx), %r9
+; AVX512-NEXT: movq -24(%rsp,%rbx), %r15
+; AVX512-NEXT: movq %r15, %r8
+; AVX512-NEXT: shldq %cl, %r9, %r8
+; AVX512-NEXT: movq -56(%rsp,%rbx), %rsi
+; AVX512-NEXT: shldq %cl, %rsi, %rdx
+; AVX512-NEXT: shldq %cl, %r15, %r11
+; AVX512-NEXT: shldq %cl, %r14, %r9
+; AVX512-NEXT: movq -64(%rsp,%rbx), %rbx
+; AVX512-NEXT: shldq %cl, %rbx, %rsi
+; AVX512-NEXT: shlxq %rcx, %rbx, %rcx
+; AVX512-NEXT: andq 32(%rdi), %r9
+; AVX512-NEXT: andq 48(%rdi), %r11
+; AVX512-NEXT: andq 16(%rdi), %rdx
+; AVX512-NEXT: andq 40(%rdi), %r8
+; AVX512-NEXT: andq 56(%rdi), %r10
+; AVX512-NEXT: andq 24(%rdi), %rax
+; AVX512-NEXT: orq %r11, %rdx
+; AVX512-NEXT: orq %r10, %rax
+; AVX512-NEXT: andq (%rdi), %rcx
+; AVX512-NEXT: orq %r9, %rcx
+; AVX512-NEXT: orq %rdx, %rcx
+; AVX512-NEXT: andq 8(%rdi), %rsi
+; AVX512-NEXT: orq %r8, %rsi
+; AVX512-NEXT: orq %rax, %rsi
+; AVX512-NEXT: orq %rcx, %rsi
+; AVX512-NEXT: setne %al
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %r15
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%rem = and i32 %position, 511
%ofs = zext nneg i32 %rem to i512
%bit = shl nuw i512 1, %ofs
@@ -679,33 +1602,572 @@ define i1 @test_ne_i512(ptr %word, i32 %position) nounwind {
define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind {
; X86-LABEL: complement_ne_i512:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: andl $60, %esi
-; X86-NEXT: movl (%ecx,%esi), %edi
-; X86-NEXT: btl %edx, %edi
-; X86-NEXT: setb %al
-; X86-NEXT: btcl %edx, %edi
-; X86-NEXT: movl %edi, (%ecx,%esi)
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $272, %esp # imm = 0x110
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrl $3, %eax
+; X86-NEXT: andl $60, %eax
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: leal {{[0-9]+}}(%esp), %edx
+; X86-NEXT: subl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 24(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $31, %ecx
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 56(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 60(%edx), %esi
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%edx), %ebx
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%edx), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl 52(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 4(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%ebp), %edx
+; X86-NEXT: movl 40(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: movl 8(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movl 56(%edx), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edi, %ebx
+; X86-NEXT: movl 24(%edx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%eax), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl 12(%eax), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl 60(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl 28(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl (%eax), %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: negl %eax
+; X86-NEXT: movl 240(%esp,%eax), %esi
+; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%ebp), %esi
+; X86-NEXT: movl 32(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edi, %eax
+; X86-NEXT: movl (%esi), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edx, %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl 16(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %eax
+; X86-NEXT: movl 48(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 36(%esi), %ebx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl 4(%esi), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl 20(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl %esi, %edi
+; X86-NEXT: movl 52(%eax), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: xorl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl %ecx, (%esp) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl 8(%ebp), %edx
+; X86-NEXT: movl %ebx, 60(%edx)
+; X86-NEXT: movl %edi, 56(%edx)
+; X86-NEXT: movl %ecx, 52(%edx)
+; X86-NEXT: movl %esi, 44(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 40(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 36(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 32(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 28(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 24(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 20(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 16(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 12(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 8(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 4(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, (%edx)
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 48(%edx)
+; X86-NEXT: setne %al
+; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; X64-LABEL: complement_ne_i512:
-; X64: # %bb.0:
-; X64-NEXT: movl %esi, %ecx
-; X64-NEXT: shrl $3, %ecx
-; X64-NEXT: andl $60, %ecx
-; X64-NEXT: movl (%rdi,%rcx), %edx
-; X64-NEXT: btl %esi, %edx
-; X64-NEXT: setb %al
-; X64-NEXT: btcl %esi, %edx
-; X64-NEXT: movl %edx, (%rdi,%rcx)
-; X64-NEXT: retq
+; SSE-LABEL: complement_ne_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: pushq %r15
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %r13
+; SSE-NEXT: pushq %r12
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: subq $56, %rsp
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: andl $63, %ecx
+; SSE-NEXT: shrl $3, %esi
+; SSE-NEXT: andl $56, %esi
+; SSE-NEXT: negl %esi
+; SSE-NEXT: movslq %esi, %rbx
+; SSE-NEXT: movq (%rsp,%rbx), %rsi
+; SSE-NEXT: movq 8(%rsp,%rbx), %r14
+; SSE-NEXT: movq %r14, %rax
+; SSE-NEXT: shldq %cl, %rsi, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 32(%rsp,%rbx), %r8
+; SSE-NEXT: movq 40(%rsp,%rbx), %rbp
+; SSE-NEXT: shldq %cl, %r8, %rbp
+; SSE-NEXT: movq 16(%rsp,%rbx), %r9
+; SSE-NEXT: movq 24(%rsp,%rbx), %r15
+; SSE-NEXT: movq %r15, %r10
+; SSE-NEXT: shldq %cl, %r9, %r10
+; SSE-NEXT: movq -8(%rsp,%rbx), %r11
+; SSE-NEXT: shldq %cl, %r11, %rsi
+; SSE-NEXT: shldq %cl, %r15, %r8
+; SSE-NEXT: shldq %cl, %r14, %r9
+; SSE-NEXT: movq -16(%rsp,%rbx), %rbx
+; SSE-NEXT: shldq %cl, %rbx, %r11
+; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT: shlq %cl, %rbx
+; SSE-NEXT: movq 24(%rdi), %r15
+; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 56(%rdi), %rcx
+; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 16(%rdi), %r12
+; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 48(%rdi), %r13
+; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %r8, %r13
+; SSE-NEXT: andq %rsi, %r12
+; SSE-NEXT: orq %r13, %r12
+; SSE-NEXT: movq %rcx, %r13
+; SSE-NEXT: andq %rbp, %r13
+; SSE-NEXT: andq %rax, %r15
+; SSE-NEXT: orq %r13, %r15
+; SSE-NEXT: movq 32(%rdi), %r14
+; SSE-NEXT: movq %r14, %rcx
+; SSE-NEXT: andq %r9, %rcx
+; SSE-NEXT: movq (%rdi), %r13
+; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rbx, %r13
+; SSE-NEXT: orq %rcx, %r13
+; SSE-NEXT: orq %r12, %r13
+; SSE-NEXT: movq 40(%rdi), %rcx
+; SSE-NEXT: movq %rcx, %r12
+; SSE-NEXT: andq %r10, %r12
+; SSE-NEXT: movq 8(%rdi), %rdx
+; SSE-NEXT: movq %rdx, %rax
+; SSE-NEXT: andq %r11, %rax
+; SSE-NEXT: orq %r12, %rax
+; SSE-NEXT: orq %r15, %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; SSE-NEXT: xorq %rcx, %r10
+; SSE-NEXT: xorq %r14, %r9
+; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; SSE-NEXT: xorq %rdx, %r11
+; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; SSE-NEXT: orq %r13, %rax
+; SSE-NEXT: movq %r8, 48(%rdi)
+; SSE-NEXT: movq %rbp, 56(%rdi)
+; SSE-NEXT: movq %r9, 32(%rdi)
+; SSE-NEXT: movq %r10, 40(%rdi)
+; SSE-NEXT: movq %rsi, 16(%rdi)
+; SSE-NEXT: movq %r15, 24(%rdi)
+; SSE-NEXT: movq %rbx, (%rdi)
+; SSE-NEXT: movq %r11, 8(%rdi)
+; SSE-NEXT: setne %al
+; SSE-NEXT: addq $56, %rsp
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r12
+; SSE-NEXT: popq %r13
+; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %r15
+; SSE-NEXT: popq %rbp
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: complement_ne_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: subq $72, %rsp
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
+; AVX2-NEXT: vmovups %ymm0, (%rsp)
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: shrl $3, %esi
+; AVX2-NEXT: andl $56, %esi
+; AVX2-NEXT: negl %esi
+; AVX2-NEXT: movslq %esi, %rbx
+; AVX2-NEXT: movq 16(%rsp,%rbx), %rsi
+; AVX2-NEXT: movq 24(%rsp,%rbx), %rbp
+; AVX2-NEXT: movq %rbp, %rax
+; AVX2-NEXT: shldq %cl, %rsi, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 48(%rsp,%rbx), %r8
+; AVX2-NEXT: movq 56(%rsp,%rbx), %r13
+; AVX2-NEXT: shldq %cl, %r8, %r13
+; AVX2-NEXT: movq 32(%rsp,%rbx), %r9
+; AVX2-NEXT: movq 40(%rsp,%rbx), %r14
+; AVX2-NEXT: movq %r14, %r10
+; AVX2-NEXT: shldq %cl, %r9, %r10
+; AVX2-NEXT: movq 8(%rsp,%rbx), %r11
+; AVX2-NEXT: shldq %cl, %r11, %rsi
+; AVX2-NEXT: shldq %cl, %r14, %r8
+; AVX2-NEXT: movq 16(%rdi), %r12
+; AVX2-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 48(%rdi), %r14
+; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %r8, %r14
+; AVX2-NEXT: andq %rsi, %r12
+; AVX2-NEXT: orq %r14, %r12
+; AVX2-NEXT: movq 56(%rdi), %r15
+; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %r13, %r15
+; AVX2-NEXT: movq 24(%rdi), %r14
+; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %rax, %r14
+; AVX2-NEXT: orq %r15, %r14
+; AVX2-NEXT: shldq %cl, %rbp, %r9
+; AVX2-NEXT: movq (%rsp,%rbx), %rdx
+; AVX2-NEXT: movq 32(%rdi), %r15
+; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %r9, %r15
+; AVX2-NEXT: shlxq %rcx, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq (%rdi), %rbx
+; AVX2-NEXT: movq %rbx, %rbp
+; AVX2-NEXT: andq %rax, %rbp
+; AVX2-NEXT: orq %r15, %rbp
+; AVX2-NEXT: orq %r12, %rbp
+; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX2-NEXT: shldq %cl, %rdx, %r11
+; AVX2-NEXT: movq 40(%rdi), %rax
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: andq %r10, %rcx
+; AVX2-NEXT: movq 8(%rdi), %r15
+; AVX2-NEXT: movq %r15, %r12
+; AVX2-NEXT: andq %r11, %r12
+; AVX2-NEXT: orq %rcx, %r12
+; AVX2-NEXT: orq %r14, %r12
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; AVX2-NEXT: xorq %rax, %r10
+; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; AVX2-NEXT: xorq %r15, %r11
+; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; AVX2-NEXT: orq %rbp, %r12
+; AVX2-NEXT: movq %r8, 48(%rdi)
+; AVX2-NEXT: movq %r13, 56(%rdi)
+; AVX2-NEXT: movq %r9, 32(%rdi)
+; AVX2-NEXT: movq %r10, 40(%rdi)
+; AVX2-NEXT: movq %rsi, 16(%rdi)
+; AVX2-NEXT: movq %rcx, 24(%rdi)
+; AVX2-NEXT: movq %rbx, (%rdi)
+; AVX2-NEXT: movq %r11, 8(%rdi)
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: addq $72, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: complement_ne_i512:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: pushq %r15
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %r13
+; AVX512-NEXT: pushq %r12
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: subq $72, %rsp
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
+; AVX512-NEXT: vmovups %ymm0, (%rsp)
+; AVX512-NEXT: movl %esi, %ecx
+; AVX512-NEXT: andl $63, %ecx
+; AVX512-NEXT: shrl $3, %esi
+; AVX512-NEXT: andl $56, %esi
+; AVX512-NEXT: negl %esi
+; AVX512-NEXT: movslq %esi, %rbx
+; AVX512-NEXT: movq 16(%rsp,%rbx), %rsi
+; AVX512-NEXT: movq 24(%rsp,%rbx), %rbp
+; AVX512-NEXT: movq %rbp, %rax
+; AVX512-NEXT: shldq %cl, %rsi, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 48(%rsp,%rbx), %r8
+; AVX512-NEXT: movq 56(%rsp,%rbx), %r13
+; AVX512-NEXT: shldq %cl, %r8, %r13
+; AVX512-NEXT: movq 32(%rsp,%rbx), %r9
+; AVX512-NEXT: movq 40(%rsp,%rbx), %r14
+; AVX512-NEXT: movq %r14, %r10
+; AVX512-NEXT: shldq %cl, %r9, %r10
+; AVX512-NEXT: movq 8(%rsp,%rbx), %r11
+; AVX512-NEXT: shldq %cl, %r11, %rsi
+; AVX512-NEXT: shldq %cl, %r14, %r8
+; AVX512-NEXT: movq 16(%rdi), %r12
+; AVX512-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 48(%rdi), %r14
+; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq %r8, %r14
+; AVX512-NEXT: andq %rsi, %r12
+; AVX512-NEXT: orq %r14, %r12
+; AVX512-NEXT: movq 56(%rdi), %r15
+; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq %r13, %r15
+; AVX512-NEXT: movq 24(%rdi), %r14
+; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq %rax, %r14
+; AVX512-NEXT: orq %r15, %r14
+; AVX512-NEXT: shldq %cl, %rbp, %r9
+; AVX512-NEXT: movq (%rsp,%rbx), %rdx
+; AVX512-NEXT: movq 32(%rdi), %r15
+; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq %r9, %r15
+; AVX512-NEXT: shlxq %rcx, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq (%rdi), %rbx
+; AVX512-NEXT: movq %rbx, %rbp
+; AVX512-NEXT: andq %rax, %rbp
+; AVX512-NEXT: orq %r15, %rbp
+; AVX512-NEXT: orq %r12, %rbp
+; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX512-NEXT: shldq %cl, %rdx, %r11
+; AVX512-NEXT: movq 40(%rdi), %rax
+; AVX512-NEXT: movq %rax, %rcx
+; AVX512-NEXT: andq %r10, %rcx
+; AVX512-NEXT: movq 8(%rdi), %r15
+; AVX512-NEXT: movq %r15, %r12
+; AVX512-NEXT: andq %r11, %r12
+; AVX512-NEXT: orq %rcx, %r12
+; AVX512-NEXT: orq %r14, %r12
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; AVX512-NEXT: xorq %rax, %r10
+; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; AVX512-NEXT: xorq %r15, %r11
+; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; AVX512-NEXT: orq %rbp, %r12
+; AVX512-NEXT: movq %r8, 48(%rdi)
+; AVX512-NEXT: movq %r13, 56(%rdi)
+; AVX512-NEXT: movq %r9, 32(%rdi)
+; AVX512-NEXT: movq %r10, 40(%rdi)
+; AVX512-NEXT: movq %rsi, 16(%rdi)
+; AVX512-NEXT: movq %rcx, 24(%rdi)
+; AVX512-NEXT: movq %rbx, (%rdi)
+; AVX512-NEXT: movq %r11, 8(%rdi)
+; AVX512-NEXT: setne %al
+; AVX512-NEXT: addq $72, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r12
+; AVX512-NEXT: popq %r13
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %r15
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%rem = and i32 %position, 511
%ofs = zext nneg i32 %rem to i512
%bit = shl nuw i512 1, %ofs
@@ -720,33 +2182,606 @@ define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind {
define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind {
; X86-LABEL: reset_eq_i512:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $288, %esp # imm = 0x120
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrl $3, %eax
+; X86-NEXT: andl $60, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: leal {{[0-9]+}}(%esp), %edi
+; X86-NEXT: subl %eax, %edi
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 4(%edi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%edi), %eax
+; X86-NEXT: andl $31, %ecx
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: shldl %cl, %edx, %ebx
+; X86-NEXT: movl 12(%edi), %edx
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: andl $60, %esi
-; X86-NEXT: movl (%ecx,%esi), %edi
-; X86-NEXT: btl %edx, %edi
-; X86-NEXT: setae %al
-; X86-NEXT: btrl %edx, %edi
-; X86-NEXT: movl %edi, (%ecx,%esi)
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%edi), %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%edi), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 24(%edi), %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%edi), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%edi), %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%edi), %esi
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%edi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: movl 8(%ebp), %esi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %eax, %edx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %eax, %ebx
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%edi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl 52(%edi), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 56(%edi), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shldl %cl, %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%ebp), %esi
+; X86-NEXT: movl 56(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %eax, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 24(%esi), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ecx, %eax
+; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: movl 44(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%esi), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ecx, %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 60(%edi), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 60(%ebx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%ebx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edx, %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%edi), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: negl %eax
+; X86-NEXT: movl 256(%esp,%eax), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl %esi, %edi
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: movl 32(%ebx), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ecx, %edx
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%ebx), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ecx, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%esi), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ecx, %ebx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%esi), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ecx, %eax
+; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%esi), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %edx
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 4(%esi), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl %cl, %edi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%esi), %edi
+; X86-NEXT: andl %edi, %ecx
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl 8(%ebp), %ebx
+; X86-NEXT: movl 52(%ebx), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %edx
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: andl %edi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: notl %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: notl %edi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: notl %edi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: notl %edi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: notl %ecx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %edx, 60(%eax)
+; X86-NEXT: movl %esi, 56(%eax)
+; X86-NEXT: movl %ecx, 52(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 44(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 40(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 36(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 32(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 28(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 24(%eax)
+; X86-NEXT: movl %ebx, 20(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 16(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 8(%eax)
+; X86-NEXT: movl %edi, 4(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 48(%eax)
+; X86-NEXT: sete %al
+; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; X64-LABEL: reset_eq_i512:
-; X64: # %bb.0:
-; X64-NEXT: movl %esi, %ecx
-; X64-NEXT: shrl $3, %ecx
-; X64-NEXT: andl $60, %ecx
-; X64-NEXT: movl (%rdi,%rcx), %edx
-; X64-NEXT: btl %esi, %edx
-; X64-NEXT: setae %al
-; X64-NEXT: btrl %esi, %edx
-; X64-NEXT: movl %edx, (%rdi,%rcx)
-; X64-NEXT: retq
+; SSE-LABEL: reset_eq_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: pushq %r15
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %r13
+; SSE-NEXT: pushq %r12
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: subq $56, %rsp
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: andl $63, %ecx
+; SSE-NEXT: shrl $3, %esi
+; SSE-NEXT: andl $56, %esi
+; SSE-NEXT: negl %esi
+; SSE-NEXT: movslq %esi, %rdx
+; SSE-NEXT: movq (%rsp,%rdx), %r9
+; SSE-NEXT: movq 8(%rsp,%rdx), %r8
+; SSE-NEXT: movq %r8, %rsi
+; SSE-NEXT: shldq %cl, %r9, %rsi
+; SSE-NEXT: movq -8(%rsp,%rdx), %rax
+; SSE-NEXT: shldq %cl, %rax, %r9
+; SSE-NEXT: movq 16(%rsp,%rdx), %r14
+; SSE-NEXT: movq 24(%rsp,%rdx), %r10
+; SSE-NEXT: movq %r10, %rbx
+; SSE-NEXT: shldq %cl, %r14, %rbx
+; SSE-NEXT: shldq %cl, %r8, %r14
+; SSE-NEXT: movq 32(%rsp,%rdx), %r13
+; SSE-NEXT: movq 40(%rsp,%rdx), %r12
+; SSE-NEXT: shldq %cl, %r13, %r12
+; SSE-NEXT: shldq %cl, %r10, %r13
+; SSE-NEXT: movq -16(%rsp,%rdx), %rdx
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT: shlq %cl, %rdx
+; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq %r12, %rbp
+; SSE-NEXT: movq %r9, %r15
+; SSE-NEXT: movq %rsi, %r11
+; SSE-NEXT: movq 16(%rdi), %r8
+; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 48(%rdi), %rcx
+; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rcx, %r13
+; SSE-NEXT: andq %r8, %r9
+; SSE-NEXT: orq %r13, %r9
+; SSE-NEXT: movq 56(%rdi), %rcx
+; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rcx, %r12
+; SSE-NEXT: movq 24(%rdi), %r10
+; SSE-NEXT: andq %r10, %rsi
+; SSE-NEXT: orq %r12, %rsi
+; SSE-NEXT: movq %r14, %r13
+; SSE-NEXT: movq 32(%rdi), %rcx
+; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rcx, %r14
+; SSE-NEXT: movq %rdx, %r12
+; SSE-NEXT: movq (%rdi), %rcx
+; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rcx, %rdx
+; SSE-NEXT: orq %r14, %rdx
+; SSE-NEXT: orq %r9, %rdx
+; SSE-NEXT: movq %rbx, %r14
+; SSE-NEXT: movq 40(%rdi), %rcx
+; SSE-NEXT: andq %rcx, %rbx
+; SSE-NEXT: movq %rax, %r9
+; SSE-NEXT: movq 8(%rdi), %r8
+; SSE-NEXT: andq %r8, %rax
+; SSE-NEXT: orq %rbx, %rax
+; SSE-NEXT: orq %rsi, %rax
+; SSE-NEXT: notq %r11
+; SSE-NEXT: andq %r10, %r11
+; SSE-NEXT: notq %r15
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; SSE-NEXT: notq %r14
+; SSE-NEXT: andq %rcx, %r14
+; SSE-NEXT: notq %r13
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; SSE-NEXT: notq %rbp
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE-NEXT: notq %rcx
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; SSE-NEXT: notq %r9
+; SSE-NEXT: andq %r8, %r9
+; SSE-NEXT: notq %r12
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: movq %rcx, 48(%rdi)
+; SSE-NEXT: movq %rbp, 56(%rdi)
+; SSE-NEXT: movq %r13, 32(%rdi)
+; SSE-NEXT: movq %r14, 40(%rdi)
+; SSE-NEXT: movq %r15, 16(%rdi)
+; SSE-NEXT: movq %r11, 24(%rdi)
+; SSE-NEXT: movq %r12, (%rdi)
+; SSE-NEXT: movq %r9, 8(%rdi)
+; SSE-NEXT: sete %al
+; SSE-NEXT: addq $56, %rsp
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r12
+; SSE-NEXT: popq %r13
+; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %r15
+; SSE-NEXT: popq %rbp
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: reset_eq_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: pushq %rax
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: shrl $3, %esi
+; AVX2-NEXT: andl $56, %esi
+; AVX2-NEXT: negl %esi
+; AVX2-NEXT: movslq %esi, %rdx
+; AVX2-NEXT: movq -48(%rsp,%rdx), %r8
+; AVX2-NEXT: movq -40(%rsp,%rdx), %rbx
+; AVX2-NEXT: movq %rbx, %rax
+; AVX2-NEXT: shldq %cl, %r8, %rax
+; AVX2-NEXT: movq -16(%rsp,%rdx), %r10
+; AVX2-NEXT: movq -8(%rsp,%rdx), %rsi
+; AVX2-NEXT: shldq %cl, %r10, %rsi
+; AVX2-NEXT: movq -32(%rsp,%rdx), %r11
+; AVX2-NEXT: movq -24(%rsp,%rdx), %r14
+; AVX2-NEXT: movq %r14, %r9
+; AVX2-NEXT: shldq %cl, %r11, %r9
+; AVX2-NEXT: movq -64(%rsp,%rdx), %r15
+; AVX2-NEXT: movq -56(%rsp,%rdx), %rdx
+; AVX2-NEXT: shldq %cl, %rdx, %r8
+; AVX2-NEXT: shldq %cl, %r14, %r10
+; AVX2-NEXT: shldq %cl, %rbx, %r11
+; AVX2-NEXT: shldq %cl, %r15, %rdx
+; AVX2-NEXT: shlxq %rcx, %r15, %rcx
+; AVX2-NEXT: movq 24(%rdi), %rbx
+; AVX2-NEXT: movq 56(%rdi), %r14
+; AVX2-NEXT: movq 16(%rdi), %r15
+; AVX2-NEXT: movq 48(%rdi), %r13
+; AVX2-NEXT: movq 32(%rdi), %rbp
+; AVX2-NEXT: andnq %rbp, %r11, %r12
+; AVX2-NEXT: andq %r11, %rbp
+; AVX2-NEXT: andnq %r13, %r10, %r11
+; AVX2-NEXT: andq %r10, %r13
+; AVX2-NEXT: andnq %r15, %r8, %r10
+; AVX2-NEXT: andq %r8, %r15
+; AVX2-NEXT: movq 40(%rdi), %r8
+; AVX2-NEXT: orq %r13, %r15
+; AVX2-NEXT: andnq %r8, %r9, %r13
+; AVX2-NEXT: andq %r9, %r8
+; AVX2-NEXT: andnq %r14, %rsi, %r9
+; AVX2-NEXT: andq %rsi, %r14
+; AVX2-NEXT: andnq %rbx, %rax, %rsi
+; AVX2-NEXT: andq %rax, %rbx
+; AVX2-NEXT: movq (%rdi), %rax
+; AVX2-NEXT: orq %r14, %rbx
+; AVX2-NEXT: andnq %rax, %rcx, %r14
+; AVX2-NEXT: andq %rcx, %rax
+; AVX2-NEXT: orq %rbp, %rax
+; AVX2-NEXT: movq 8(%rdi), %rcx
+; AVX2-NEXT: orq %r15, %rax
+; AVX2-NEXT: andnq %rcx, %rdx, %r15
+; AVX2-NEXT: andq %rdx, %rcx
+; AVX2-NEXT: orq %r8, %rcx
+; AVX2-NEXT: orq %rbx, %rcx
+; AVX2-NEXT: orq %rax, %rcx
+; AVX2-NEXT: movq %r11, 48(%rdi)
+; AVX2-NEXT: movq %r9, 56(%rdi)
+; AVX2-NEXT: movq %r12, 32(%rdi)
+; AVX2-NEXT: movq %r13, 40(%rdi)
+; AVX2-NEXT: movq %r10, 16(%rdi)
+; AVX2-NEXT: movq %rsi, 24(%rdi)
+; AVX2-NEXT: movq %r14, (%rdi)
+; AVX2-NEXT: movq %r15, 8(%rdi)
+; AVX2-NEXT: sete %al
+; AVX2-NEXT: addq $8, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: reset_eq_i512:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: pushq %r15
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %r13
+; AVX512-NEXT: pushq %r12
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: pushq %rax
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movl %esi, %ecx
+; AVX512-NEXT: andl $63, %ecx
+; AVX512-NEXT: shrl $3, %esi
+; AVX512-NEXT: andl $56, %esi
+; AVX512-NEXT: negl %esi
+; AVX512-NEXT: movslq %esi, %rbx
+; AVX512-NEXT: movq -48(%rsp,%rbx), %r8
+; AVX512-NEXT: movq -40(%rsp,%rbx), %r14
+; AVX512-NEXT: movq %r14, %rax
+; AVX512-NEXT: shldq %cl, %r8, %rax
+; AVX512-NEXT: movq -16(%rsp,%rbx), %r10
+; AVX512-NEXT: movq -8(%rsp,%rbx), %rsi
+; AVX512-NEXT: shldq %cl, %r10, %rsi
+; AVX512-NEXT: movq -32(%rsp,%rbx), %r11
+; AVX512-NEXT: movq -24(%rsp,%rbx), %r15
+; AVX512-NEXT: movq %r15, %r9
+; AVX512-NEXT: shldq %cl, %r11, %r9
+; AVX512-NEXT: movq -56(%rsp,%rbx), %rdx
+; AVX512-NEXT: shldq %cl, %rdx, %r8
+; AVX512-NEXT: shldq %cl, %r15, %r10
+; AVX512-NEXT: shldq %cl, %r14, %r11
+; AVX512-NEXT: movq -64(%rsp,%rbx), %rbx
+; AVX512-NEXT: shldq %cl, %rbx, %rdx
+; AVX512-NEXT: shlxq %rcx, %rbx, %rcx
+; AVX512-NEXT: movq 24(%rdi), %rbx
+; AVX512-NEXT: movq 56(%rdi), %r14
+; AVX512-NEXT: movq 16(%rdi), %r15
+; AVX512-NEXT: movq 48(%rdi), %r13
+; AVX512-NEXT: movq 32(%rdi), %rbp
+; AVX512-NEXT: andnq %rbp, %r11, %r12
+; AVX512-NEXT: andq %r11, %rbp
+; AVX512-NEXT: andnq %r13, %r10, %r11
+; AVX512-NEXT: andq %r10, %r13
+; AVX512-NEXT: andnq %r15, %r8, %r10
+; AVX512-NEXT: andq %r8, %r15
+; AVX512-NEXT: movq 40(%rdi), %r8
+; AVX512-NEXT: orq %r13, %r15
+; AVX512-NEXT: andnq %r8, %r9, %r13
+; AVX512-NEXT: andq %r9, %r8
+; AVX512-NEXT: andnq %r14, %rsi, %r9
+; AVX512-NEXT: andq %rsi, %r14
+; AVX512-NEXT: andnq %rbx, %rax, %rsi
+; AVX512-NEXT: andq %rax, %rbx
+; AVX512-NEXT: movq (%rdi), %rax
+; AVX512-NEXT: orq %r14, %rbx
+; AVX512-NEXT: andnq %rax, %rcx, %r14
+; AVX512-NEXT: andq %rcx, %rax
+; AVX512-NEXT: orq %rbp, %rax
+; AVX512-NEXT: movq 8(%rdi), %rcx
+; AVX512-NEXT: orq %r15, %rax
+; AVX512-NEXT: andnq %rcx, %rdx, %r15
+; AVX512-NEXT: andq %rdx, %rcx
+; AVX512-NEXT: orq %r8, %rcx
+; AVX512-NEXT: orq %rbx, %rcx
+; AVX512-NEXT: orq %rax, %rcx
+; AVX512-NEXT: movq %r11, 48(%rdi)
+; AVX512-NEXT: movq %r9, 56(%rdi)
+; AVX512-NEXT: movq %r12, 32(%rdi)
+; AVX512-NEXT: movq %r13, 40(%rdi)
+; AVX512-NEXT: movq %r10, 16(%rdi)
+; AVX512-NEXT: movq %rsi, 24(%rdi)
+; AVX512-NEXT: movq %r14, (%rdi)
+; AVX512-NEXT: movq %r15, 8(%rdi)
+; AVX512-NEXT: sete %al
+; AVX512-NEXT: addq $8, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r12
+; AVX512-NEXT: popq %r13
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %r15
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%rem = and i32 %position, 511
%ofs = zext nneg i32 %rem to i512
%bit = shl nuw i512 1, %ofs
@@ -762,33 +2797,572 @@ define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind {
define i1 @set_ne_i512(ptr %word, i32 %position) nounwind {
; X86-LABEL: set_ne_i512:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: andl $60, %esi
-; X86-NEXT: movl (%ecx,%esi), %edi
-; X86-NEXT: btl %edx, %edi
-; X86-NEXT: setb %al
-; X86-NEXT: btsl %edx, %edi
-; X86-NEXT: movl %edi, (%ecx,%esi)
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $272, %esp # imm = 0x110
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrl $3, %eax
+; X86-NEXT: andl $60, %eax
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: leal {{[0-9]+}}(%esp), %edx
+; X86-NEXT: subl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 24(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $31, %ecx
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 56(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 60(%edx), %esi
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%edx), %ebx
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%edx), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl 52(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 4(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%ebp), %edx
+; X86-NEXT: movl 40(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: movl 8(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movl 56(%edx), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edi, %ebx
+; X86-NEXT: movl 24(%edx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%eax), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl 12(%eax), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl 60(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl 28(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl (%eax), %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: negl %eax
+; X86-NEXT: movl 240(%esp,%eax), %esi
+; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%ebp), %esi
+; X86-NEXT: movl 32(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edi, %eax
+; X86-NEXT: movl (%esi), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edx, %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl 16(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %eax
+; X86-NEXT: movl 48(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 36(%esi), %ebx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl 4(%esi), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl 20(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl %esi, %edi
+; X86-NEXT: movl 52(%eax), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl %ecx, (%esp) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl 8(%ebp), %edx
+; X86-NEXT: movl %ebx, 60(%edx)
+; X86-NEXT: movl %edi, 56(%edx)
+; X86-NEXT: movl %ecx, 52(%edx)
+; X86-NEXT: movl %esi, 44(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 40(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 36(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 32(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 28(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 24(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 20(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 16(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 12(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 8(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 4(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, (%edx)
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 48(%edx)
+; X86-NEXT: setne %al
+; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; X64-LABEL: set_ne_i512:
-; X64: # %bb.0:
-; X64-NEXT: movl %esi, %ecx
-; X64-NEXT: shrl $3, %ecx
-; X64-NEXT: andl $60, %ecx
-; X64-NEXT: movl (%rdi,%rcx), %edx
-; X64-NEXT: btl %esi, %edx
-; X64-NEXT: setb %al
-; X64-NEXT: btsl %esi, %edx
-; X64-NEXT: movl %edx, (%rdi,%rcx)
-; X64-NEXT: retq
+; SSE-LABEL: set_ne_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: pushq %r15
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %r13
+; SSE-NEXT: pushq %r12
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: subq $56, %rsp
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: andl $63, %ecx
+; SSE-NEXT: shrl $3, %esi
+; SSE-NEXT: andl $56, %esi
+; SSE-NEXT: negl %esi
+; SSE-NEXT: movslq %esi, %rbx
+; SSE-NEXT: movq (%rsp,%rbx), %rsi
+; SSE-NEXT: movq 8(%rsp,%rbx), %r14
+; SSE-NEXT: movq %r14, %rax
+; SSE-NEXT: shldq %cl, %rsi, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 32(%rsp,%rbx), %r8
+; SSE-NEXT: movq 40(%rsp,%rbx), %rbp
+; SSE-NEXT: shldq %cl, %r8, %rbp
+; SSE-NEXT: movq 16(%rsp,%rbx), %r9
+; SSE-NEXT: movq 24(%rsp,%rbx), %r15
+; SSE-NEXT: movq %r15, %r10
+; SSE-NEXT: shldq %cl, %r9, %r10
+; SSE-NEXT: movq -8(%rsp,%rbx), %r11
+; SSE-NEXT: shldq %cl, %r11, %rsi
+; SSE-NEXT: shldq %cl, %r15, %r8
+; SSE-NEXT: shldq %cl, %r14, %r9
+; SSE-NEXT: movq -16(%rsp,%rbx), %rbx
+; SSE-NEXT: shldq %cl, %rbx, %r11
+; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT: shlq %cl, %rbx
+; SSE-NEXT: movq 24(%rdi), %r15
+; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 56(%rdi), %rcx
+; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 16(%rdi), %r12
+; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 48(%rdi), %r13
+; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %r8, %r13
+; SSE-NEXT: andq %rsi, %r12
+; SSE-NEXT: orq %r13, %r12
+; SSE-NEXT: movq %rcx, %r13
+; SSE-NEXT: andq %rbp, %r13
+; SSE-NEXT: andq %rax, %r15
+; SSE-NEXT: orq %r13, %r15
+; SSE-NEXT: movq 32(%rdi), %r14
+; SSE-NEXT: movq %r14, %rcx
+; SSE-NEXT: andq %r9, %rcx
+; SSE-NEXT: movq (%rdi), %r13
+; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rbx, %r13
+; SSE-NEXT: orq %rcx, %r13
+; SSE-NEXT: orq %r12, %r13
+; SSE-NEXT: movq 40(%rdi), %rcx
+; SSE-NEXT: movq %rcx, %r12
+; SSE-NEXT: andq %r10, %r12
+; SSE-NEXT: movq 8(%rdi), %rdx
+; SSE-NEXT: movq %rdx, %rax
+; SSE-NEXT: andq %r11, %rax
+; SSE-NEXT: orq %r12, %rax
+; SSE-NEXT: orq %r15, %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; SSE-NEXT: orq %rcx, %r10
+; SSE-NEXT: orq %r14, %r9
+; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; SSE-NEXT: orq %rdx, %r11
+; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; SSE-NEXT: orq %r13, %rax
+; SSE-NEXT: movq %r8, 48(%rdi)
+; SSE-NEXT: movq %rbp, 56(%rdi)
+; SSE-NEXT: movq %r9, 32(%rdi)
+; SSE-NEXT: movq %r10, 40(%rdi)
+; SSE-NEXT: movq %rsi, 16(%rdi)
+; SSE-NEXT: movq %r15, 24(%rdi)
+; SSE-NEXT: movq %rbx, (%rdi)
+; SSE-NEXT: movq %r11, 8(%rdi)
+; SSE-NEXT: setne %al
+; SSE-NEXT: addq $56, %rsp
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r12
+; SSE-NEXT: popq %r13
+; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %r15
+; SSE-NEXT: popq %rbp
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: set_ne_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: subq $72, %rsp
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
+; AVX2-NEXT: vmovups %ymm0, (%rsp)
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: shrl $3, %esi
+; AVX2-NEXT: andl $56, %esi
+; AVX2-NEXT: negl %esi
+; AVX2-NEXT: movslq %esi, %rbx
+; AVX2-NEXT: movq 16(%rsp,%rbx), %rsi
+; AVX2-NEXT: movq 24(%rsp,%rbx), %rbp
+; AVX2-NEXT: movq %rbp, %rax
+; AVX2-NEXT: shldq %cl, %rsi, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 48(%rsp,%rbx), %r8
+; AVX2-NEXT: movq 56(%rsp,%rbx), %r13
+; AVX2-NEXT: shldq %cl, %r8, %r13
+; AVX2-NEXT: movq 32(%rsp,%rbx), %r9
+; AVX2-NEXT: movq 40(%rsp,%rbx), %r14
+; AVX2-NEXT: movq %r14, %r10
+; AVX2-NEXT: shldq %cl, %r9, %r10
+; AVX2-NEXT: movq 8(%rsp,%rbx), %r11
+; AVX2-NEXT: shldq %cl, %r11, %rsi
+; AVX2-NEXT: shldq %cl, %r14, %r8
+; AVX2-NEXT: movq 16(%rdi), %r12
+; AVX2-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 48(%rdi), %r14
+; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %r8, %r14
+; AVX2-NEXT: andq %rsi, %r12
+; AVX2-NEXT: orq %r14, %r12
+; AVX2-NEXT: movq 56(%rdi), %r15
+; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %r13, %r15
+; AVX2-NEXT: movq 24(%rdi), %r14
+; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %rax, %r14
+; AVX2-NEXT: orq %r15, %r14
+; AVX2-NEXT: shldq %cl, %rbp, %r9
+; AVX2-NEXT: movq (%rsp,%rbx), %rdx
+; AVX2-NEXT: movq 32(%rdi), %r15
+; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %r9, %r15
+; AVX2-NEXT: shlxq %rcx, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq (%rdi), %rbx
+; AVX2-NEXT: movq %rbx, %rbp
+; AVX2-NEXT: andq %rax, %rbp
+; AVX2-NEXT: orq %r15, %rbp
+; AVX2-NEXT: orq %r12, %rbp
+; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX2-NEXT: shldq %cl, %rdx, %r11
+; AVX2-NEXT: movq 40(%rdi), %rax
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: andq %r10, %rcx
+; AVX2-NEXT: movq 8(%rdi), %r15
+; AVX2-NEXT: movq %r15, %r12
+; AVX2-NEXT: andq %r11, %r12
+; AVX2-NEXT: orq %rcx, %r12
+; AVX2-NEXT: orq %r14, %r12
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; AVX2-NEXT: orq %rax, %r10
+; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; AVX2-NEXT: orq %r15, %r11
+; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; AVX2-NEXT: orq %rbp, %r12
+; AVX2-NEXT: movq %r8, 48(%rdi)
+; AVX2-NEXT: movq %r13, 56(%rdi)
+; AVX2-NEXT: movq %r9, 32(%rdi)
+; AVX2-NEXT: movq %r10, 40(%rdi)
+; AVX2-NEXT: movq %rsi, 16(%rdi)
+; AVX2-NEXT: movq %rcx, 24(%rdi)
+; AVX2-NEXT: movq %rbx, (%rdi)
+; AVX2-NEXT: movq %r11, 8(%rdi)
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: addq $72, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: set_ne_i512:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: pushq %r15
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %r13
+; AVX512-NEXT: pushq %r12
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: subq $72, %rsp
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
+; AVX512-NEXT: vmovups %ymm0, (%rsp)
+; AVX512-NEXT: movl %esi, %ecx
+; AVX512-NEXT: andl $63, %ecx
+; AVX512-NEXT: shrl $3, %esi
+; AVX512-NEXT: andl $56, %esi
+; AVX512-NEXT: negl %esi
+; AVX512-NEXT: movslq %esi, %rbx
+; AVX512-NEXT: movq 16(%rsp,%rbx), %rsi
+; AVX512-NEXT: movq 24(%rsp,%rbx), %rbp
+; AVX512-NEXT: movq %rbp, %rax
+; AVX512-NEXT: shldq %cl, %rsi, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 48(%rsp,%rbx), %r8
+; AVX512-NEXT: movq 56(%rsp,%rbx), %r13
+; AVX512-NEXT: shldq %cl, %r8, %r13
+; AVX512-NEXT: movq 32(%rsp,%rbx), %r9
+; AVX512-NEXT: movq 40(%rsp,%rbx), %r14
+; AVX512-NEXT: movq %r14, %r10
+; AVX512-NEXT: shldq %cl, %r9, %r10
+; AVX512-NEXT: movq 8(%rsp,%rbx), %r11
+; AVX512-NEXT: shldq %cl, %r11, %rsi
+; AVX512-NEXT: shldq %cl, %r14, %r8
+; AVX512-NEXT: movq 16(%rdi), %r12
+; AVX512-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 48(%rdi), %r14
+; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq %r8, %r14
+; AVX512-NEXT: andq %rsi, %r12
+; AVX512-NEXT: orq %r14, %r12
+; AVX512-NEXT: movq 56(%rdi), %r15
+; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq %r13, %r15
+; AVX512-NEXT: movq 24(%rdi), %r14
+; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq %rax, %r14
+; AVX512-NEXT: orq %r15, %r14
+; AVX512-NEXT: shldq %cl, %rbp, %r9
+; AVX512-NEXT: movq (%rsp,%rbx), %rdx
+; AVX512-NEXT: movq 32(%rdi), %r15
+; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq %r9, %r15
+; AVX512-NEXT: shlxq %rcx, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq (%rdi), %rbx
+; AVX512-NEXT: movq %rbx, %rbp
+; AVX512-NEXT: andq %rax, %rbp
+; AVX512-NEXT: orq %r15, %rbp
+; AVX512-NEXT: orq %r12, %rbp
+; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX512-NEXT: shldq %cl, %rdx, %r11
+; AVX512-NEXT: movq 40(%rdi), %rax
+; AVX512-NEXT: movq %rax, %rcx
+; AVX512-NEXT: andq %r10, %rcx
+; AVX512-NEXT: movq 8(%rdi), %r15
+; AVX512-NEXT: movq %r15, %r12
+; AVX512-NEXT: andq %r11, %r12
+; AVX512-NEXT: orq %rcx, %r12
+; AVX512-NEXT: orq %r14, %r12
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; AVX512-NEXT: orq %rax, %r10
+; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; AVX512-NEXT: orq %r15, %r11
+; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; AVX512-NEXT: orq %rbp, %r12
+; AVX512-NEXT: movq %r8, 48(%rdi)
+; AVX512-NEXT: movq %r13, 56(%rdi)
+; AVX512-NEXT: movq %r9, 32(%rdi)
+; AVX512-NEXT: movq %r10, 40(%rdi)
+; AVX512-NEXT: movq %rsi, 16(%rdi)
+; AVX512-NEXT: movq %rcx, 24(%rdi)
+; AVX512-NEXT: movq %rbx, (%rdi)
+; AVX512-NEXT: movq %r11, 8(%rdi)
+; AVX512-NEXT: setne %al
+; AVX512-NEXT: addq $72, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r12
+; AVX512-NEXT: popq %r13
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %r15
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%rem = and i32 %position, 511
%ofs = zext nneg i32 %rem to i512
%bit = shl nuw i512 1, %ofs
@@ -803,55 +3377,883 @@ define i1 @set_ne_i512(ptr %word, i32 %position) nounwind {
define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-LABEL: init_eq_i512:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: andl $60, %esi
-; X86-NEXT: movl (%edx,%esi), %edi
-; X86-NEXT: btl %ecx, %edi
-; X86-NEXT: setae %al
-; X86-NEXT: btrl %ecx, %edi
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $432, %esp # imm = 0x1B0
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: shrl $3, %edx
+; X86-NEXT: andl $60, %edx
+; X86-NEXT: leal {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl %edx, %esi
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 56(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 60(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 52(%esi), %eax
+; X86-NEXT: movl 48(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 24(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 4(%esi), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movzbl 16(%ebp), %ebx
+; X86-NEXT: movzbl %bl, %esi
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: leal {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $31, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl %cl, %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, (%edx,%esi)
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: shldl %cl, %edi, %edx
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 8(%ebp), %ebx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%ebx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%ebx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %eax, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%ebx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %eax, %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%ebx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 56(%ebx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %edx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 24(%ebx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %edx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 52(%ebx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%ebx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%ebx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%ebx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %edi
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 60(%ebx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%ebx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %eax, %edx
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%ebx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%ebx), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edi, %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%ebx), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ecx, %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 4(%ebx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl 56(%edi), %ebx
+; X86-NEXT: movl 60(%edi), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl %cl, %ebx, %eax
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 52(%edi), %eax
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 48(%edi), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: notl %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl 40(%edi), %ebx
+; X86-NEXT: movl 44(%edi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %ebx, %eax
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 36(%edi), %eax
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 32(%edi), %ebx
+; X86-NEXT: shldl %cl, %ebx, %eax
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 28(%edi), %eax
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 24(%edi), %ebx
+; X86-NEXT: shldl %cl, %ebx, %eax
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 20(%edi), %eax
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 16(%edi), %ebx
+; X86-NEXT: shldl %cl, %ebx, %eax
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 12(%edi), %eax
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: notl %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl 8(%edi), %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: notl %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl 4(%edi), %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl %cl, %ebx, %edx
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl (%edi), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: notl %edi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 60(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 56(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 52(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 44(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 40(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 36(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 32(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 28(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 24(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 20(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 16(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 12(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 8(%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: movl %esi, 48(%eax)
+; X86-NEXT: sete %al
+; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; SSE-LABEL: init_eq_i512:
; SSE: # %bb.0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: pushq %r15
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %r13
+; SSE-NEXT: pushq %r12
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: subq $216, %rsp
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp)
; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: andl $63, %ecx
; SSE-NEXT: shrl $3, %esi
-; SSE-NEXT: andl $60, %esi
-; SSE-NEXT: movl (%rdi,%rsi), %r8d
-; SSE-NEXT: btl %ecx, %r8d
-; SSE-NEXT: setae %al
-; SSE-NEXT: shll %cl, %edx
-; SSE-NEXT: btrl %ecx, %r8d
-; SSE-NEXT: orl %r8d, %edx
-; SSE-NEXT: movl %edx, (%rdi,%rsi)
+; SSE-NEXT: andl $56, %esi
+; SSE-NEXT: negl %esi
+; SSE-NEXT: movslq %esi, %r10
+; SSE-NEXT: movq 184(%rsp,%r10), %r11
+; SSE-NEXT: movq 192(%rsp,%r10), %rsi
+; SSE-NEXT: movq %rsi, %r13
+; SSE-NEXT: shldq %cl, %r11, %r13
+; SSE-NEXT: movq 200(%rsp,%r10), %r15
+; SSE-NEXT: shldq %cl, %rsi, %r15
+; SSE-NEXT: movq 168(%rsp,%r10), %rbx
+; SSE-NEXT: movq 176(%rsp,%r10), %rsi
+; SSE-NEXT: movq %rsi, %r14
+; SSE-NEXT: shldq %cl, %rbx, %r14
+; SSE-NEXT: shldq %cl, %rsi, %r11
+; SSE-NEXT: movq 152(%rsp,%r10), %rax
+; SSE-NEXT: movq 160(%rsp,%r10), %r8
+; SSE-NEXT: movq %r8, %r12
+; SSE-NEXT: shldq %cl, %rax, %r12
+; SSE-NEXT: shldq %cl, %r8, %rbx
+; SSE-NEXT: movq 144(%rsp,%r10), %r9
+; SSE-NEXT: movq %r9, %r8
+; SSE-NEXT: shlq %cl, %r8
+; SSE-NEXT: shldq %cl, %r9, %rax
+; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movl %edx, %edx
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, (%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq 16(%rdi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 48(%rdi), %rsi
+; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rsi, %r13
+; SSE-NEXT: andq %rdx, %r12
+; SSE-NEXT: orq %r13, %r12
+; SSE-NEXT: movq %r15, %rsi
+; SSE-NEXT: movq 56(%rdi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rdx, %r15
+; SSE-NEXT: movq %rbx, %r13
+; SSE-NEXT: movq 24(%rdi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rdx, %rbx
+; SSE-NEXT: orq %r15, %rbx
+; SSE-NEXT: movq %r14, %rbp
+; SSE-NEXT: movq 32(%rdi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rdx, %r14
+; SSE-NEXT: movq %r8, %r15
+; SSE-NEXT: movq (%rdi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rdx, %r8
+; SSE-NEXT: orq %r14, %r8
+; SSE-NEXT: orq %r12, %r8
+; SSE-NEXT: movq %r11, %r12
+; SSE-NEXT: movq 40(%rdi), %r9
+; SSE-NEXT: andq %r9, %r11
+; SSE-NEXT: movq %rax, %r14
+; SSE-NEXT: movq 8(%rdi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rdx, %rax
+; SSE-NEXT: orq %r11, %rax
+; SSE-NEXT: orq %rbx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: notq %rax
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; SSE-NEXT: movq %rax, %rdx
+; SSE-NEXT: movq 56(%rsp,%r10), %r11
+; SSE-NEXT: movq 64(%rsp,%r10), %rax
+; SSE-NEXT: movq %rax, %rbx
+; SSE-NEXT: shldq %cl, %r11, %rbx
+; SSE-NEXT: orq %rbx, %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: notq %rsi
+; SSE-NEXT: movq 72(%rsp,%r10), %rbx
+; SSE-NEXT: shldq %cl, %rax, %rbx
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; SSE-NEXT: orq %rbx, %rsi
+; SSE-NEXT: notq %rbp
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; SSE-NEXT: movq 40(%rsp,%r10), %rax
+; SSE-NEXT: movq 48(%rsp,%r10), %rdx
+; SSE-NEXT: movq %rdx, %rbx
+; SSE-NEXT: shldq %cl, %rax, %rbx
+; SSE-NEXT: orq %rbx, %rbp
+; SSE-NEXT: notq %r12
+; SSE-NEXT: andq %r9, %r12
+; SSE-NEXT: shldq %cl, %rdx, %r11
+; SSE-NEXT: movq 24(%rsp,%r10), %r9
+; SSE-NEXT: movq 32(%rsp,%r10), %rdx
+; SSE-NEXT: movq %rdx, %rbx
+; SSE-NEXT: shldq %cl, %r9, %rbx
+; SSE-NEXT: orq %r11, %r12
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE-NEXT: notq %r11
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: orq %rbx, %r11
+; SSE-NEXT: notq %r13
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; SSE-NEXT: orq %rax, %r13
+; SSE-NEXT: notq %r15
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; SSE-NEXT: movq 16(%rsp,%r10), %rax
+; SSE-NEXT: movq %rax, %rdx
+; SSE-NEXT: shlq %cl, %rdx
+; SSE-NEXT: orq %rdx, %r15
+; SSE-NEXT: notq %r14
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT: shldq %cl, %rax, %r9
+; SSE-NEXT: orq %r9, %r14
+; SSE-NEXT: orq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: movq %rax, 48(%rdi)
+; SSE-NEXT: movq %rsi, 56(%rdi)
+; SSE-NEXT: movq %rbp, 32(%rdi)
+; SSE-NEXT: movq %r12, 40(%rdi)
+; SSE-NEXT: movq %r11, 16(%rdi)
+; SSE-NEXT: movq %r13, 24(%rdi)
+; SSE-NEXT: movq %r15, (%rdi)
+; SSE-NEXT: movq %r14, 8(%rdi)
+; SSE-NEXT: sete %al
+; SSE-NEXT: addq $216, %rsp
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r12
+; SSE-NEXT: popq %r13
+; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %r15
+; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
-; AVX-LABEL: init_eq_i512:
-; AVX: # %bb.0:
-; AVX-NEXT: movl %esi, %ecx
-; AVX-NEXT: shrl $3, %ecx
-; AVX-NEXT: andl $60, %ecx
-; AVX-NEXT: movl (%rdi,%rcx), %r8d
-; AVX-NEXT: btl %esi, %r8d
-; AVX-NEXT: setae %al
-; AVX-NEXT: btrl %esi, %r8d
-; AVX-NEXT: shlxl %esi, %edx, %edx
-; AVX-NEXT: orl %r8d, %edx
-; AVX-NEXT: movl %edx, (%rdi,%rcx)
-; AVX-NEXT: retq
+; AVX2-LABEL: init_eq_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: subq $200, %rsp
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [1,0,0,0]
+; AVX2-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl %esi, %r8d
+; AVX2-NEXT: andl $63, %r8d
+; AVX2-NEXT: shrl $3, %esi
+; AVX2-NEXT: andl $56, %esi
+; AVX2-NEXT: negl %esi
+; AVX2-NEXT: movslq %esi, %rsi
+; AVX2-NEXT: movq 144(%rsp,%rsi), %r11
+; AVX2-NEXT: movq 152(%rsp,%rsi), %r12
+; AVX2-NEXT: movq %r12, %r10
+; AVX2-NEXT: movl %r8d, %ecx
+; AVX2-NEXT: shldq %cl, %r11, %r10
+; AVX2-NEXT: movq 176(%rsp,%rsi), %r14
+; AVX2-NEXT: movq 184(%rsp,%rsi), %r9
+; AVX2-NEXT: shldq %cl, %r14, %r9
+; AVX2-NEXT: movq 160(%rsp,%rsi), %r15
+; AVX2-NEXT: movq 168(%rsp,%rsi), %r13
+; AVX2-NEXT: movq %r13, %rbx
+; AVX2-NEXT: shldq %cl, %r15, %rbx
+; AVX2-NEXT: movq 128(%rsp,%rsi), %rbp
+; AVX2-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 136(%rsp,%rsi), %rax
+; AVX2-NEXT: shldq %cl, %rax, %r11
+; AVX2-NEXT: shldq %cl, %r13, %r14
+; AVX2-NEXT: shldq %cl, %r12, %r15
+; AVX2-NEXT: shldq %cl, %rbp, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movl %edx, %edx
+; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rdx, (%rsp)
+; AVX2-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq 16(%rdi), %r12
+; AVX2-NEXT: movq 48(%rdi), %rbp
+; AVX2-NEXT: movq 32(%rdi), %r13
+; AVX2-NEXT: andnq %r13, %r15, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %r15, %r13
+; AVX2-NEXT: andnq %rbp, %r14, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %r14, %rbp
+; AVX2-NEXT: andnq %r12, %r11, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %r11, %r12
+; AVX2-NEXT: movq 40(%rdi), %rax
+; AVX2-NEXT: orq %rbp, %r12
+; AVX2-NEXT: andnq %rax, %rbx, %rcx
+; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq %rax, %rbp
+; AVX2-NEXT: andq %rbx, %rbp
+; AVX2-NEXT: movq 56(%rdi), %rcx
+; AVX2-NEXT: andnq %rcx, %r9, %rbx
+; AVX2-NEXT: andq %r9, %rcx
+; AVX2-NEXT: movq 24(%rdi), %rax
+; AVX2-NEXT: andnq %rax, %r10, %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %r10, %rax
+; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: shlxq %r8, {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; AVX2-NEXT: movq (%rdi), %r10
+; AVX2-NEXT: andnq %r10, %rcx, %r15
+; AVX2-NEXT: andq %rcx, %r10
+; AVX2-NEXT: movq 40(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq 48(%rsp,%rsi), %r11
+; AVX2-NEXT: movq %r11, %r9
+; AVX2-NEXT: movl %r8d, %ecx
+; AVX2-NEXT: shldq %cl, %rdx, %r9
+; AVX2-NEXT: orq %r13, %r10
+; AVX2-NEXT: orq %r12, %r10
+; AVX2-NEXT: movq 8(%rdi), %r13
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX2-NEXT: andnq %r13, %rcx, %r12
+; AVX2-NEXT: andq %rcx, %r13
+; AVX2-NEXT: orq %rbp, %r13
+; AVX2-NEXT: orq %rax, %r13
+; AVX2-NEXT: movq 56(%rsp,%rsi), %rax
+; AVX2-NEXT: movl %r8d, %ecx
+; AVX2-NEXT: shldq %cl, %r11, %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: orq %r9, %r14
+; AVX2-NEXT: orq %rax, %rbx
+; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 24(%rsp,%rsi), %rax
+; AVX2-NEXT: movq 32(%rsp,%rsi), %r9
+; AVX2-NEXT: movq %r9, %r11
+; AVX2-NEXT: shldq %cl, %rax, %r11
+; AVX2-NEXT: shldq %cl, %r9, %rdx
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; AVX2-NEXT: orq %r11, %rbp
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX2-NEXT: orq %rdx, %rbx
+; AVX2-NEXT: movq 8(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq 16(%rsp,%rsi), %r9
+; AVX2-NEXT: movq %r9, %r11
+; AVX2-NEXT: shldq %cl, %rdx, %r11
+; AVX2-NEXT: shldq %cl, %r9, %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; AVX2-NEXT: orq %r11, %r9
+; AVX2-NEXT: movq (%rsp,%rsi), %rsi
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX2-NEXT: orq %rax, %r11
+; AVX2-NEXT: shlxq %r8, %rsi, %rax
+; AVX2-NEXT: shldq %cl, %rsi, %rdx
+; AVX2-NEXT: orq %rax, %r15
+; AVX2-NEXT: orq %rdx, %r12
+; AVX2-NEXT: orq %r10, %r13
+; AVX2-NEXT: movq %r14, 48(%rdi)
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: movq %rax, 56(%rdi)
+; AVX2-NEXT: movq %rbp, 32(%rdi)
+; AVX2-NEXT: movq %rbx, 40(%rdi)
+; AVX2-NEXT: movq %r9, 16(%rdi)
+; AVX2-NEXT: movq %r11, 24(%rdi)
+; AVX2-NEXT: movq %r15, (%rdi)
+; AVX2-NEXT: movq %r12, 8(%rdi)
+; AVX2-NEXT: sete %al
+; AVX2-NEXT: addq $200, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: init_eq_i512:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: pushq %r15
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %r13
+; AVX512-NEXT: pushq %r12
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: subq $184, %rsp
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [1,0,0,0]
+; AVX512-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movl %esi, %ecx
+; AVX512-NEXT: andl $63, %ecx
+; AVX512-NEXT: shrl $3, %esi
+; AVX512-NEXT: andl $56, %esi
+; AVX512-NEXT: negl %esi
+; AVX512-NEXT: movslq %esi, %rsi
+; AVX512-NEXT: movq 128(%rsp,%rsi), %r10
+; AVX512-NEXT: movq 136(%rsp,%rsi), %r12
+; AVX512-NEXT: movq %r12, %rax
+; AVX512-NEXT: shldq %cl, %r10, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 160(%rsp,%rsi), %r14
+; AVX512-NEXT: movq 168(%rsp,%rsi), %rax
+; AVX512-NEXT: shldq %cl, %r14, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 144(%rsp,%rsi), %r15
+; AVX512-NEXT: movq 152(%rsp,%rsi), %r11
+; AVX512-NEXT: movq %r11, %rbx
+; AVX512-NEXT: shldq %cl, %r15, %rbx
+; AVX512-NEXT: movq 120(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rax, %r10
+; AVX512-NEXT: shldq %cl, %r11, %r14
+; AVX512-NEXT: movq %rdi, %r9
+; AVX512-NEXT: movq 112(%rsp,%rsi), %r11
+; AVX512-NEXT: shldq %cl, %r12, %r15
+; AVX512-NEXT: movl %edx, %edx
+; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq 16(%rdi), %r12
+; AVX512-NEXT: movq 48(%rdi), %r13
+; AVX512-NEXT: movq 32(%rdi), %rbp
+; AVX512-NEXT: andnq %rbp, %r15, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq %r15, %rbp
+; AVX512-NEXT: andnq %r13, %r14, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq %r14, %r13
+; AVX512-NEXT: andnq %r12, %r10, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq %r10, %r12
+; AVX512-NEXT: movq 40(%rdi), %r8
+; AVX512-NEXT: orq %r13, %r12
+; AVX512-NEXT: andnq %r8, %rbx, %rdi
+; AVX512-NEXT: andq %rbx, %r8
+; AVX512-NEXT: movq 56(%r9), %r13
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX512-NEXT: andnq %r13, %rdx, %r10
+; AVX512-NEXT: andq %rdx, %r13
+; AVX512-NEXT: movq 24(%r9), %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX512-NEXT: andnq %rax, %rdx, %r15
+; AVX512-NEXT: andq %rdx, %rax
+; AVX512-NEXT: orq %r13, %rax
+; AVX512-NEXT: shlxq %rcx, %r11, %r13
+; AVX512-NEXT: movq (%r9), %rdx
+; AVX512-NEXT: andnq %rdx, %r13, %r14
+; AVX512-NEXT: andq %r13, %rdx
+; AVX512-NEXT: orq %rbp, %rdx
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r11, %rbp
+; AVX512-NEXT: orq %r12, %rdx
+; AVX512-NEXT: movq 8(%r9), %r13
+; AVX512-NEXT: andnq %r13, %rbp, %rbx
+; AVX512-NEXT: andq %rbp, %r13
+; AVX512-NEXT: orq %r8, %r13
+; AVX512-NEXT: movq 24(%rsp,%rsi), %r8
+; AVX512-NEXT: orq %rax, %r13
+; AVX512-NEXT: movq 32(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, %r12
+; AVX512-NEXT: shldq %cl, %r8, %r12
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX512-NEXT: orq %r12, %r11
+; AVX512-NEXT: movq 40(%rsp,%rsi), %r12
+; AVX512-NEXT: shldq %cl, %rax, %r12
+; AVX512-NEXT: orq %r12, %r10
+; AVX512-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 8(%rsp,%rsi), %rax
+; AVX512-NEXT: movq 16(%rsp,%rsi), %r12
+; AVX512-NEXT: movq %r12, %rbp
+; AVX512-NEXT: shldq %cl, %rax, %rbp
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX512-NEXT: orq %rbp, %r10
+; AVX512-NEXT: shldq %cl, %r12, %r8
+; AVX512-NEXT: orq %r8, %rdi
+; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq -8(%rsp,%rsi), %r8
+; AVX512-NEXT: movq (%rsp,%rsi), %r12
+; AVX512-NEXT: movq %r12, %rbp
+; AVX512-NEXT: shldq %cl, %r8, %rbp
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; AVX512-NEXT: orq %rbp, %rdi
+; AVX512-NEXT: movq -16(%rsp,%rsi), %rsi
+; AVX512-NEXT: shldq %cl, %r12, %rax
+; AVX512-NEXT: orq %rax, %r15
+; AVX512-NEXT: shlxq %rcx, %rsi, %rax
+; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX512-NEXT: shldq %cl, %rsi, %r8
+; AVX512-NEXT: orq %rax, %r14
+; AVX512-NEXT: orq %r8, %rbx
+; AVX512-NEXT: orq %rdx, %r13
+; AVX512-NEXT: movq %r11, 48(%r9)
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: movq %rax, 56(%r9)
+; AVX512-NEXT: movq %r10, 32(%r9)
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: movq %rax, 40(%r9)
+; AVX512-NEXT: movq %rdi, 16(%r9)
+; AVX512-NEXT: movq %r15, 24(%r9)
+; AVX512-NEXT: movq %r14, (%r9)
+; AVX512-NEXT: movq %rbx, 8(%r9)
+; AVX512-NEXT: sete %al
+; AVX512-NEXT: addq $184, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r12
+; AVX512-NEXT: popq %r13
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %r15
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%rem = and i32 %position, 511
%ofs = zext nneg i32 %rem to i512
%bit = shl nuw i512 1, %ofs
@@ -872,25 +4274,2749 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind {
; X86-LABEL: test_ne_i4096:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: andl $4064, %edx # imm = 0xFE0
-; X86-NEXT: shrl $3, %edx
-; X86-NEXT: movl (%eax,%edx), %eax
-; X86-NEXT: btl %ecx, %eax
-; X86-NEXT: setb %al
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $1792, %esp # imm = 0x700
+; X86-NEXT: movl 12(%ebp), %ebx
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: shrl $3, %ecx
+; X86-NEXT: andl $508, %ecx # imm = 0x1FC
+; X86-NEXT: leal {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: subl %ecx, %esi
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 248(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 252(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $31, %ebx
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 504(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 508(%esi), %edx
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 120(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 124(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 376(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 380(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 184(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 188(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 440(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 444(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 56(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 60(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 312(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 316(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 216(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 220(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 472(%esi), %edi
+; X86-NEXT: movl 476(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 88(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 92(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 344(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 348(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 152(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 156(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 408(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 412(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 24(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 280(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 284(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 232(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 236(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 488(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 492(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 104(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 108(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 360(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 364(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 168(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 172(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 424(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 428(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 296(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 300(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 200(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 204(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 456(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 460(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 72(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 76(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 328(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 332(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 136(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 140(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 392(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 396(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 264(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 268(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 240(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 244(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 496(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 500(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 112(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 116(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 368(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 372(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 176(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 180(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 432(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 436(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 52(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 304(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 308(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 208(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 212(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 464(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 468(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 80(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 84(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 336(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 340(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 144(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 148(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 400(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 404(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 272(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 276(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 224(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 228(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 480(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 484(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 96(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 100(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 352(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 356(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 160(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 164(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 416(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 420(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 288(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 292(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 192(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 196(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 448(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 452(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 64(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 68(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 320(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 324(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 128(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 132(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: movl 256(%esi), %edi
+; X86-NEXT: movl 260(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl 388(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl 4(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shrdl $1, %eax, %edi
+; X86-NEXT: shrl %eax
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: notb %cl
+; X86-NEXT: shrdl %cl, %eax, %edi
+; X86-NEXT: shrl %cl, %ebx
+; X86-NEXT: movb $32, %cl
+; X86-NEXT: testb %cl, %cl
+; X86-NEXT: movl (%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: jne .LBB20_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: .LBB20_2:
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 320(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 64(%eax), %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 448(%eax), %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 192(%eax), %ecx
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 288(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 32(%eax), %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 416(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 160(%eax), %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 352(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 96(%eax), %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 480(%eax), %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 224(%eax), %ecx
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 272(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 16(%eax), %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 400(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 144(%eax), %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 336(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 80(%eax), %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 464(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 208(%eax), %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 304(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 48(%eax), %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 432(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 176(%eax), %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 368(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 112(%eax), %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 496(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: andl 240(%eax), %ebx
+; X86-NEXT: orl %ecx, %ebx
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 264(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 8(%eax), %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 392(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 136(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 328(%ebx), %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 72(%ebx), %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 456(%ebx), %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 200(%ebx), %esi
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 296(%ebx), %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 40(%ebx), %eax
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 424(%ebx), %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 168(%ebx), %edx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 360(%ebx), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 104(%ebx), %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 488(%ebx), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 232(%ebx), %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 280(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 24(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 408(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 152(%ebx), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 344(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 88(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 472(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 216(%ebx), %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 312(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 56(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 440(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 184(%ebx), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 376(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 120(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 504(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 248(%ebx), %edi
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 324(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 68(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 452(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 196(%ebx), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 292(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 36(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 420(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 164(%ebx), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 356(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 100(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 484(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 228(%ebx), %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 276(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 20(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 404(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 148(%ebx), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 340(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 84(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 468(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 212(%ebx), %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 308(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 52(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 436(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 180(%ebx), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 372(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 116(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 500(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 244(%ebx), %edi
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 268(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 12(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 396(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 140(%ebx), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 332(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 76(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 460(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 204(%ebx), %edi
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 300(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 44(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 428(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 172(%ebx), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 364(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 108(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 492(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 236(%ebx), %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 284(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 28(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 412(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 156(%ebx), %edi
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 348(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 92(%ebx), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 476(%ebx), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 220(%ebx), %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 316(%ebx), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 60(%ebx), %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 444(%ebx), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 188(%ebx), %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 380(%ebx), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 124(%ebx), %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 508(%ebx), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl 8(%ebp), %esi
+; X86-NEXT: andl 252(%esi), %ebx
+; X86-NEXT: orl %ecx, %ebx
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: orl %eax, %ebx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: negl %ecx
+; X86-NEXT: movl 1648(%esp,%ecx), %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl 8(%ebp), %edx
+; X86-NEXT: andl 128(%edx), %ecx
+; X86-NEXT: andl 384(%edx), %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: andl (%edx), %eax
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 256(%edx), %eax
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 260(%edx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 4(%edx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 132(%edx), %eax
+; X86-NEXT: andl 388(%edx), %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: setne %al
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; X64-LABEL: test_ne_i4096:
-; X64: # %bb.0:
-; X64-NEXT: movl %esi, %eax
-; X64-NEXT: andl $4064, %eax # imm = 0xFE0
-; X64-NEXT: shrl $3, %eax
-; X64-NEXT: movl (%rdi,%rax), %eax
-; X64-NEXT: btl %esi, %eax
-; X64-NEXT: setb %al
-; X64-NEXT: retq
+; SSE-LABEL: test_ne_i4096:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: pushq %r15
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %r13
+; SSE-NEXT: pushq %r12
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: subq $1576, %rsp # imm = 0x628
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: movl %esi, %eax
+; SSE-NEXT: andl $4032, %eax # imm = 0xFC0
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: andl $63, %ecx
+; SSE-NEXT: shrl $3, %eax
+; SSE-NEXT: negl %eax
+; SSE-NEXT: movslq %eax, %rsi
+; SSE-NEXT: movq 1296(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1304(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1552(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1560(%rsp,%rsi), %rax
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1168(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1176(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1424(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1432(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1232(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1240(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1488(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1496(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1104(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1112(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1360(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, (%rsp) # 8-byte Spill
+; SSE-NEXT: movq 1368(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1264(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1272(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1520(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1528(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1136(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1144(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1392(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1400(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1200(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1208(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1456(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1464(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1072(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1080(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1328(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1336(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1280(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1288(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1536(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1544(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1152(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1160(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1408(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1416(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1216(%rsp,%rsi), %r11
+; SSE-NEXT: movq 1224(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %r11, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1472(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1480(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1088(%rsp,%rsi), %r9
+; SSE-NEXT: movq 1096(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %r9, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1344(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1352(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1248(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1256(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rax, %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1504(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1512(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1120(%rsp,%rsi), %rax
+; SSE-NEXT: movq 1128(%rsp,%rsi), %r8
+; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rax, %r8
+; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1376(%rsp,%rsi), %r13
+; SSE-NEXT: movq 1384(%rsp,%rsi), %rbx
+; SSE-NEXT: movq %rbx, %r8
+; SSE-NEXT: shldq %cl, %r13, %r8
+; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1184(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1192(%rsp,%rsi), %r15
+; SSE-NEXT: movq %r15, %r14
+; SSE-NEXT: shldq %cl, %rdx, %r14
+; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1440(%rsp,%rsi), %r10
+; SSE-NEXT: movq 1448(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, %r14
+; SSE-NEXT: shldq %cl, %r10, %r14
+; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1312(%rsp,%rsi), %r14
+; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1320(%rsp,%rsi), %rbp
+; SSE-NEXT: movq %rbp, %r12
+; SSE-NEXT: shldq %cl, %r14, %r12
+; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r8, (%rsp) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq 1064(%rsp,%rsi), %rbx
+; SSE-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %rbp, %r14
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: shldq %cl, %rdx, %r11
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r15, %rdx
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r15, %r9
+; SSE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r15, %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r15, %rbp
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r15, %r9
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r15, %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r15, %r13
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r12, %r15
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r12, %r10
+; SSE-NEXT: andq 384(%rdi), %r10
+; SSE-NEXT: andq 128(%rdi), %r15
+; SSE-NEXT: andq 320(%rdi), %r13
+; SSE-NEXT: andq 64(%rdi), %rax
+; SSE-NEXT: orq %r10, %r15
+; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: orq %r13, %rax
+; SSE-NEXT: andq 448(%rdi), %r9
+; SSE-NEXT: andq 192(%rdi), %rbp
+; SSE-NEXT: orq %r9, %rbp
+; SSE-NEXT: orq %rax, %rbp
+; SSE-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq 288(%rdi), %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; SSE-NEXT: andq 32(%rdi), %r9
+; SSE-NEXT: andq 416(%rdi), %rdx
+; SSE-NEXT: andq 160(%rdi), %r11
+; SSE-NEXT: orq %r8, %r9
+; SSE-NEXT: orq %rdx, %r11
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: andq 352(%rdi), %rdx
+; SSE-NEXT: orq %r9, %r11
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 96(%rdi), %rax
+; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: movq %rax, %rdx
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 480(%rdi), %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: andq 224(%rdi), %r8
+; SSE-NEXT: orq %rax, %r8
+; SSE-NEXT: orq %rdx, %r8
+; SSE-NEXT: andq 272(%rdi), %r14
+; SSE-NEXT: orq %r11, %r8
+; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 16(%rdi), %rax
+; SSE-NEXT: orq %r14, %rax
+; SSE-NEXT: movq %rax, %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: andq 400(%rdi), %rdx
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 144(%rdi), %rax
+; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: orq %r8, %rax
+; SSE-NEXT: movq %rax, %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; SSE-NEXT: andq 336(%rdi), %r9
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 80(%rdi), %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: andq 464(%rdi), %rdx
+; SSE-NEXT: orq %r9, %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE-NEXT: andq 208(%rdi), %r11
+; SSE-NEXT: orq %rdx, %r11
+; SSE-NEXT: orq %rax, %r11
+; SSE-NEXT: orq %r8, %r11
+; SSE-NEXT: movq (%rsp), %rdx # 8-byte Reload
+; SSE-NEXT: andq 304(%rdi), %rdx
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 48(%rdi), %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; SSE-NEXT: andq 432(%rdi), %r9
+; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: movq %rax, %r10
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: andq 176(%rdi), %r8
+; SSE-NEXT: orq %r9, %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; SSE-NEXT: andq 368(%rdi), %r9
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 112(%rdi), %rax
+; SSE-NEXT: orq %r10, %r8
+; SSE-NEXT: movq %r8, %r10
+; SSE-NEXT: orq %r9, %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: andq 496(%rdi), %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; SSE-NEXT: andq 240(%rdi), %rbp
+; SSE-NEXT: orq %r8, %rbp
+; SSE-NEXT: orq %rax, %rbp
+; SSE-NEXT: orq %r10, %rbp
+; SSE-NEXT: orq %r11, %rbp
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 392(%rdi), %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; SSE-NEXT: andq 136(%rdi), %r12
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: andq 328(%rdi), %rdx
+; SSE-NEXT: orq %rax, %r12
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 72(%rdi), %rax
+; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: movq %rax, %rdx
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 456(%rdi), %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; SSE-NEXT: andq 200(%rdi), %r13
+; SSE-NEXT: orq %rax, %r13
+; SSE-NEXT: orq %rdx, %r13
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: andq 296(%rdi), %rdx
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 40(%rdi), %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: andq 424(%rdi), %r8
+; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: movq %rax, %r9
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: andq 168(%rdi), %rdx
+; SSE-NEXT: orq %r8, %rdx
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: andq 360(%rdi), %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 104(%rdi), %rax
+; SSE-NEXT: orq %r9, %rdx
+; SSE-NEXT: orq %r8, %rax
+; SSE-NEXT: movq %rax, %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 488(%rdi), %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: andq 232(%rdi), %r15
+; SSE-NEXT: orq %rax, %r15
+; SSE-NEXT: orq %r8, %r15
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: andq 280(%rdi), %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 24(%rdi), %rax
+; SSE-NEXT: orq %rdx, %r15
+; SSE-NEXT: orq %r8, %rax
+; SSE-NEXT: movq %rax, %r10
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: andq 408(%rdi), %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 152(%rdi), %rax
+; SSE-NEXT: orq %r8, %rax
+; SSE-NEXT: orq %r10, %rax
+; SSE-NEXT: movq %rax, %r10
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE-NEXT: andq 344(%rdi), %r11
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: andq 88(%rdi), %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 472(%rdi), %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; SSE-NEXT: andq 216(%rdi), %r14
+; SSE-NEXT: orq %r11, %r8
+; SSE-NEXT: orq %rax, %r14
+; SSE-NEXT: orq %r8, %r14
+; SSE-NEXT: orq %r10, %r14
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE-NEXT: andq 312(%rdi), %r11
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE-NEXT: andq 56(%rdi), %r10
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: andq 440(%rdi), %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; SSE-NEXT: andq 184(%rdi), %r9
+; SSE-NEXT: orq %r11, %r10
+; SSE-NEXT: orq %r8, %r9
+; SSE-NEXT: orq %r10, %r9
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: shldq %cl, %rax, %rdx
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE-NEXT: andq 376(%rdi), %r10
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 120(%rdi), %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE-NEXT: andq 504(%rdi), %r11
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: andq 248(%rdi), %r8
+; SSE-NEXT: orq %r10, %rax
+; SSE-NEXT: movq %rax, %r10
+; SSE-NEXT: orq %r11, %r8
+; SSE-NEXT: movq 1056(%rsp,%rsi), %rax
+; SSE-NEXT: shldq %cl, %rax, %rbx
+; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT: shlq %cl, %rax
+; SSE-NEXT: orq %r10, %r8
+; SSE-NEXT: orq %r9, %r8
+; SSE-NEXT: andq 256(%rdi), %rdx
+; SSE-NEXT: orq %r14, %r8
+; SSE-NEXT: andq (%rdi), %rax
+; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; SSE-NEXT: orq %rbp, %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE-NEXT: andq 264(%rdi), %rcx
+; SSE-NEXT: andq 8(%rdi), %rbx
+; SSE-NEXT: orq %rcx, %rbx
+; SSE-NEXT: orq %r12, %rbx
+; SSE-NEXT: orq %r13, %rbx
+; SSE-NEXT: orq %r15, %rbx
+; SSE-NEXT: orq %r8, %rbx
+; SSE-NEXT: orq %rax, %rbx
+; SSE-NEXT: setne %al
+; SSE-NEXT: addq $1576, %rsp # imm = 0x628
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r12
+; SSE-NEXT: popq %r13
+; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %r15
+; SSE-NEXT: popq %rbp
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test_ne_i4096:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: subq $1560, %rsp # imm = 0x618
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: movl %esi, %eax
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: andl $4032, %eax # imm = 0xFC0
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: shrl $3, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: movslq %eax, %rsi
+; AVX2-NEXT: movq 1280(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1288(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1536(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1544(%rsp,%rsi), %rax
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1152(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1160(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1408(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1416(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1216(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, (%rsp) # 8-byte Spill
+; AVX2-NEXT: movq 1224(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1472(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1480(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1088(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1096(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1344(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1352(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1248(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1256(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1504(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1512(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1120(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1128(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1376(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1384(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1184(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1192(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1440(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1448(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1056(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1064(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1312(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1320(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1264(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1272(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1520(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1528(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1136(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1144(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1392(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1400(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1200(%rsp,%rsi), %r11
+; AVX2-NEXT: movq 1208(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %r11, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1456(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1464(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1072(%rsp,%rsi), %r12
+; AVX2-NEXT: movq 1080(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %r12, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1328(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1336(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1232(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1240(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rax, %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1488(%rsp,%rsi), %rbp
+; AVX2-NEXT: movq 1496(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rbp, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1104(%rsp,%rsi), %rax
+; AVX2-NEXT: movq 1112(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rax, %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1360(%rsp,%rsi), %r10
+; AVX2-NEXT: movq 1368(%rsp,%rsi), %r8
+; AVX2-NEXT: movq %r8, %rdx
+; AVX2-NEXT: shldq %cl, %r10, %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1168(%rsp,%rsi), %r9
+; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1176(%rsp,%rsi), %rbx
+; AVX2-NEXT: movq %rbx, %rdx
+; AVX2-NEXT: shldq %cl, %r9, %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1424(%rsp,%rsi), %r9
+; AVX2-NEXT: movq 1432(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, %r14
+; AVX2-NEXT: shldq %cl, %r9, %r14
+; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1296(%rsp,%rsi), %r15
+; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1304(%rsp,%rsi), %r14
+; AVX2-NEXT: movq %r14, %r13
+; AVX2-NEXT: shldq %cl, %r15, %r13
+; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, (%rsp) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq 1048(%rsp,%rsi), %rdx
+; AVX2-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: shldq %cl, %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r14, %rbx
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r14, %r11
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r14, %r12
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r14, %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r14, %r13
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r14, %rbp
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r14, %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r14, %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, %r14
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, %r9
+; AVX2-NEXT: andq 384(%rdi), %r9
+; AVX2-NEXT: andq 128(%rdi), %r14
+; AVX2-NEXT: andq 320(%rdi), %r10
+; AVX2-NEXT: orq %r9, %r14
+; AVX2-NEXT: movq %r14, %r15
+; AVX2-NEXT: andq 64(%rdi), %rax
+; AVX2-NEXT: orq %r10, %rax
+; AVX2-NEXT: andq 448(%rdi), %rbp
+; AVX2-NEXT: andq 192(%rdi), %r13
+; AVX2-NEXT: orq %rbp, %r13
+; AVX2-NEXT: orq %rax, %r13
+; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq 288(%rdi), %r8
+; AVX2-NEXT: andq 32(%rdi), %r12
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 416(%rdi), %rax
+; AVX2-NEXT: orq %r8, %r12
+; AVX2-NEXT: andq 160(%rdi), %r11
+; AVX2-NEXT: orq %rax, %r11
+; AVX2-NEXT: andq 352(%rdi), %rbx
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 96(%rdi), %rax
+; AVX2-NEXT: orq %r12, %r11
+; AVX2-NEXT: orq %rbx, %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT: andq 480(%rdi), %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX2-NEXT: andq 224(%rdi), %r13
+; AVX2-NEXT: orq %r10, %r13
+; AVX2-NEXT: orq %rax, %r13
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 272(%rdi), %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 16(%rdi), %rax
+; AVX2-NEXT: orq %r11, %r13
+; AVX2-NEXT: orq %r8, %rax
+; AVX2-NEXT: movq %rax, %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; AVX2-NEXT: andq 400(%rdi), %r9
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 144(%rdi), %rax
+; AVX2-NEXT: orq %r9, %rax
+; AVX2-NEXT: orq %r8, %rax
+; AVX2-NEXT: movq %rax, %r9
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT: andq 336(%rdi), %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 80(%rdi), %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 464(%rdi), %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX2-NEXT: andq 208(%rdi), %r11
+; AVX2-NEXT: orq %r10, %rax
+; AVX2-NEXT: orq %r8, %r11
+; AVX2-NEXT: orq %rax, %r11
+; AVX2-NEXT: orq %r9, %r11
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; AVX2-NEXT: andq 304(%rdi), %r9
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 48(%rdi), %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT: andq 432(%rdi), %r10
+; AVX2-NEXT: movq (%rsp), %rax # 8-byte Reload
+; AVX2-NEXT: andq 176(%rdi), %rax
+; AVX2-NEXT: orq %r9, %r8
+; AVX2-NEXT: movq %r8, %r9
+; AVX2-NEXT: orq %r10, %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 368(%rdi), %r8
+; AVX2-NEXT: orq %r9, %rax
+; AVX2-NEXT: movq %rax, %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 112(%rdi), %rax
+; AVX2-NEXT: orq %r8, %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 496(%rdi), %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; AVX2-NEXT: andq 240(%rdi), %r9
+; AVX2-NEXT: orq %r8, %r9
+; AVX2-NEXT: orq %rax, %r9
+; AVX2-NEXT: orq %r10, %r9
+; AVX2-NEXT: orq %r11, %r9
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT: andq 392(%rdi), %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; AVX2-NEXT: andq 136(%rdi), %rbp
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 328(%rdi), %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 72(%rdi), %rax
+; AVX2-NEXT: orq %r10, %rbp
+; AVX2-NEXT: orq %r8, %rax
+; AVX2-NEXT: movq %rax, %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 456(%rdi), %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; AVX2-NEXT: andq 200(%rdi), %r12
+; AVX2-NEXT: orq %rax, %r12
+; AVX2-NEXT: orq %r8, %r12
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT: andq 296(%rdi), %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 40(%rdi), %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX2-NEXT: andq 424(%rdi), %r11
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 168(%rdi), %rax
+; AVX2-NEXT: orq %r10, %r8
+; AVX2-NEXT: movq %r8, %r10
+; AVX2-NEXT: orq %r11, %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 360(%rdi), %r8
+; AVX2-NEXT: orq %r10, %rax
+; AVX2-NEXT: movq %rax, %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 104(%rdi), %rax
+; AVX2-NEXT: orq %r8, %rax
+; AVX2-NEXT: movq %rax, %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 488(%rdi), %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: andq 232(%rdi), %r14
+; AVX2-NEXT: orq %rax, %r14
+; AVX2-NEXT: orq %r8, %r14
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 280(%rdi), %r8
+; AVX2-NEXT: orq %r10, %r14
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 24(%rdi), %rax
+; AVX2-NEXT: orq %r8, %rax
+; AVX2-NEXT: movq %rax, %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 408(%rdi), %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 152(%rdi), %rax
+; AVX2-NEXT: orq %r8, %rax
+; AVX2-NEXT: orq %r10, %rax
+; AVX2-NEXT: movq %rax, %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX2-NEXT: andq 344(%rdi), %r11
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 88(%rdi), %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 472(%rdi), %rax
+; AVX2-NEXT: orq %r11, %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX2-NEXT: andq 216(%rdi), %rbx
+; AVX2-NEXT: orq %rax, %rbx
+; AVX2-NEXT: orq %r8, %rbx
+; AVX2-NEXT: orq %r10, %rbx
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 312(%rdi), %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 56(%rdi), %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT: andq 440(%rdi), %r10
+; AVX2-NEXT: orq %r8, %rax
+; AVX2-NEXT: movq %rax, %r11
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 184(%rdi), %r8
+; AVX2-NEXT: orq %r10, %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT: andq 376(%rdi), %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 120(%rdi), %rax
+; AVX2-NEXT: orq %r11, %r8
+; AVX2-NEXT: movq %r8, %r11
+; AVX2-NEXT: orq %r10, %rax
+; AVX2-NEXT: movq %rax, %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 504(%rdi), %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 248(%rdi), %rax
+; AVX2-NEXT: orq %r8, %rax
+; AVX2-NEXT: orq %r10, %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r8, %r10
+; AVX2-NEXT: orq %r11, %rax
+; AVX2-NEXT: movq 1040(%rsp,%rsi), %rsi
+; AVX2-NEXT: orq %rbx, %rax
+; AVX2-NEXT: movq %rax, %r8
+; AVX2-NEXT: shlxq %rcx, %rsi, %rax
+; AVX2-NEXT: andq 256(%rdi), %r10
+; AVX2-NEXT: andq (%rdi), %rax
+; AVX2-NEXT: orq %r10, %rax
+; AVX2-NEXT: orq %r15, %rax
+; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; AVX2-NEXT: orq %r13, %rax
+; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX2-NEXT: shldq %cl, %rsi, %rdx
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX2-NEXT: andq 264(%rdi), %rcx
+; AVX2-NEXT: andq 8(%rdi), %rdx
+; AVX2-NEXT: orq %r9, %rax
+; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: orq %rbp, %rdx
+; AVX2-NEXT: orq %r12, %rdx
+; AVX2-NEXT: orq %r14, %rdx
+; AVX2-NEXT: orq %r8, %rdx
+; AVX2-NEXT: orq %rax, %rdx
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: addq $1560, %rsp # imm = 0x618
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_ne_i4096:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: pushq %r15
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %r13
+; AVX512-NEXT: pushq %r12
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: subq $1560, %rsp # imm = 0x618
+; AVX512-NEXT: movl %esi, %ecx
+; AVX512-NEXT: movl %esi, %eax
+; AVX512-NEXT: andl $4032, %eax # imm = 0xFC0
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: andl $63, %ecx
+; AVX512-NEXT: shrl $3, %eax
+; AVX512-NEXT: negl %eax
+; AVX512-NEXT: movslq %eax, %rsi
+; AVX512-NEXT: movq 1280(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1288(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1536(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1544(%rsp,%rsi), %rax
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1152(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1160(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1408(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1416(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1216(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, (%rsp) # 8-byte Spill
+; AVX512-NEXT: movq 1224(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1472(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1480(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1088(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1096(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1344(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1352(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1248(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1256(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1504(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1512(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1120(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1128(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1376(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1384(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1184(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1192(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1440(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1448(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1056(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1064(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1312(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1320(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1264(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1272(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1520(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1528(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1136(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1144(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1392(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1400(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1200(%rsp,%rsi), %r10
+; AVX512-NEXT: movq 1208(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %r10, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1456(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1464(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1072(%rsp,%rsi), %r14
+; AVX512-NEXT: movq 1080(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %r14, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1328(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1336(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1232(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1240(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rax, %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1488(%rsp,%rsi), %r12
+; AVX512-NEXT: movq 1496(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %r12, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1104(%rsp,%rsi), %rax
+; AVX512-NEXT: movq 1112(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rax, %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1360(%rsp,%rsi), %r11
+; AVX512-NEXT: movq 1368(%rsp,%rsi), %rbx
+; AVX512-NEXT: movq %rbx, %rdx
+; AVX512-NEXT: shldq %cl, %r11, %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1168(%rsp,%rsi), %r9
+; AVX512-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1176(%rsp,%rsi), %r8
+; AVX512-NEXT: movq %r8, %rdx
+; AVX512-NEXT: shldq %cl, %r9, %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1424(%rsp,%rsi), %r9
+; AVX512-NEXT: movq 1432(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, %r15
+; AVX512-NEXT: shldq %cl, %r9, %r15
+; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1296(%rsp,%rsi), %rbp
+; AVX512-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1304(%rsp,%rsi), %r15
+; AVX512-NEXT: movq %r15, %r13
+; AVX512-NEXT: shldq %cl, %rbp, %r13
+; AVX512-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r13, (%rsp) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq 1048(%rsp,%rsi), %rdx
+; AVX512-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r15, %rbx
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r15, %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r15, %r14
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r15, %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r15, %r13
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r15, %r12
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r15, %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r15, %r11
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %rbp, %r15
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %rbp, %r9
+; AVX512-NEXT: andq 384(%rdi), %r9
+; AVX512-NEXT: andq 128(%rdi), %r15
+; AVX512-NEXT: orq %r9, %r15
+; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq 320(%rdi), %r11
+; AVX512-NEXT: andq 64(%rdi), %rax
+; AVX512-NEXT: orq %r11, %rax
+; AVX512-NEXT: andq 448(%rdi), %r12
+; AVX512-NEXT: andq 192(%rdi), %r13
+; AVX512-NEXT: orq %r12, %r13
+; AVX512-NEXT: orq %rax, %r13
+; AVX512-NEXT: andq 288(%rdi), %r8
+; AVX512-NEXT: andq 32(%rdi), %r14
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 416(%rdi), %rax
+; AVX512-NEXT: orq %r8, %r14
+; AVX512-NEXT: andq 160(%rdi), %r10
+; AVX512-NEXT: orq %rax, %r10
+; AVX512-NEXT: andq 352(%rdi), %rbx
+; AVX512-NEXT: orq %r14, %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 96(%rdi), %rax
+; AVX512-NEXT: orq %rbx, %rax
+; AVX512-NEXT: movq %rax, %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 480(%rdi), %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: andq 224(%rdi), %r15
+; AVX512-NEXT: orq %rax, %r15
+; AVX512-NEXT: orq %r8, %r15
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 272(%rdi), %r8
+; AVX512-NEXT: orq %r10, %r15
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 16(%rdi), %rax
+; AVX512-NEXT: orq %r8, %rax
+; AVX512-NEXT: movq %rax, %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; AVX512-NEXT: andq 400(%rdi), %r9
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 144(%rdi), %rax
+; AVX512-NEXT: orq %r9, %rax
+; AVX512-NEXT: orq %r8, %rax
+; AVX512-NEXT: movq %rax, %r9
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX512-NEXT: andq 336(%rdi), %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 80(%rdi), %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 464(%rdi), %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX512-NEXT: andq 208(%rdi), %r11
+; AVX512-NEXT: orq %r10, %rax
+; AVX512-NEXT: orq %r8, %r11
+; AVX512-NEXT: orq %rax, %r11
+; AVX512-NEXT: orq %r9, %r11
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX512-NEXT: andq 304(%rdi), %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 48(%rdi), %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; AVX512-NEXT: andq 432(%rdi), %r9
+; AVX512-NEXT: movq (%rsp), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 176(%rdi), %r8
+; AVX512-NEXT: orq %r10, %rax
+; AVX512-NEXT: movq %rax, %r10
+; AVX512-NEXT: orq %r9, %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; AVX512-NEXT: andq 368(%rdi), %r9
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 112(%rdi), %rax
+; AVX512-NEXT: orq %r10, %r8
+; AVX512-NEXT: movq %r8, %r10
+; AVX512-NEXT: orq %r9, %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 496(%rdi), %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; AVX512-NEXT: andq 240(%rdi), %r9
+; AVX512-NEXT: orq %r8, %r9
+; AVX512-NEXT: orq %rax, %r9
+; AVX512-NEXT: orq %r10, %r9
+; AVX512-NEXT: orq %r11, %r9
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX512-NEXT: andq 392(%rdi), %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; AVX512-NEXT: andq 136(%rdi), %rbp
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 328(%rdi), %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 72(%rdi), %rax
+; AVX512-NEXT: orq %r10, %rbp
+; AVX512-NEXT: orq %r8, %rax
+; AVX512-NEXT: movq %rax, %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 456(%rdi), %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; AVX512-NEXT: andq 200(%rdi), %r12
+; AVX512-NEXT: orq %rax, %r12
+; AVX512-NEXT: orq %r8, %r12
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 296(%rdi), %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 40(%rdi), %rax
+; AVX512-NEXT: orq %r8, %rax
+; AVX512-NEXT: movq %rax, %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 424(%rdi), %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 168(%rdi), %rax
+; AVX512-NEXT: orq %r8, %rax
+; AVX512-NEXT: orq %r10, %rax
+; AVX512-NEXT: movq %rax, %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 360(%rdi), %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 104(%rdi), %rax
+; AVX512-NEXT: orq %r8, %rax
+; AVX512-NEXT: movq %rax, %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 488(%rdi), %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX512-NEXT: andq 232(%rdi), %r14
+; AVX512-NEXT: orq %rax, %r14
+; AVX512-NEXT: orq %r8, %r14
+; AVX512-NEXT: orq %r10, %r14
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 280(%rdi), %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 24(%rdi), %rax
+; AVX512-NEXT: orq %r8, %rax
+; AVX512-NEXT: movq %rax, %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 408(%rdi), %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 152(%rdi), %rax
+; AVX512-NEXT: orq %r8, %rax
+; AVX512-NEXT: orq %r10, %rax
+; AVX512-NEXT: movq %rax, %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX512-NEXT: andq 344(%rdi), %r11
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 88(%rdi), %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 472(%rdi), %rax
+; AVX512-NEXT: orq %r11, %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX512-NEXT: andq 216(%rdi), %rbx
+; AVX512-NEXT: orq %rax, %rbx
+; AVX512-NEXT: orq %r8, %rbx
+; AVX512-NEXT: orq %r10, %rbx
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX512-NEXT: andq 312(%rdi), %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 56(%rdi), %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 440(%rdi), %r8
+; AVX512-NEXT: orq %r10, %rax
+; AVX512-NEXT: movq %rax, %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 184(%rdi), %rax
+; AVX512-NEXT: orq %r8, %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 376(%rdi), %r8
+; AVX512-NEXT: orq %r10, %rax
+; AVX512-NEXT: movq %rax, %r11
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 120(%rdi), %rax
+; AVX512-NEXT: orq %r8, %rax
+; AVX512-NEXT: movq %rax, %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 504(%rdi), %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 248(%rdi), %r8
+; AVX512-NEXT: orq %rax, %r8
+; AVX512-NEXT: orq %r10, %r8
+; AVX512-NEXT: orq %r11, %r8
+; AVX512-NEXT: movq 1040(%rsp,%rsi), %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %rsi, %r10
+; AVX512-NEXT: orq %rbx, %r8
+; AVX512-NEXT: shlxq %rcx, %rax, %rsi
+; AVX512-NEXT: andq 256(%rdi), %r10
+; AVX512-NEXT: andq (%rdi), %rsi
+; AVX512-NEXT: orq %r10, %rsi
+; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; AVX512-NEXT: orq %r13, %rsi
+; AVX512-NEXT: orq %r15, %rsi
+; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX512-NEXT: shldq %cl, %rax, %rdx
+; AVX512-NEXT: orq %r9, %rsi
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 264(%rdi), %rax
+; AVX512-NEXT: andq 8(%rdi), %rdx
+; AVX512-NEXT: orq %rax, %rdx
+; AVX512-NEXT: orq %rbp, %rdx
+; AVX512-NEXT: orq %r12, %rdx
+; AVX512-NEXT: orq %r14, %rdx
+; AVX512-NEXT: orq %r8, %rdx
+; AVX512-NEXT: orq %rsi, %rdx
+; AVX512-NEXT: setne %al
+; AVX512-NEXT: addq $1560, %rsp # imm = 0x618
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r12
+; AVX512-NEXT: popq %r13
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %r15
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%rem = and i32 %position, 4095
%ofs = zext nneg i32 %rem to i4096
%bit = shl nuw i4096 1, %ofs
@@ -1035,8 +7161,8 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: subl $80, %esp
+; X86-NEXT: movzbl 12(%ebp), %ecx
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -1049,41 +7175,52 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
; X86-NEXT: shrb $3, %al
; X86-NEXT: andb $12, %al
; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %edi
-; X86-NEXT: movl 36(%esp,%edi), %edx
-; X86-NEXT: movl 40(%esp,%edi), %ebx
-; X86-NEXT: movl %ebx, %esi
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: movl 32(%esp,%edi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%esp,%edi), %edi
-; X86-NEXT: shldl %cl, %ebx, %edi
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %ebx
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 16(%ebp), %eax
-; X86-NEXT: movl (%eax), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%ebp), %eax
-; X86-NEXT: andl $96, %eax
-; X86-NEXT: shrl $3, %eax
-; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: movl (%ecx,%eax), %eax
-; X86-NEXT: andl %ebx, (%ecx)
-; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl 56(%esp,%eax), %esi
+; X86-NEXT: movl 60(%esp,%eax), %edx
+; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%esp,%eax), %edi
+; X86-NEXT: movl 52(%esp,%eax), %eax
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl 8(%ebp), %ebx
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movl 8(%ebx), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %ecx
+; X86-NEXT: movl (%ebx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edi, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: movl 12(%ebx), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl 4(%ebx), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: notl %ecx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl %cl, %ebx, %edx
+; X86-NEXT: notl %ebx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: notl %edx
-; X86-NEXT: movl 8(%ebp), %ebx
-; X86-NEXT: andl %edx, 4(%ebx)
-; X86-NEXT: notl %esi
-; X86-NEXT: andl %esi, 8(%ebx)
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: notl %edi
-; X86-NEXT: andl %edi, 12(%ebx)
-; X86-NEXT: btl %ecx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: jae .LBB22_2
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: movl (%eax), %eax
+; X86-NEXT: movl 8(%ebp), %esi
+; X86-NEXT: movl %ebx, 8(%esi)
+; X86-NEXT: movl %ecx, 12(%esi)
+; X86-NEXT: movl %edi, (%esi)
+; X86-NEXT: movl %edx, 4(%esi)
+; X86-NEXT: je .LBB22_2
; X86-NEXT: # %bb.1:
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: .LBB22_2:
@@ -1105,75 +7242,52 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
; SSE-NEXT: testb $64, %cl
; SSE-NEXT: cmovneq %rsi, %r8
; SSE-NEXT: cmovneq %rax, %rsi
+; SSE-NEXT: movq (%rdi), %rcx
+; SSE-NEXT: movq 8(%rdi), %r9
+; SSE-NEXT: movq %r9, %r10
+; SSE-NEXT: andq %r8, %r10
; SSE-NEXT: notq %r8
+; SSE-NEXT: movq %rcx, %r11
+; SSE-NEXT: andq %rsi, %r11
; SSE-NEXT: notq %rsi
-; SSE-NEXT: movl %ecx, %r9d
-; SSE-NEXT: andl $96, %r9d
-; SSE-NEXT: shrl $3, %r9d
-; SSE-NEXT: movl (%rdi,%r9), %r9d
-; SSE-NEXT: btl %ecx, %r9d
-; SSE-NEXT: jb .LBB22_2
+; SSE-NEXT: andq %r9, %r8
+; SSE-NEXT: andq %rcx, %rsi
+; SSE-NEXT: orq %r10, %r11
+; SSE-NEXT: jne .LBB22_2
; SSE-NEXT: # %bb.1:
; SSE-NEXT: movl (%rdx), %eax
; SSE-NEXT: .LBB22_2:
-; SSE-NEXT: andq %r8, 8(%rdi)
-; SSE-NEXT: andq %rsi, (%rdi)
+; SSE-NEXT: movq %rsi, (%rdi)
+; SSE-NEXT: movq %r8, 8(%rdi)
; SSE-NEXT: # kill: def $eax killed $eax killed $rax
; SSE-NEXT: retq
;
-; AVX2-LABEL: reset_multiload_i128:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: movl $1, %r8d
-; AVX2-NEXT: xorl %esi, %esi
-; AVX2-NEXT: shldq %cl, %r8, %rsi
-; AVX2-NEXT: shlxq %rcx, %r8, %r8
-; AVX2-NEXT: testb $64, %cl
-; AVX2-NEXT: cmovneq %r8, %rsi
-; AVX2-NEXT: cmovneq %rax, %r8
-; AVX2-NEXT: notq %rsi
-; AVX2-NEXT: notq %r8
-; AVX2-NEXT: movl %ecx, %r9d
-; AVX2-NEXT: andl $96, %r9d
-; AVX2-NEXT: shrl $3, %r9d
-; AVX2-NEXT: movl (%rdi,%r9), %r9d
-; AVX2-NEXT: btl %ecx, %r9d
-; AVX2-NEXT: jb .LBB22_2
-; AVX2-NEXT: # %bb.1:
-; AVX2-NEXT: movl (%rdx), %eax
-; AVX2-NEXT: .LBB22_2:
-; AVX2-NEXT: andq %rsi, 8(%rdi)
-; AVX2-NEXT: andq %r8, (%rdi)
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: reset_multiload_i128:
-; AVX512: # %bb.0:
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: movl $1, %r8d
-; AVX512-NEXT: xorl %esi, %esi
-; AVX512-NEXT: shldq %cl, %r8, %rsi
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: shlxq %rcx, %r8, %r8
-; AVX512-NEXT: testb $64, %cl
-; AVX512-NEXT: cmovneq %r8, %rsi
-; AVX512-NEXT: cmovneq %rax, %r8
-; AVX512-NEXT: notq %rsi
-; AVX512-NEXT: notq %r8
-; AVX512-NEXT: movl %ecx, %r9d
-; AVX512-NEXT: andl $96, %r9d
-; AVX512-NEXT: shrl $3, %r9d
-; AVX512-NEXT: movl (%rdi,%r9), %r9d
-; AVX512-NEXT: btl %ecx, %r9d
-; AVX512-NEXT: jb .LBB22_2
-; AVX512-NEXT: # %bb.1:
-; AVX512-NEXT: movl (%rdx), %eax
-; AVX512-NEXT: .LBB22_2:
-; AVX512-NEXT: andq %rsi, 8(%rdi)
-; AVX512-NEXT: andq %r8, (%rdi)
-; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX512-NEXT: retq
+; AVX-LABEL: reset_multiload_i128:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %esi, %ecx
+; AVX-NEXT: movl $1, %esi
+; AVX-NEXT: xorl %r8d, %r8d
+; AVX-NEXT: shldq %cl, %rsi, %r8
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: shlxq %rcx, %rsi, %r9
+; AVX-NEXT: testb $64, %cl
+; AVX-NEXT: cmovneq %r9, %r8
+; AVX-NEXT: cmovneq %rax, %r9
+; AVX-NEXT: movq (%rdi), %r10
+; AVX-NEXT: movq 8(%rdi), %r11
+; AVX-NEXT: andnq %r11, %r8, %rcx
+; AVX-NEXT: andq %r8, %r11
+; AVX-NEXT: andnq %r10, %r9, %rsi
+; AVX-NEXT: andq %r9, %r10
+; AVX-NEXT: orq %r11, %r10
+; AVX-NEXT: jne .LBB22_2
+; AVX-NEXT: # %bb.1:
+; AVX-NEXT: movl (%rdx), %eax
+; AVX-NEXT: .LBB22_2:
+; AVX-NEXT: movq %rsi, (%rdi)
+; AVX-NEXT: movq %rcx, 8(%rdi)
+; AVX-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
diff --git a/llvm/test/DebugInfo/PDB/Native/pdb-native-index-overflow.test b/llvm/test/DebugInfo/PDB/Native/pdb-native-index-overflow.test
new file mode 100755
index 0000000..aa3f6dc
--- /dev/null
+++ b/llvm/test/DebugInfo/PDB/Native/pdb-native-index-overflow.test
@@ -0,0 +1,13 @@
+; Test that the native PDB reader isn't crashed by index value bigger than
+; number of types in TPI or IPI stream
+; RUN: llvm-pdbutil dump %p/../Inputs/empty.pdb --type-index=20000000\
+; RUN: | FileCheck -check-prefixes=TYPES,NOT_FOUND %s
+; RUN: llvm-pdbutil dump %p/../Inputs/empty.pdb --id-index=20000000\
+; RUN: | FileCheck -check-prefixes=IDS,NOT_FOUND %s
+
+TYPES: Types (TPI Stream)
+IDS: Types (IPI Stream)
+NOT_FOUND:============================================================
+NOT_FOUND: Showing 1 records.
+NOT_FOUND: Type 0x1312D00 doesn't exist in TPI stream
+
diff --git a/llvm/test/Transforms/InstCombine/vec_extract_var_elt-inseltpoison.ll b/llvm/test/Transforms/InstCombine/vec_extract_var_elt-inseltpoison.ll
deleted file mode 100644
index 9fcac80..0000000
--- a/llvm/test/Transforms/InstCombine/vec_extract_var_elt-inseltpoison.ll
+++ /dev/null
@@ -1,26 +0,0 @@
-; RUN: opt < %s -passes=instcombine -S | FileCheck %s
-
-define void @test (float %b, ptr %p) {
-; CHECK: extractelement
-; CHECK: fptosi
- %1 = load <8 x float> , ptr %p
- %2 = bitcast <8 x float> %1 to <8 x i32>
- %3 = bitcast <8 x i32> %2 to <8 x float>
- %a = fptosi <8 x float> %3 to <8 x i32>
- %4 = fptosi float %b to i32
- %5 = add i32 %4, -2
- %6 = extractelement <8 x i32> %a, i32 %5
- %7 = insertelement <8 x i32> poison, i32 %6, i32 7
- %8 = sitofp <8 x i32> %7 to <8 x float>
- store <8 x float> %8, ptr %p
- ret void
-}
-
-; PR18600
-define i32 @test2(i32 %i) {
- %e = extractelement <4 x i32> bitcast (<2 x i64> <i64 1, i64 2> to <4 x i32>), i32 %i
- ret i32 %e
-
-; CHECK-LABEL: @test2
-; CHECK: extractelement
-}
diff --git a/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll b/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll
index 32bf4da..205b4b8 100644
--- a/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll
+++ b/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll
@@ -1,26 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt < %s -passes=instcombine -S | FileCheck %s
-define void @test (float %b, ptr %p) {
-; CHECK: extractelement
-; CHECK: fptosi
- %1 = load <8 x float> , ptr %p
+define void @test_poison(float %b, ptr %p) {
+; CHECK-LABEL: define void @test_poison(
+; CHECK-SAME: float [[B:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[P]], align 32
+; CHECK-NEXT: [[TMP2:%.*]] = fptosi float [[B]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], -2
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = fptosi float [[TMP4]] to i32
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> poison, i32 [[TMP5]], i64 7
+; CHECK-NEXT: [[TMP7:%.*]] = sitofp <8 x i32> [[TMP6]] to <8 x float>
+; CHECK-NEXT: store <8 x float> [[TMP7]], ptr [[P]], align 32
+; CHECK-NEXT: ret void
+;
+ %1 = load <8 x float>, ptr %p
%2 = bitcast <8 x float> %1 to <8 x i32>
%3 = bitcast <8 x i32> %2 to <8 x float>
%a = fptosi <8 x float> %3 to <8 x i32>
%4 = fptosi float %b to i32
%5 = add i32 %4, -2
%6 = extractelement <8 x i32> %a, i32 %5
- %7 = insertelement <8 x i32> undef, i32 %6, i32 7
+ %7 = insertelement <8 x i32> poison, i32 %6, i32 7
%8 = sitofp <8 x i32> %7 to <8 x float>
store <8 x float> %8, ptr %p
- ret void
+ ret void
}
; PR18600
-define i32 @test2(i32 %i) {
+define i32 @test_bitcast(i32 %i) {
+; CHECK-LABEL: define i32 @test_bitcast(
+; CHECK-SAME: i32 [[I:%.*]]) {
+; CHECK-NEXT: [[E:%.*]] = extractelement <4 x i32> <i32 1, i32 0, i32 2, i32 0>, i32 [[I]]
+; CHECK-NEXT: ret i32 [[E]]
+;
%e = extractelement <4 x i32> bitcast (<2 x i64> <i64 1, i64 2> to <4 x i32>), i32 %i
ret i32 %e
+}
+
+declare void @use(i32)
-; CHECK-LABEL: @test2
-; CHECK: extractelement
+define void @test_loop(<4 x float> %in) {
+; CHECK-LABEL: define void @test_loop(
+; CHECK-SAME: <4 x float> [[IN:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[R:%.*]] = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> [[IN]], i32 9)
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[NEXT:%.*]], %[[LATCH:.*]] ]
+; CHECK-NEXT: [[COND:%.*]] = icmp samesign ult i32 [[I]], 4
+; CHECK-NEXT: br i1 [[COND]], label %[[BODY:.*]], label %[[DONE:.*]]
+; CHECK: [[BODY]]:
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[R]], i32 [[I]]
+; CHECK-NEXT: [[ELEM:%.*]] = fptosi float [[TMP0]] to i32
+; CHECK-NEXT: call void @use(i32 [[ELEM]])
+; CHECK-NEXT: br label %[[LATCH]]
+; CHECK: [[LATCH]]:
+; CHECK-NEXT: [[NEXT]] = add nuw nsw i32 [[I]], 1
+; CHECK-NEXT: br label %[[LOOP]]
+; CHECK: [[DONE]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ %r = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %in, i32 9)
+ %vi = fptosi <4 x float> %r to <4 x i32>
+ br label %loop
+loop:
+ %i = phi i32 [ 0, %entry ], [ %next, %latch ]
+ %cond = icmp ult i32 %i, 4
+ br i1 %cond, label %body, label %done
+body:
+ %elem = extractelement <4 x i32> %vi, i32 %i
+ call void @use(i32 %elem)
+ br label %latch
+latch:
+ %next = add i32 %i, 1
+ br label %loop
+done:
+ ret void
}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/div-like-mixed-with-undefs.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/div-like-mixed-with-undefs.ll
index d16843c..6629b12 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/div-like-mixed-with-undefs.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/div-like-mixed-with-undefs.ll
@@ -1,21 +1,21 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-100 -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
-define ptr @test(ptr %d) {
+define ptr @test(ptr %d, i64 %v) {
; CHECK-LABEL: define ptr @test(
-; CHECK-SAME: ptr [[D:%.*]]) {
+; CHECK-SAME: ptr [[D:%.*]], i64 [[V:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr null, align 1
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[D]], align 1
; CHECK-NEXT: [[CMP4_2:%.*]] = icmp eq i8 [[TMP0]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[CMP4_2]], i64 0, i64 0
-; CHECK-NEXT: [[TMP2:%.*]] = xor i64 0, 0
-; CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = udiv i64 1, 0
+; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[CMP4_2]], i64 0, i64 4
+; CHECK-NEXT: [[TMP2:%.*]] = xor i64 0, [[V]]
+; CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[TMP2]], 3
+; CHECK-NEXT: [[TMP4:%.*]] = udiv i64 1, [[V]]
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <6 x i64> poison, i64 [[TMP1]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <6 x i64> [[TMP5]], i64 [[TMP3]], i32 1
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <6 x i64> [[TMP6]], i64 [[TMP4]], i32 4
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <6 x i64> [[TMP7]], <6 x i64> poison, <6 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 4>
-; CHECK-NEXT: [[TMP9:%.*]] = mul <6 x i64> [[TMP8]], <i64 2, i64 6, i64 1, i64 1, i64 1, i64 0>
+; CHECK-NEXT: [[TMP9:%.*]] = mul <6 x i64> [[TMP8]], <i64 2, i64 6, i64 4, i64 3, i64 5, i64 4>
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <6 x i64> [[TMP9]], i32 0
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[D]], i64 [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <6 x i64> [[TMP9]], i32 1
@@ -31,23 +31,23 @@ define ptr @test(ptr %d) {
; CHECK-NEXT: ret ptr [[TMP20]]
;
entry:
- %0 = load i8, ptr null, align 1
+ %0 = load i8, ptr %d, align 1
%cmp4.2 = icmp eq i8 %0, 0
- %1 = select i1 %cmp4.2, i64 0, i64 0
+ %1 = select i1 %cmp4.2, i64 0, i64 4
%2 = shl i64 %1, 1
%3 = getelementptr i8, ptr %d, i64 %2
- %4 = xor i64 0, 0
- %5 = udiv i64 %4, 0
+ %4 = xor i64 0, %v
+ %5 = udiv i64 %4, 3
%6 = mul i64 %5, 6
%7 = getelementptr i8, ptr %d, i64 %6
- %8 = shl i64 %1, 0
+ %8 = shl i64 %1, 2
%scevgep42 = getelementptr i8, ptr %d, i64 %8
- %9 = mul i64 %5, 1
+ %9 = mul i64 %5, 3
%10 = getelementptr i8, ptr %d, i64 %9
- %11 = udiv i64 1, 0
- %12 = mul i64 %11, 1
+ %11 = udiv i64 1, %v
+ %12 = mul i64 %11, 5
%13 = getelementptr i8, ptr %d, i64 %12
- %14 = mul i64 %11, 0
+ %14 = mul i64 %11, 4
%15 = getelementptr i8, ptr %d, i64 %14
ret ptr %15
}
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/check_empty.ll b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/check_empty.ll
new file mode 100644
index 0000000..bfd216d
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/check_empty.ll
@@ -0,0 +1,29 @@
+; RUN: opt < %s -S | FileCheck %s
+
+; Test whether UTC checks empty lines instead of skipping them.
+define i32 @test(i32 %x) {
+entry:
+ br label %block1
+
+block1:
+ %cmp = icmp eq i32 %x, 0
+ br i1 %cmp, label %block2, label %exit1
+
+block2:
+ br i1 %cmp, label %block3, label %exit2
+
+block3:
+ br i1 %cmp, label %exit3, label %exit4
+
+exit1:
+ ret i32 0
+
+exit2:
+ ret i32 %x
+
+exit3:
+ ret i32 %x
+
+exit4:
+ ret i32 %x
+}
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/check_empty.ll.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/check_empty.ll.expected
new file mode 100644
index 0000000..c5f822d
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/check_empty.ll.expected
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 7
+; RUN: opt < %s -S | FileCheck %s
+
+; Test whether UTC checks empty lines instead of skipping them.
+define i32 @test(i32 %x) {
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[BLOCK1:.*]]
+; CHECK-EMPTY:
+; CHECK-NEXT: [[BLOCK1]]:
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT: br i1 [[CMP]], label %[[BLOCK2:.*]], label %[[EXIT1:.*]]
+; CHECK-EMPTY:
+; CHECK-NEXT: [[BLOCK2]]:
+; CHECK-NEXT: br i1 [[CMP]], label %[[BLOCK3:.*]], label %[[EXIT2:.*]]
+; CHECK-EMPTY:
+; CHECK-NEXT: [[BLOCK3]]:
+; CHECK-NEXT: br i1 [[CMP]], label %[[EXIT3:.*]], label %[[EXIT4:.*]]
+; CHECK-EMPTY:
+; CHECK-NEXT: [[EXIT1]]:
+; CHECK-NEXT: ret i32 0
+; CHECK-EMPTY:
+; CHECK-NEXT: [[EXIT2]]:
+; CHECK-NEXT: ret i32 [[X]]
+; CHECK-EMPTY:
+; CHECK-NEXT: [[EXIT3]]:
+; CHECK-NEXT: ret i32 [[X]]
+; CHECK-EMPTY:
+; CHECK-NEXT: [[EXIT4]]:
+; CHECK-NEXT: ret i32 [[X]]
+;
+entry:
+ br label %block1
+
+block1:
+ %cmp = icmp eq i32 %x, 0
+ br i1 %cmp, label %block2, label %exit1
+
+block2:
+ br i1 %cmp, label %block3, label %exit2
+
+block3:
+ br i1 %cmp, label %exit3, label %exit4
+
+exit1:
+ ret i32 0
+
+exit2:
+ ret i32 %x
+
+exit3:
+ ret i32 %x
+
+exit4:
+ ret i32 %x
+}
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll.expected
index b1977e7..8cab0bb 100644
--- a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll.expected
@@ -12,13 +12,17 @@ define i8 @testi8(i8 %x) {
; CHECK-NEXT: i8 2, label %[[CASE3:.*]]
; CHECK-NEXT: i8 3, label %[[CASE3]]
; CHECK-NEXT: ]
-; CHECK: [[DEFAULT]]:
+; CHECK-EMPTY:
+; CHECK-NEXT: [[DEFAULT]]:
; CHECK-NEXT: ret i8 0
-; CHECK: [[CASE1]]:
+; CHECK-EMPTY:
+; CHECK-NEXT: [[CASE1]]:
; CHECK-NEXT: ret i8 1
-; CHECK: [[CASE2]]:
+; CHECK-EMPTY:
+; CHECK-NEXT: [[CASE2]]:
; CHECK-NEXT: ret i8 2
-; CHECK: [[CASE3]]:
+; CHECK-EMPTY:
+; CHECK-NEXT: [[CASE3]]:
; CHECK-NEXT: ret i8 3
;
switch i8 %x, label %default [
@@ -46,13 +50,17 @@ define i32 @testi32(i32 %x) {
; CHECK-NEXT: i32 2, label %[[CASE3:.*]]
; CHECK-NEXT: i32 3, label %[[CASE3]]
; CHECK-NEXT: ]
-; CHECK: [[DEFAULT]]:
+; CHECK-EMPTY:
+; CHECK-NEXT: [[DEFAULT]]:
; CHECK-NEXT: ret i32 0
-; CHECK: [[CASE1]]:
+; CHECK-EMPTY:
+; CHECK-NEXT: [[CASE1]]:
; CHECK-NEXT: ret i32 1
-; CHECK: [[CASE2]]:
+; CHECK-EMPTY:
+; CHECK-NEXT: [[CASE2]]:
; CHECK-NEXT: ret i32 2
-; CHECK: [[CASE3]]:
+; CHECK-EMPTY:
+; CHECK-NEXT: [[CASE3]]:
; CHECK-NEXT: ret i32 3
;
switch i32 %x, label %default [
@@ -80,13 +88,17 @@ define i128 @testi128(i128 %x) {
; CHECK-NEXT: i128 2, label %[[CASE3:.*]]
; CHECK-NEXT: i128 3, label %[[CASE3]]
; CHECK-NEXT: ]
-; CHECK: [[DEFAULT]]:
+; CHECK-EMPTY:
+; CHECK-NEXT: [[DEFAULT]]:
; CHECK-NEXT: ret i128 0
-; CHECK: [[CASE1]]:
+; CHECK-EMPTY:
+; CHECK-NEXT: [[CASE1]]:
; CHECK-NEXT: ret i128 1
-; CHECK: [[CASE2]]:
+; CHECK-EMPTY:
+; CHECK-NEXT: [[CASE2]]:
; CHECK-NEXT: ret i128 2
-; CHECK: [[CASE3]]:
+; CHECK-EMPTY:
+; CHECK-NEXT: [[CASE3]]:
; CHECK-NEXT: ret i128 3
;
switch i128 %x, label %default [
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/check_empty.test b/llvm/test/tools/UpdateTestChecks/update_test_checks/check_empty.test
new file mode 100644
index 0000000..670bda2
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/check_empty.test
@@ -0,0 +1,3 @@
+## test whether the UTC generates CHECK-EMPTY for blank lines
+# RUN: cp -f %S/Inputs/check_empty.ll %t.ll && %update_test_checks %t.ll --version 7
+# RUN: diff -u %t.ll %S/Inputs/check_empty.ll.expected
diff --git a/llvm/test/tools/llvm-config/paths.test b/llvm/test/tools/llvm-config/paths.test
index 419f155..61d86f7 100644
--- a/llvm/test/tools/llvm-config/paths.test
+++ b/llvm/test/tools/llvm-config/paths.test
@@ -4,18 +4,34 @@ RUN: llvm-config --bindir 2>&1 | FileCheck --check-prefix=CHECK-BINDIR %s
CHECK-BINDIR: {{.*}}{{/|\\}}bin
CHECK-BINDIR-NOT: error:
CHECK-BINDIR-NOT: warning
+RUN: llvm-config --bindir --quote-paths 2>&1 | FileCheck --check-prefix=CHECK-BINDIR2 %s
+CHECK-BINDIR2: {{.*}}{{/|\\\\}}bin
+CHECK-BINDIR2-NOT: error:
+CHECK-BINDIR2-NOT: warning
RUN: llvm-config --includedir 2>&1 | FileCheck --check-prefix=CHECK-INCLUDEDIR %s
CHECK-INCLUDEDIR: {{.*}}{{/|\\}}include
CHECK-INCLUDEDIR-NOT: error:
CHECK-INCLUDEDIR-NOT: warning
+RUN: llvm-config --includedir --quote-paths 2>&1 | FileCheck --check-prefix=CHECK-INCLUDEDIR2 %s
+CHECK-INCLUDEDIR2: {{.*}}{{/|\\\\}}include
+CHECK-INCLUDEDIR2-NOT: error:
+CHECK-INCLUDEDIR2-NOT: warning
RUN: llvm-config --libdir 2>&1 | FileCheck --check-prefix=CHECK-LIBDIR %s
CHECK-LIBDIR: {{.*}}{{/|\\}}lib{{.*}}
CHECK-LIBDIR-NOT: error:
CHECK-LIBDIR-NOT: warning
+RUN: llvm-config --libdir --quote-paths 2>&1 | FileCheck --check-prefix=CHECK-LIBDIR2 %s
+CHECK-LIBDIR2: {{.*}}{{/|\\\\}}lib{{.*}}
+CHECK-LIBDIR2-NOT: error:
+CHECK-LIBDIR2-NOT: warning
RUN: llvm-config --cmakedir 2>&1 | FileCheck --check-prefix=CHECK-CMAKEDIR %s
CHECK-CMAKEDIR: {{.*}}{{/|\\}}cmake{{/|\\}}llvm
CHECK-CMAKEDIR-NOT: error:
CHECK-CMAKEDIR-NOT: warning
+RUN: llvm-config --cmakedir --quote-paths 2>&1 | FileCheck --check-prefix=CHECK-CMAKEDIR2 %s
+CHECK-CMAKEDIR2: {{.*}}{{/|\\\\}}cmake{{/|\\\\}}llvm
+CHECK-CMAKEDIR2-NOT: error:
+CHECK-CMAKEDIR2-NOT: warning
diff --git a/llvm/tools/llvm-config/llvm-config.cpp b/llvm/tools/llvm-config/llvm-config.cpp
index 020b1b5..5300c5c 100644
--- a/llvm/tools/llvm-config/llvm-config.cpp
+++ b/llvm/tools/llvm-config/llvm-config.cpp
@@ -24,6 +24,7 @@
#include "llvm/Config/config.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
+#include "llvm/Support/Program.h"
#include "llvm/Support/WithColor.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/TargetParser/Triple.h"
@@ -232,6 +233,7 @@ Options:\n\
--link-static Link the component libraries statically.\n\
--obj-root Print the object root used to build LLVM.\n\
--prefix Print the installation prefix.\n\
+ --quote-paths Quote and escape paths when needed.\n\
--shared-mode Print how the provided components can be collectively linked (`shared` or `static`).\n\
--system-libs System Libraries needed to link against LLVM components.\n\
--targets-built List of all targets currently built.\n\
@@ -324,7 +326,7 @@ int main(int argc, char **argv) {
// information.
std::string ActivePrefix, ActiveBinDir, ActiveIncludeDir, ActiveLibDir,
ActiveCMakeDir;
- std::string ActiveIncludeOption;
+ std::vector<std::string> ActiveIncludeOptions;
if (IsInDevelopmentTree) {
ActiveIncludeDir = std::string(LLVM_SRC_ROOT) + "/include";
ActivePrefix = CurrentExecPrefix;
@@ -350,8 +352,8 @@ int main(int argc, char **argv) {
}
// We need to include files from both the source and object trees.
- ActiveIncludeOption =
- ("-I" + ActiveIncludeDir + " " + "-I" + ActiveObjRoot + "/include");
+ ActiveIncludeOptions.push_back(ActiveIncludeDir);
+ ActiveIncludeOptions.push_back(ActiveObjRoot + "/include");
} else {
ActivePrefix = CurrentExecPrefix;
{
@@ -370,7 +372,7 @@ int main(int argc, char **argv) {
sys::path::make_absolute(ActivePrefix, Path);
ActiveCMakeDir = std::string(Path);
}
- ActiveIncludeOption = "-I" + ActiveIncludeDir;
+ ActiveIncludeOptions.push_back(ActiveIncludeDir);
}
/// We only use `shared library` mode in cases where the static library form
@@ -399,7 +401,9 @@ int main(int argc, char **argv) {
llvm::replace(ActiveBinDir, '/', '\\');
llvm::replace(ActiveLibDir, '/', '\\');
llvm::replace(ActiveCMakeDir, '/', '\\');
- llvm::replace(ActiveIncludeOption, '/', '\\');
+ llvm::replace(ActiveIncludeDir, '/', '\\');
+ for (auto &Include : ActiveIncludeOptions)
+ llvm::replace(Include, '/', '\\');
}
SharedDir = ActiveBinDir;
StaticDir = ActiveLibDir;
@@ -501,6 +505,32 @@ int main(int argc, char **argv) {
};
raw_ostream &OS = outs();
+
+ // Check if we want quoting and escaping.
+ bool QuotePaths = std::any_of(&argv[0], &argv[argc], [](const char *Arg) {
+ return StringRef(Arg) == "--quote-paths";
+ });
+
+ auto MaybePrintQuoted = [&](StringRef Str) {
+ if (QuotePaths)
+ sys::printArg(OS, Str, /*Quote=*/false); // only add quotes if necessary
+ else
+ OS << Str;
+ };
+
+ // Render include paths and associated flags
+ auto RenderFlags = [&](StringRef Flags) {
+ bool First = true;
+ for (auto &Include : ActiveIncludeOptions) {
+ if (!First)
+ OS << ' ';
+ std::string FlagsStr = "-I" + Include;
+ MaybePrintQuoted(FlagsStr);
+ First = false;
+ }
+ OS << ' ' << Flags << '\n';
+ };
+
for (int i = 1; i != argc; ++i) {
StringRef Arg = argv[i];
@@ -509,24 +539,32 @@ int main(int argc, char **argv) {
if (Arg == "--version") {
OS << PACKAGE_VERSION << '\n';
} else if (Arg == "--prefix") {
- OS << ActivePrefix << '\n';
+ MaybePrintQuoted(ActivePrefix);
+ OS << '\n';
} else if (Arg == "--bindir") {
- OS << ActiveBinDir << '\n';
+ MaybePrintQuoted(ActiveBinDir);
+ OS << '\n';
} else if (Arg == "--includedir") {
- OS << ActiveIncludeDir << '\n';
+ MaybePrintQuoted(ActiveIncludeDir);
+ OS << '\n';
} else if (Arg == "--libdir") {
- OS << ActiveLibDir << '\n';
+ MaybePrintQuoted(ActiveLibDir);
+ OS << '\n';
} else if (Arg == "--cmakedir") {
- OS << ActiveCMakeDir << '\n';
+ MaybePrintQuoted(ActiveCMakeDir);
+ OS << '\n';
} else if (Arg == "--cppflags") {
- OS << ActiveIncludeOption << ' ' << LLVM_CPPFLAGS << '\n';
+ RenderFlags(LLVM_CPPFLAGS);
} else if (Arg == "--cflags") {
- OS << ActiveIncludeOption << ' ' << LLVM_CFLAGS << '\n';
+ RenderFlags(LLVM_CFLAGS);
} else if (Arg == "--cxxflags") {
- OS << ActiveIncludeOption << ' ' << LLVM_CXXFLAGS << '\n';
+ RenderFlags(LLVM_CXXFLAGS);
} else if (Arg == "--ldflags") {
- OS << ((HostTriple.isWindowsMSVCEnvironment()) ? "-LIBPATH:" : "-L")
- << ActiveLibDir << ' ' << LLVM_LDFLAGS << '\n';
+ std::string LDFlags =
+ HostTriple.isWindowsMSVCEnvironment() ? "-LIBPATH:" : "-L";
+ LDFlags += ActiveLibDir;
+ MaybePrintQuoted(LDFlags);
+ OS << ' ' << LLVM_LDFLAGS << '\n';
} else if (Arg == "--system-libs") {
PrintSystemLibs = true;
} else if (Arg == "--libs") {
@@ -580,7 +618,8 @@ int main(int argc, char **argv) {
} else if (Arg == "--shared-mode") {
PrintSharedMode = true;
} else if (Arg == "--obj-root") {
- OS << ActivePrefix << '\n';
+ MaybePrintQuoted(ActivePrefix);
+ OS << '\n';
} else if (Arg == "--ignore-libllvm") {
LinkDyLib = false;
LinkMode = BuiltSharedLibs ? LinkModeShared : LinkModeAuto;
@@ -590,6 +629,8 @@ int main(int argc, char **argv) {
LinkMode = LinkModeStatic;
} else if (Arg == "--help") {
usage(false);
+ } else if (Arg == "--quote-paths") {
+ // Was already handled above this loop.
} else {
usage();
}
@@ -682,26 +723,30 @@ int main(int argc, char **argv) {
auto PrintForLib = [&](const StringRef &Lib) {
const bool Shared = LinkMode == LinkModeShared;
+ std::string LibFileName;
if (PrintLibNames) {
- OS << GetComponentLibraryFileName(Lib, Shared);
+ LibFileName = GetComponentLibraryFileName(Lib, Shared);
} else if (PrintLibFiles) {
- OS << GetComponentLibraryPath(Lib, Shared);
+ LibFileName = GetComponentLibraryPath(Lib, Shared);
} else if (PrintLibs) {
// On Windows, output full path to library without parameters.
// Elsewhere, if this is a typical library name, include it using -l.
if (HostTriple.isWindowsMSVCEnvironment()) {
- OS << GetComponentLibraryPath(Lib, Shared);
+ LibFileName = GetComponentLibraryPath(Lib, Shared);
} else {
+ LibFileName = "-l";
StringRef LibName;
if (GetComponentLibraryNameSlice(Lib, LibName)) {
// Extract library name (remove prefix and suffix).
- OS << "-l" << LibName;
+ LibFileName += LibName;
} else {
// Lib is already a library name without prefix and suffix.
- OS << "-l" << Lib;
+ LibFileName += Lib;
}
}
}
+ if (!LibFileName.empty())
+ MaybePrintQuoted(LibFileName);
};
if (LinkMode == LinkModeShared && LinkDyLib)
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.cpp
index 1823a53..ba14d56 100644
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.cpp
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.cpp
@@ -202,7 +202,7 @@ struct CustomMappingTraits<std::map<exegesis::ValidationEvent, int64_t>> {
Io.setError("Key is not a valid validation event");
return;
}
- Io.mapRequired(KeyStr.str().c_str(), VI[*Key]);
+ Io.mapRequired(KeyStr, VI[*Key]);
}
static void output(IO &Io, std::map<exegesis::ValidationEvent, int64_t> &VI) {
diff --git a/llvm/unittests/ADT/ConcurrentHashtableTest.cpp b/llvm/unittests/ADT/ConcurrentHashtableTest.cpp
index ee1ee41..1b82df1 100644
--- a/llvm/unittests/ADT/ConcurrentHashtableTest.cpp
+++ b/llvm/unittests/ADT/ConcurrentHashtableTest.cpp
@@ -21,7 +21,7 @@ using namespace parallel;
namespace {
class String {
public:
- String() {}
+ String() = default;
const std::string &getKey() const { return Data; }
template <typename AllocatorTy>
diff --git a/llvm/unittests/ADT/DirectedGraphTest.cpp b/llvm/unittests/ADT/DirectedGraphTest.cpp
index 49ccf06..82a631b 100644
--- a/llvm/unittests/ADT/DirectedGraphTest.cpp
+++ b/llvm/unittests/ADT/DirectedGraphTest.cpp
@@ -43,7 +43,7 @@ public:
class DGTestGraph : public DGTestBase {
public:
DGTestGraph() = default;
- ~DGTestGraph(){};
+ ~DGTestGraph() = default;
};
using EdgeListTy = SmallVector<DGTestEdge *, 2>;
diff --git a/llvm/unittests/ADT/IListTest.cpp b/llvm/unittests/ADT/IListTest.cpp
index 2fdc8e1..984014f 100644
--- a/llvm/unittests/ADT/IListTest.cpp
+++ b/llvm/unittests/ADT/IListTest.cpp
@@ -19,7 +19,7 @@ namespace {
struct Node : ilist_node<Node> {
int Value;
- Node() {}
+ Node() = default;
Node(int Value) : Value(Value) {}
Node(const Node&) = default;
~Node() { Value = -1; }
diff --git a/llvm/unittests/ADT/SmallVectorTest.cpp b/llvm/unittests/ADT/SmallVectorTest.cpp
index 1a01f30..74fc737 100644
--- a/llvm/unittests/ADT/SmallVectorTest.cpp
+++ b/llvm/unittests/ADT/SmallVectorTest.cpp
@@ -159,7 +159,7 @@ int Constructable::numCopyAssignmentCalls;
int Constructable::numMoveAssignmentCalls;
struct NonCopyable {
- NonCopyable() {}
+ NonCopyable() = default;
NonCopyable(NonCopyable &&) {}
NonCopyable &operator=(NonCopyable &&) { return *this; }
private:
diff --git a/llvm/unittests/ADT/StringMapTest.cpp b/llvm/unittests/ADT/StringMapTest.cpp
index 92ae364..1d92de4 100644
--- a/llvm/unittests/ADT/StringMapTest.cpp
+++ b/llvm/unittests/ADT/StringMapTest.cpp
@@ -367,7 +367,7 @@ TEST_F(StringMapTest, NonDefaultConstructable) {
}
struct Immovable {
- Immovable() {}
+ Immovable() = default;
Immovable(Immovable &&) = delete; // will disable the other special members
};
diff --git a/llvm/unittests/ADT/TypeSwitchTest.cpp b/llvm/unittests/ADT/TypeSwitchTest.cpp
index b801228..0a92717 100644
--- a/llvm/unittests/ADT/TypeSwitchTest.cpp
+++ b/llvm/unittests/ADT/TypeSwitchTest.cpp
@@ -167,7 +167,7 @@ TEST(TypeSwitchTest, DefaultNullptr) {
TEST(TypeSwitchTest, DefaultNullptrForPointerLike) {
struct Value {
void *ptr;
- Value(const Value &other) : ptr(other.ptr) {}
+ Value(const Value &other) = default;
Value(std::nullptr_t) : ptr(nullptr) {}
Value() : Value(nullptr) {}
};
diff --git a/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt b/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt
index b06aa25..7b563d7 100644
--- a/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt
+++ b/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt
@@ -26,6 +26,7 @@ add_llvm_unittest(OrcJITTests
IndirectionUtilsTest.cpp
JITTargetMachineBuilderTest.cpp
LazyCallThroughAndReexportsTest.cpp
+ LibraryResolverTest.cpp
LookupAndRecordAddrsTest.cpp
MachOPlatformTest.cpp
MapperJITLinkMemoryManagerTest.cpp
diff --git a/llvm/unittests/ExecutionEngine/Orc/Inputs/A/A_linux.yaml b/llvm/unittests/ExecutionEngine/Orc/Inputs/A/A_linux.yaml
new file mode 100644
index 0000000..afd1d9e
--- /dev/null
+++ b/llvm/unittests/ExecutionEngine/Orc/Inputs/A/A_linux.yaml
@@ -0,0 +1,460 @@
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_DYN
+ Machine: EM_X86_64
+ProgramHeaders:
+ - Type: PT_LOAD
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.property
+ LastSec: .rela.plt
+ Align: 0x1000
+ Offset: 0x0
+ - Type: PT_LOAD
+ Flags: [ PF_X, PF_R ]
+ FirstSec: .init
+ LastSec: .fini
+ VAddr: 0x1000
+ Align: 0x1000
+ Offset: 0x1000
+ - Type: PT_LOAD
+ Flags: [ PF_R ]
+ FirstSec: .rodata
+ LastSec: .eh_frame
+ VAddr: 0x2000
+ Align: 0x1000
+ Offset: 0x2000
+ - Type: PT_LOAD
+ Flags: [ PF_W, PF_R ]
+ FirstSec: .init_array
+ LastSec: .bss
+ VAddr: 0x3E10
+ Align: 0x1000
+ Offset: 0x2E10
+ - Type: PT_DYNAMIC
+ Flags: [ PF_W, PF_R ]
+ FirstSec: .dynamic
+ LastSec: .dynamic
+ VAddr: 0x3E20
+ Align: 0x8
+ Offset: 0x2E20
+ - Type: PT_NOTE
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.property
+ LastSec: .note.gnu.property
+ VAddr: 0x2A8
+ Align: 0x8
+ Offset: 0x2A8
+ - Type: PT_NOTE
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.build-id
+ LastSec: .note.gnu.build-id
+ VAddr: 0x2C8
+ Align: 0x4
+ Offset: 0x2C8
+ - Type: PT_GNU_PROPERTY
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.property
+ LastSec: .note.gnu.property
+ VAddr: 0x2A8
+ Align: 0x8
+ Offset: 0x2A8
+ - Type: PT_GNU_EH_FRAME
+ Flags: [ PF_R ]
+ FirstSec: .eh_frame_hdr
+ LastSec: .eh_frame_hdr
+ VAddr: 0x2010
+ Align: 0x4
+ Offset: 0x2010
+ - Type: PT_GNU_STACK
+ Flags: [ PF_W, PF_R ]
+ Align: 0x10
+ Offset: 0x0
+ - Type: PT_GNU_RELRO
+ Flags: [ PF_R ]
+ FirstSec: .init_array
+ LastSec: .got
+ VAddr: 0x3E10
+ Offset: 0x2E10
+Sections:
+ - Name: .note.gnu.property
+ Type: SHT_NOTE
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2A8
+ AddressAlign: 0x8
+ Notes:
+ - Name: GNU
+ Desc: 020000C0040000000300000000000000
+ Type: NT_GNU_PROPERTY_TYPE_0
+ - Name: .note.gnu.build-id
+ Type: SHT_NOTE
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2C8
+ AddressAlign: 0x4
+ Notes:
+ - Name: GNU
+ Desc: 73604396C95840D5C380A0950F085A778F94EE7C
+ Type: NT_PRPSINFO
+ - Name: .gnu.hash
+ Type: SHT_GNU_HASH
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2F0
+ Link: .dynsym
+ AddressAlign: 0x8
+ Header:
+ SymNdx: 0x6
+ Shift2: 0x6
+ BloomFilter: [ 0x400000080000 ]
+ HashBuckets: [ 0x0, 0x6 ]
+ HashValues: [ 0x7C9DCB93 ]
+ - Name: .dynsym
+ Type: SHT_DYNSYM
+ Flags: [ SHF_ALLOC ]
+ Address: 0x318
+ Link: .dynstr
+ AddressAlign: 0x8
+ - Name: .dynstr
+ Type: SHT_STRTAB
+ Flags: [ SHF_ALLOC ]
+ Address: 0x3C0
+ AddressAlign: 0x1
+ - Name: .gnu.version
+ Type: SHT_GNU_versym
+ Flags: [ SHF_ALLOC ]
+ Address: 0x436
+ Link: .dynsym
+ AddressAlign: 0x2
+ Entries: [ 0, 1, 2, 1, 1, 2, 1 ]
+ - Name: .gnu.version_r
+ Type: SHT_GNU_verneed
+ Flags: [ SHF_ALLOC ]
+ Address: 0x448
+ Link: .dynstr
+ AddressAlign: 0x8
+ Dependencies:
+ - Version: 1
+ File: libc.so.6
+ Entries:
+ - Name: GLIBC_2.2.5
+ Hash: 157882997
+ Flags: 0
+ Other: 2
+ - Name: .rela.dyn
+ Type: SHT_RELA
+ Flags: [ SHF_ALLOC ]
+ Address: 0x468
+ Link: .dynsym
+ AddressAlign: 0x8
+ Relocations:
+ - Offset: 0x3E10
+ Type: R_X86_64_RELATIVE
+ Addend: 4368
+ - Offset: 0x3E18
+ Type: R_X86_64_RELATIVE
+ Addend: 4304
+ - Offset: 0x4020
+ Type: R_X86_64_RELATIVE
+ Addend: 16416
+ - Offset: 0x3FE0
+ Symbol: _ITM_deregisterTMCloneTable
+ Type: R_X86_64_GLOB_DAT
+ - Offset: 0x3FE8
+ Symbol: __gmon_start__
+ Type: R_X86_64_GLOB_DAT
+ - Offset: 0x3FF0
+ Symbol: _ITM_registerTMCloneTable
+ Type: R_X86_64_GLOB_DAT
+ - Offset: 0x3FF8
+ Symbol: __cxa_finalize
+ Type: R_X86_64_GLOB_DAT
+ - Name: .rela.plt
+ Type: SHT_RELA
+ Flags: [ SHF_ALLOC, SHF_INFO_LINK ]
+ Address: 0x510
+ Link: .dynsym
+ AddressAlign: 0x8
+ Info: .got.plt
+ Relocations:
+ - Offset: 0x4018
+ Symbol: puts
+ Type: R_X86_64_JUMP_SLOT
+ - Name: .init
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1000
+ AddressAlign: 0x4
+ Offset: 0x1000
+ Content: F30F1EFA4883EC08488B05D92F00004885C07402FFD04883C408C3
+ - Name: .plt
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1020
+ AddressAlign: 0x10
+ EntSize: 0x10
+ Content: FF35E22F0000F2FF25E32F00000F1F00F30F1EFA6800000000F2E9E1FFFFFF90
+ - Name: .plt.got
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1040
+ AddressAlign: 0x10
+ EntSize: 0x10
+ Content: F30F1EFAF2FF25AD2F00000F1F440000
+ - Name: .plt.sec
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1050
+ AddressAlign: 0x10
+ EntSize: 0x10
+ Content: F30F1EFAF2FF25BD2F00000F1F440000
+ - Name: .text
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1060
+ AddressAlign: 0x10
+ Content: 488D3DC12F0000488D05BA2F00004839F87415488B05662F00004885C07409FFE00F1F8000000000C30F1F8000000000488D3D912F0000488D358A2F00004829FE4889F048C1EE3F48C1F8034801C648D1FE7414488B05352F00004885C07408FFE0660F1F440000C30F1F8000000000F30F1EFA803D4D2F000000752B5548833D122F0000004889E5740C488B3D2E2F0000E849FFFFFFE864FFFFFFC605252F0000015DC30F1F00C30F1F8000000000F30F1EFAE977FFFFFFF30F1EFA554889E5488D05D80E00004889C7E820FFFFFF905DC3
+ - Name: .fini
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1134
+ AddressAlign: 0x4
+ Content: F30F1EFA4883EC084883C408C3
+ - Name: .rodata
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2000
+ AddressAlign: 0x1
+ Offset: 0x2000
+ Content: 48656C6C6F2066726F6D204100
+ - Name: .eh_frame_hdr
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2010
+ AddressAlign: 0x4
+ Content: 011B033B2C0000000400000010F0FFFF4800000030F0FFFF7000000040F0FFFF8800000009F1FFFFA0000000
+ - Name: .eh_frame
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2040
+ AddressAlign: 0x8
+ Content: 1400000000000000017A5200017810011B0C070890010000240000001C000000C0EFFFFF20000000000E10460E184A0F0B770880003F1A3A2A332422000000001400000044000000B8EFFFFF100000000000000000000000140000005C000000B0EFFFFF1000000000000000000000001C0000007400000061F0FFFF1A00000000450E108602430D06510C070800000000000000
+ - Name: .init_array
+ Type: SHT_INIT_ARRAY
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x3E10
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Offset: 0x2E10
+ Content: '1011000000000000'
+ - Name: .fini_array
+ Type: SHT_FINI_ARRAY
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x3E18
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Content: D010000000000000
+ - Name: .dynamic
+ Type: SHT_DYNAMIC
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x3E20
+ Link: .dynstr
+ AddressAlign: 0x8
+ Entries:
+ - Tag: DT_NEEDED
+ Value: 0x5F
+ - Tag: DT_INIT
+ Value: 0x1000
+ - Tag: DT_FINI
+ Value: 0x1134
+ - Tag: DT_INIT_ARRAY
+ Value: 0x3E10
+ - Tag: DT_INIT_ARRAYSZ
+ Value: 0x8
+ - Tag: DT_FINI_ARRAY
+ Value: 0x3E18
+ - Tag: DT_FINI_ARRAYSZ
+ Value: 0x8
+ - Tag: DT_GNU_HASH
+ Value: 0x2F0
+ - Tag: DT_STRTAB
+ Value: 0x3C0
+ - Tag: DT_SYMTAB
+ Value: 0x318
+ - Tag: DT_STRSZ
+ Value: 0x75
+ - Tag: DT_SYMENT
+ Value: 0x18
+ - Tag: DT_PLTGOT
+ Value: 0x4000
+ - Tag: DT_PLTRELSZ
+ Value: 0x18
+ - Tag: DT_PLTREL
+ Value: 0x7
+ - Tag: DT_JMPREL
+ Value: 0x510
+ - Tag: DT_RELA
+ Value: 0x468
+ - Tag: DT_RELASZ
+ Value: 0xA8
+ - Tag: DT_RELAENT
+ Value: 0x18
+ - Tag: DT_VERNEED
+ Value: 0x448
+ - Tag: DT_VERNEEDNUM
+ Value: 0x1
+ - Tag: DT_VERSYM
+ Value: 0x436
+ - Tag: DT_RELACOUNT
+ Value: 0x3
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Name: .got
+ Type: SHT_PROGBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x3FE0
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Content: '0000000000000000000000000000000000000000000000000000000000000000'
+ - Name: .got.plt
+ Type: SHT_PROGBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x4000
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Content: '203E000000000000000000000000000000000000000000003010000000000000'
+ - Name: .data
+ Type: SHT_PROGBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x4020
+ AddressAlign: 0x8
+ Content: '2040000000000000'
+ - Name: .bss
+ Type: SHT_NOBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x4028
+ AddressAlign: 0x1
+ Size: 0x8
+ - Name: .comment
+ Type: SHT_PROGBITS
+ Flags: [ SHF_MERGE, SHF_STRINGS ]
+ AddressAlign: 0x1
+ EntSize: 0x1
+ Content: 4743433A20285562756E74752031312E342E302D317562756E7475317E32322E30342E32292031312E342E3000
+Symbols:
+ - Name: crtstuff.c
+ Type: STT_FILE
+ Index: SHN_ABS
+ - Name: deregister_tm_clones
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x1060
+ - Name: register_tm_clones
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x1090
+ - Name: __do_global_dtors_aux
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x10D0
+ - Name: completed.0
+ Type: STT_OBJECT
+ Section: .bss
+ Value: 0x4028
+ Size: 0x1
+ - Name: __do_global_dtors_aux_fini_array_entry
+ Type: STT_OBJECT
+ Section: .fini_array
+ Value: 0x3E18
+ - Name: frame_dummy
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x1110
+ - Name: __frame_dummy_init_array_entry
+ Type: STT_OBJECT
+ Section: .init_array
+ Value: 0x3E10
+ - Name: libA.c
+ Type: STT_FILE
+ Index: SHN_ABS
+ - Name: 'crtstuff.c (1)'
+ Type: STT_FILE
+ Index: SHN_ABS
+ - Name: __FRAME_END__
+ Type: STT_OBJECT
+ Section: .eh_frame
+ Value: 0x20D0
+ - Type: STT_FILE
+ Index: SHN_ABS
+ - Name: _fini
+ Type: STT_FUNC
+ Section: .fini
+ Value: 0x1134
+ - Name: __dso_handle
+ Type: STT_OBJECT
+ Section: .data
+ Value: 0x4020
+ - Name: _DYNAMIC
+ Type: STT_OBJECT
+ Section: .dynamic
+ Value: 0x3E20
+ - Name: __GNU_EH_FRAME_HDR
+ Section: .eh_frame_hdr
+ Value: 0x2010
+ - Name: __TMC_END__
+ Type: STT_OBJECT
+ Section: .data
+ Value: 0x4028
+ - Name: _GLOBAL_OFFSET_TABLE_
+ Type: STT_OBJECT
+ Section: .got.plt
+ Value: 0x4000
+ - Name: _init
+ Type: STT_FUNC
+ Section: .init
+ Value: 0x1000
+ - Name: _ITM_deregisterTMCloneTable
+ Binding: STB_WEAK
+ - Name: 'puts@GLIBC_2.2.5'
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: sayA
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ Value: 0x1119
+ Size: 0x1A
+ - Name: __gmon_start__
+ Binding: STB_WEAK
+ - Name: _ITM_registerTMCloneTable
+ Binding: STB_WEAK
+ - Name: '__cxa_finalize@GLIBC_2.2.5'
+ Type: STT_FUNC
+ Binding: STB_WEAK
+DynamicSymbols:
+ - Name: _ITM_deregisterTMCloneTable
+ Binding: STB_WEAK
+ - Name: puts
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: __gmon_start__
+ Binding: STB_WEAK
+ - Name: _ITM_registerTMCloneTable
+ Binding: STB_WEAK
+ - Name: __cxa_finalize
+ Type: STT_FUNC
+ Binding: STB_WEAK
+ - Name: sayA
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ Value: 0x1119
+ Size: 0x1A
+...
diff --git a/llvm/unittests/ExecutionEngine/Orc/Inputs/A/A_macho.yaml b/llvm/unittests/ExecutionEngine/Orc/Inputs/A/A_macho.yaml
new file mode 100644
index 0000000..2e851a90
--- /dev/null
+++ b/llvm/unittests/ExecutionEngine/Orc/Inputs/A/A_macho.yaml
@@ -0,0 +1,723 @@
+--- !fat-mach-o
+FatHeader:
+ magic: 0xCAFEBABE
+ nfat_arch: 3
+FatArchs:
+ - cputype: 0x1000007
+ cpusubtype: 0x3
+ offset: 0x1000
+ size: 8376
+ align: 12
+ - cputype: 0x100000C
+ cpusubtype: 0x0
+ offset: 0x4000
+ size: 33376
+ align: 14
+ - cputype: 0x100000C
+ cpusubtype: 0x80000002
+ offset: 0x10000
+ size: 33376
+ align: 14
+Slices:
+ - !mach-o
+ FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x1000007
+ cpusubtype: 0x3
+ filetype: 0x6
+ ncmds: 14
+ sizeofcmds: 960
+ flags: 0x100085
+ reserved: 0x0
+ LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 392
+ segname: __TEXT
+ vmaddr: 0
+ vmsize: 4096
+ fileoff: 0
+ filesize: 4096
+ maxprot: 5
+ initprot: 5
+ nsects: 4
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0xF80
+ size: 20
+ offset: 0xF80
+ align: 4
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 554889E5488D3D0F000000B000E8020000005DC3
+ - sectname: __stubs
+ segname: __TEXT
+ addr: 0xF94
+ size: 6
+ offset: 0xF94
+ align: 1
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000408
+ reserved1: 0x0
+ reserved2: 0x6
+ reserved3: 0x0
+ content: FF2566000000
+ - sectname: __cstring
+ segname: __TEXT
+ addr: 0xF9A
+ size: 14
+ offset: 0xF9A
+ align: 0
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x2
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 48656C6C6F2066726F6D20410A00
+ - sectname: __unwind_info
+ segname: __TEXT
+ addr: 0xFA8
+ size: 88
+ offset: 0xFA8
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 010000001C000000000000001C000000000000001C00000002000000800F00004000000040000000940F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000100000000
+ - cmd: LC_SEGMENT_64
+ cmdsize: 152
+ segname: __DATA_CONST
+ vmaddr: 4096
+ vmsize: 4096
+ fileoff: 4096
+ filesize: 4096
+ maxprot: 3
+ initprot: 3
+ nsects: 1
+ flags: 16
+ Sections:
+ - sectname: __got
+ segname: __DATA_CONST
+ addr: 0x1000
+ size: 8
+ offset: 0x1000
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x6
+ reserved1: 0x1
+ reserved2: 0x0
+ reserved3: 0x0
+ content: '0000000000000080'
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __LINKEDIT
+ vmaddr: 8192
+ vmsize: 4096
+ fileoff: 8192
+ filesize: 184
+ maxprot: 1
+ initprot: 1
+ nsects: 0
+ flags: 0
+ - cmd: LC_ID_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 1
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libA.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_DYLD_CHAINED_FIXUPS
+ cmdsize: 16
+ dataoff: 8192
+ datasize: 96
+ - cmd: LC_DYLD_EXPORTS_TRIE
+ cmdsize: 16
+ dataoff: 8288
+ datasize: 24
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 8320
+ nsyms: 2
+ stroff: 8360
+ strsize: 16
+ - cmd: LC_DYSYMTAB
+ cmdsize: 80
+ ilocalsym: 0
+ nlocalsym: 0
+ iextdefsym: 0
+ nextdefsym: 1
+ iundefsym: 1
+ nundefsym: 1
+ tocoff: 0
+ ntoc: 0
+ modtaboff: 0
+ nmodtab: 0
+ extrefsymoff: 0
+ nextrefsyms: 0
+ indirectsymoff: 8352
+ nindirectsyms: 2
+ extreloff: 0
+ nextrel: 0
+ locreloff: 0
+ nlocrel: 0
+ - cmd: LC_UUID
+ cmdsize: 24
+ uuid: ADFFA141-C3EE-37CD-B1E7-906D69F81BCB
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 1
+ minos: 983040
+ sdk: 983552
+ ntools: 1
+ Tools:
+ - tool: 3
+ version: 73074435
+ - cmd: LC_SOURCE_VERSION
+ cmdsize: 16
+ version: 0
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 88539136
+ compatibility_version: 65536
+ Content: '/usr/lib/libSystem.B.dylib'
+ ZeroPadBytes: 6
+ - cmd: LC_FUNCTION_STARTS
+ cmdsize: 16
+ dataoff: 8312
+ datasize: 8
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 8320
+ datasize: 0
+ LinkEditData:
+ ExportTrie:
+ TerminalSize: 0
+ NodeOffset: 0
+ Name: ''
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 13
+ Name: _sayA
+ Flags: 0x0
+ Address: 0xF80
+ Other: 0x0
+ ImportName: ''
+ NameList:
+ - n_strx: 2
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 3968
+ - n_strx: 8
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 256
+ n_value: 0
+ StringTable:
+ - ' '
+ - _sayA
+ - _printf
+ IndirectSymbols: [ 0x1, 0x1 ]
+ FunctionStarts: [ 0xF80 ]
+ ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48,
+ 0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0,
+ 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x10, 0x6, 0x0,
+ 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x70, 0x72,
+ 0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0 ]
+ - !mach-o
+ FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x100000C
+ cpusubtype: 0x0
+ filetype: 0x6
+ ncmds: 15
+ sizeofcmds: 976
+ flags: 0x100085
+ reserved: 0x0
+ LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 392
+ segname: __TEXT
+ vmaddr: 0
+ vmsize: 16384
+ fileoff: 0
+ filesize: 16384
+ maxprot: 5
+ initprot: 5
+ nsects: 4
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0x3F70
+ size: 28
+ offset: 0x3F70
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: FD7BBFA9FD0300910000009000603E9103000094FD7BC1A8C0035FD6
+ - sectname: __stubs
+ segname: __TEXT
+ addr: 0x3F8C
+ size: 12
+ offset: 0x3F8C
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000408
+ reserved1: 0x0
+ reserved2: 0xC
+ reserved3: 0x0
+ content: 100000B0100240F900021FD6
+ - sectname: __cstring
+ segname: __TEXT
+ addr: 0x3F98
+ size: 14
+ offset: 0x3F98
+ align: 0
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x2
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 48656C6C6F2066726F6D20410A00
+ - sectname: __unwind_info
+ segname: __TEXT
+ addr: 0x3FA8
+ size: 88
+ offset: 0x3FA8
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 010000001C000000000000001C000000000000001C00000002000000703F000040000000400000008C3F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000400000000
+ - cmd: LC_SEGMENT_64
+ cmdsize: 152
+ segname: __DATA_CONST
+ vmaddr: 16384
+ vmsize: 16384
+ fileoff: 16384
+ filesize: 16384
+ maxprot: 3
+ initprot: 3
+ nsects: 1
+ flags: 16
+ Sections:
+ - sectname: __got
+ segname: __DATA_CONST
+ addr: 0x4000
+ size: 8
+ offset: 0x4000
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x6
+ reserved1: 0x1
+ reserved2: 0x0
+ reserved3: 0x0
+ content: '0000000000000080'
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __LINKEDIT
+ vmaddr: 32768
+ vmsize: 16384
+ fileoff: 32768
+ filesize: 608
+ maxprot: 1
+ initprot: 1
+ nsects: 0
+ flags: 0
+ - cmd: LC_ID_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 1
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libA.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_DYLD_CHAINED_FIXUPS
+ cmdsize: 16
+ dataoff: 32768
+ datasize: 96
+ - cmd: LC_DYLD_EXPORTS_TRIE
+ cmdsize: 16
+ dataoff: 32864
+ datasize: 24
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 32896
+ nsyms: 2
+ stroff: 32936
+ strsize: 16
+ - cmd: LC_DYSYMTAB
+ cmdsize: 80
+ ilocalsym: 0
+ nlocalsym: 0
+ iextdefsym: 0
+ nextdefsym: 1
+ iundefsym: 1
+ nundefsym: 1
+ tocoff: 0
+ ntoc: 0
+ modtaboff: 0
+ nmodtab: 0
+ extrefsymoff: 0
+ nextrefsyms: 0
+ indirectsymoff: 32928
+ nindirectsyms: 2
+ extreloff: 0
+ nextrel: 0
+ locreloff: 0
+ nlocrel: 0
+ - cmd: LC_UUID
+ cmdsize: 24
+ uuid: C45227E0-C6C0-3137-969B-36AABF9D5487
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 1
+ minos: 983040
+ sdk: 983552
+ ntools: 1
+ Tools:
+ - tool: 3
+ version: 73074435
+ - cmd: LC_SOURCE_VERSION
+ cmdsize: 16
+ version: 0
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 88539136
+ compatibility_version: 65536
+ Content: '/usr/lib/libSystem.B.dylib'
+ ZeroPadBytes: 6
+ - cmd: LC_FUNCTION_STARTS
+ cmdsize: 16
+ dataoff: 32888
+ datasize: 8
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 32896
+ datasize: 0
+ - cmd: LC_CODE_SIGNATURE
+ cmdsize: 16
+ dataoff: 32960
+ datasize: 416
+ LinkEditData:
+ ExportTrie:
+ TerminalSize: 0
+ NodeOffset: 0
+ Name: ''
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 13
+ Name: _sayA
+ Flags: 0x0
+ Address: 0x3F70
+ Other: 0x0
+ ImportName: ''
+ NameList:
+ - n_strx: 2
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 16240
+ - n_strx: 8
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 256
+ n_value: 0
+ StringTable:
+ - ' '
+ - _sayA
+ - _printf
+ IndirectSymbols: [ 0x1, 0x1 ]
+ FunctionStarts: [ 0x3F70 ]
+ ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48,
+ 0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0,
+ 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x40, 0x6, 0x0,
+ 0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x70, 0x72,
+ 0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0 ]
+ - !mach-o
+ FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x100000C
+ cpusubtype: 0x80000002
+ filetype: 0x6
+ ncmds: 15
+ sizeofcmds: 976
+ flags: 0x100085
+ reserved: 0x0
+ LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 392
+ segname: __TEXT
+ vmaddr: 0
+ vmsize: 16384
+ fileoff: 0
+ filesize: 16384
+ maxprot: 5
+ initprot: 5
+ nsects: 4
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0x3F68
+ size: 32
+ offset: 0x3F68
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 7F2303D5FD7BBFA9FD0300910000009000603E9103000094FD7BC1A8FF0F5FD6
+ - sectname: __auth_stubs
+ segname: __TEXT
+ addr: 0x3F88
+ size: 16
+ offset: 0x3F88
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000408
+ reserved1: 0x0
+ reserved2: 0x10
+ reserved3: 0x0
+ content: 110000B031020091300240F9110A1FD7
+ - sectname: __cstring
+ segname: __TEXT
+ addr: 0x3F98
+ size: 14
+ offset: 0x3F98
+ align: 0
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x2
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 48656C6C6F2066726F6D20410A00
+ - sectname: __unwind_info
+ segname: __TEXT
+ addr: 0x3FA8
+ size: 88
+ offset: 0x3FA8
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 010000001C000000000000001C000000000000001C00000002000000683F00004000000040000000883F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000400000000
+ - cmd: LC_SEGMENT_64
+ cmdsize: 152
+ segname: __DATA_CONST
+ vmaddr: 16384
+ vmsize: 16384
+ fileoff: 16384
+ filesize: 16384
+ maxprot: 3
+ initprot: 3
+ nsects: 1
+ flags: 16
+ Sections:
+ - sectname: __auth_got
+ segname: __DATA_CONST
+ addr: 0x4000
+ size: 8
+ offset: 0x4000
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x6
+ reserved1: 0x1
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 00000000000001C0
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __LINKEDIT
+ vmaddr: 32768
+ vmsize: 16384
+ fileoff: 32768
+ filesize: 608
+ maxprot: 1
+ initprot: 1
+ nsects: 0
+ flags: 0
+ - cmd: LC_ID_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 1
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libA.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_DYLD_CHAINED_FIXUPS
+ cmdsize: 16
+ dataoff: 32768
+ datasize: 96
+ - cmd: LC_DYLD_EXPORTS_TRIE
+ cmdsize: 16
+ dataoff: 32864
+ datasize: 24
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 32896
+ nsyms: 2
+ stroff: 32936
+ strsize: 16
+ - cmd: LC_DYSYMTAB
+ cmdsize: 80
+ ilocalsym: 0
+ nlocalsym: 0
+ iextdefsym: 0
+ nextdefsym: 1
+ iundefsym: 1
+ nundefsym: 1
+ tocoff: 0
+ ntoc: 0
+ modtaboff: 0
+ nmodtab: 0
+ extrefsymoff: 0
+ nextrefsyms: 0
+ indirectsymoff: 32928
+ nindirectsyms: 2
+ extreloff: 0
+ nextrel: 0
+ locreloff: 0
+ nlocrel: 0
+ - cmd: LC_UUID
+ cmdsize: 24
+ uuid: C9DC00C2-E721-365C-9C2D-E9FDB7C838BB
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 1
+ minos: 983040
+ sdk: 983552
+ ntools: 1
+ Tools:
+ - tool: 3
+ version: 73074435
+ - cmd: LC_SOURCE_VERSION
+ cmdsize: 16
+ version: 0
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 88539136
+ compatibility_version: 65536
+ Content: '/usr/lib/libSystem.B.dylib'
+ ZeroPadBytes: 6
+ - cmd: LC_FUNCTION_STARTS
+ cmdsize: 16
+ dataoff: 32888
+ datasize: 8
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 32896
+ datasize: 0
+ - cmd: LC_CODE_SIGNATURE
+ cmdsize: 16
+ dataoff: 32960
+ datasize: 416
+ LinkEditData:
+ ExportTrie:
+ TerminalSize: 0
+ NodeOffset: 0
+ Name: ''
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 13
+ Name: _sayA
+ Flags: 0x0
+ Address: 0x3F68
+ Other: 0x0
+ ImportName: ''
+ NameList:
+ - n_strx: 2
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 16232
+ - n_strx: 8
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 256
+ n_value: 0
+ StringTable:
+ - ' '
+ - _sayA
+ - _printf
+ IndirectSymbols: [ 0x1, 0x1 ]
+ FunctionStarts: [ 0x3F68 ]
+ ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48,
+ 0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0,
+ 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x40, 0xC, 0x0,
+ 0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x70, 0x72,
+ 0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0 ]
+...
diff --git a/llvm/unittests/ExecutionEngine/Orc/Inputs/B/B_linux.yaml b/llvm/unittests/ExecutionEngine/Orc/Inputs/B/B_linux.yaml
new file mode 100644
index 0000000..fe4393e
--- /dev/null
+++ b/llvm/unittests/ExecutionEngine/Orc/Inputs/B/B_linux.yaml
@@ -0,0 +1,460 @@
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_DYN
+ Machine: EM_X86_64
+ProgramHeaders:
+ - Type: PT_LOAD
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.property
+ LastSec: .rela.plt
+ Align: 0x1000
+ Offset: 0x0
+ - Type: PT_LOAD
+ Flags: [ PF_X, PF_R ]
+ FirstSec: .init
+ LastSec: .fini
+ VAddr: 0x1000
+ Align: 0x1000
+ Offset: 0x1000
+ - Type: PT_LOAD
+ Flags: [ PF_R ]
+ FirstSec: .rodata
+ LastSec: .eh_frame
+ VAddr: 0x2000
+ Align: 0x1000
+ Offset: 0x2000
+ - Type: PT_LOAD
+ Flags: [ PF_W, PF_R ]
+ FirstSec: .init_array
+ LastSec: .bss
+ VAddr: 0x3E10
+ Align: 0x1000
+ Offset: 0x2E10
+ - Type: PT_DYNAMIC
+ Flags: [ PF_W, PF_R ]
+ FirstSec: .dynamic
+ LastSec: .dynamic
+ VAddr: 0x3E20
+ Align: 0x8
+ Offset: 0x2E20
+ - Type: PT_NOTE
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.property
+ LastSec: .note.gnu.property
+ VAddr: 0x2A8
+ Align: 0x8
+ Offset: 0x2A8
+ - Type: PT_NOTE
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.build-id
+ LastSec: .note.gnu.build-id
+ VAddr: 0x2C8
+ Align: 0x4
+ Offset: 0x2C8
+ - Type: PT_GNU_PROPERTY
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.property
+ LastSec: .note.gnu.property
+ VAddr: 0x2A8
+ Align: 0x8
+ Offset: 0x2A8
+ - Type: PT_GNU_EH_FRAME
+ Flags: [ PF_R ]
+ FirstSec: .eh_frame_hdr
+ LastSec: .eh_frame_hdr
+ VAddr: 0x2010
+ Align: 0x4
+ Offset: 0x2010
+ - Type: PT_GNU_STACK
+ Flags: [ PF_W, PF_R ]
+ Align: 0x10
+ Offset: 0x0
+ - Type: PT_GNU_RELRO
+ Flags: [ PF_R ]
+ FirstSec: .init_array
+ LastSec: .got
+ VAddr: 0x3E10
+ Offset: 0x2E10
+Sections:
+ - Name: .note.gnu.property
+ Type: SHT_NOTE
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2A8
+ AddressAlign: 0x8
+ Notes:
+ - Name: GNU
+ Desc: 020000C0040000000300000000000000
+ Type: NT_GNU_PROPERTY_TYPE_0
+ - Name: .note.gnu.build-id
+ Type: SHT_NOTE
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2C8
+ AddressAlign: 0x4
+ Notes:
+ - Name: GNU
+ Desc: 6337F7C1BF21A1DE17630C55602EB4CAC50435BB
+ Type: NT_PRPSINFO
+ - Name: .gnu.hash
+ Type: SHT_GNU_HASH
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2F0
+ Link: .dynsym
+ AddressAlign: 0x8
+ Header:
+ SymNdx: 0x6
+ Shift2: 0x6
+ BloomFilter: [ 0x400000100000 ]
+ HashBuckets: [ 0x6, 0x0 ]
+ HashValues: [ 0x7C9DCB95 ]
+ - Name: .dynsym
+ Type: SHT_DYNSYM
+ Flags: [ SHF_ALLOC ]
+ Address: 0x318
+ Link: .dynstr
+ AddressAlign: 0x8
+ - Name: .dynstr
+ Type: SHT_STRTAB
+ Flags: [ SHF_ALLOC ]
+ Address: 0x3C0
+ AddressAlign: 0x1
+ - Name: .gnu.version
+ Type: SHT_GNU_versym
+ Flags: [ SHF_ALLOC ]
+ Address: 0x436
+ Link: .dynsym
+ AddressAlign: 0x2
+ Entries: [ 0, 1, 2, 1, 1, 2, 1 ]
+ - Name: .gnu.version_r
+ Type: SHT_GNU_verneed
+ Flags: [ SHF_ALLOC ]
+ Address: 0x448
+ Link: .dynstr
+ AddressAlign: 0x8
+ Dependencies:
+ - Version: 1
+ File: libc.so.6
+ Entries:
+ - Name: GLIBC_2.2.5
+ Hash: 157882997
+ Flags: 0
+ Other: 2
+ - Name: .rela.dyn
+ Type: SHT_RELA
+ Flags: [ SHF_ALLOC ]
+ Address: 0x468
+ Link: .dynsym
+ AddressAlign: 0x8
+ Relocations:
+ - Offset: 0x3E10
+ Type: R_X86_64_RELATIVE
+ Addend: 4368
+ - Offset: 0x3E18
+ Type: R_X86_64_RELATIVE
+ Addend: 4304
+ - Offset: 0x4020
+ Type: R_X86_64_RELATIVE
+ Addend: 16416
+ - Offset: 0x3FE0
+ Symbol: _ITM_deregisterTMCloneTable
+ Type: R_X86_64_GLOB_DAT
+ - Offset: 0x3FE8
+ Symbol: __gmon_start__
+ Type: R_X86_64_GLOB_DAT
+ - Offset: 0x3FF0
+ Symbol: _ITM_registerTMCloneTable
+ Type: R_X86_64_GLOB_DAT
+ - Offset: 0x3FF8
+ Symbol: __cxa_finalize
+ Type: R_X86_64_GLOB_DAT
+ - Name: .rela.plt
+ Type: SHT_RELA
+ Flags: [ SHF_ALLOC, SHF_INFO_LINK ]
+ Address: 0x510
+ Link: .dynsym
+ AddressAlign: 0x8
+ Info: .got.plt
+ Relocations:
+ - Offset: 0x4018
+ Symbol: puts
+ Type: R_X86_64_JUMP_SLOT
+ - Name: .init
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1000
+ AddressAlign: 0x4
+ Offset: 0x1000
+ Content: F30F1EFA4883EC08488B05D92F00004885C07402FFD04883C408C3
+ - Name: .plt
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1020
+ AddressAlign: 0x10
+ EntSize: 0x10
+ Content: FF35E22F0000F2FF25E32F00000F1F00F30F1EFA6800000000F2E9E1FFFFFF90
+ - Name: .plt.got
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1040
+ AddressAlign: 0x10
+ EntSize: 0x10
+ Content: F30F1EFAF2FF25AD2F00000F1F440000
+ - Name: .plt.sec
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1050
+ AddressAlign: 0x10
+ EntSize: 0x10
+ Content: F30F1EFAF2FF25BD2F00000F1F440000
+ - Name: .text
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1060
+ AddressAlign: 0x10
+ Content: 488D3DC12F0000488D05BA2F00004839F87415488B05662F00004885C07409FFE00F1F8000000000C30F1F8000000000488D3D912F0000488D358A2F00004829FE4889F048C1EE3F48C1F8034801C648D1FE7414488B05352F00004885C07408FFE0660F1F440000C30F1F8000000000F30F1EFA803D4D2F000000752B5548833D122F0000004889E5740C488B3D2E2F0000E849FFFFFFE864FFFFFFC605252F0000015DC30F1F00C30F1F8000000000F30F1EFAE977FFFFFFF30F1EFA554889E5488D05D80E00004889C7E820FFFFFF905DC3
+ - Name: .fini
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1134
+ AddressAlign: 0x4
+ Content: F30F1EFA4883EC084883C408C3
+ - Name: .rodata
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2000
+ AddressAlign: 0x1
+ Offset: 0x2000
+ Content: 48656C6C6F2066726F6D204200
+ - Name: .eh_frame_hdr
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2010
+ AddressAlign: 0x4
+ Content: 011B033B2C0000000400000010F0FFFF4800000030F0FFFF7000000040F0FFFF8800000009F1FFFFA0000000
+ - Name: .eh_frame
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2040
+ AddressAlign: 0x8
+ Content: 1400000000000000017A5200017810011B0C070890010000240000001C000000C0EFFFFF20000000000E10460E184A0F0B770880003F1A3A2A332422000000001400000044000000B8EFFFFF100000000000000000000000140000005C000000B0EFFFFF1000000000000000000000001C0000007400000061F0FFFF1A00000000450E108602430D06510C070800000000000000
+ - Name: .init_array
+ Type: SHT_INIT_ARRAY
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x3E10
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Offset: 0x2E10
+ Content: '1011000000000000'
+ - Name: .fini_array
+ Type: SHT_FINI_ARRAY
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x3E18
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Content: D010000000000000
+ - Name: .dynamic
+ Type: SHT_DYNAMIC
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x3E20
+ Link: .dynstr
+ AddressAlign: 0x8
+ Entries:
+ - Tag: DT_NEEDED
+ Value: 0x5F
+ - Tag: DT_INIT
+ Value: 0x1000
+ - Tag: DT_FINI
+ Value: 0x1134
+ - Tag: DT_INIT_ARRAY
+ Value: 0x3E10
+ - Tag: DT_INIT_ARRAYSZ
+ Value: 0x8
+ - Tag: DT_FINI_ARRAY
+ Value: 0x3E18
+ - Tag: DT_FINI_ARRAYSZ
+ Value: 0x8
+ - Tag: DT_GNU_HASH
+ Value: 0x2F0
+ - Tag: DT_STRTAB
+ Value: 0x3C0
+ - Tag: DT_SYMTAB
+ Value: 0x318
+ - Tag: DT_STRSZ
+ Value: 0x75
+ - Tag: DT_SYMENT
+ Value: 0x18
+ - Tag: DT_PLTGOT
+ Value: 0x4000
+ - Tag: DT_PLTRELSZ
+ Value: 0x18
+ - Tag: DT_PLTREL
+ Value: 0x7
+ - Tag: DT_JMPREL
+ Value: 0x510
+ - Tag: DT_RELA
+ Value: 0x468
+ - Tag: DT_RELASZ
+ Value: 0xA8
+ - Tag: DT_RELAENT
+ Value: 0x18
+ - Tag: DT_VERNEED
+ Value: 0x448
+ - Tag: DT_VERNEEDNUM
+ Value: 0x1
+ - Tag: DT_VERSYM
+ Value: 0x436
+ - Tag: DT_RELACOUNT
+ Value: 0x3
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Name: .got
+ Type: SHT_PROGBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x3FE0
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Content: '0000000000000000000000000000000000000000000000000000000000000000'
+ - Name: .got.plt
+ Type: SHT_PROGBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x4000
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Content: '203E000000000000000000000000000000000000000000003010000000000000'
+ - Name: .data
+ Type: SHT_PROGBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x4020
+ AddressAlign: 0x8
+ Content: '2040000000000000'
+ - Name: .bss
+ Type: SHT_NOBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x4028
+ AddressAlign: 0x1
+ Size: 0x8
+ - Name: .comment
+ Type: SHT_PROGBITS
+ Flags: [ SHF_MERGE, SHF_STRINGS ]
+ AddressAlign: 0x1
+ EntSize: 0x1
+ Content: 4743433A20285562756E74752031312E342E302D317562756E7475317E32322E30342E32292031312E342E3000
+Symbols:
+ - Name: crtstuff.c
+ Type: STT_FILE
+ Index: SHN_ABS
+ - Name: deregister_tm_clones
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x1060
+ - Name: register_tm_clones
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x1090
+ - Name: __do_global_dtors_aux
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x10D0
+ - Name: completed.0
+ Type: STT_OBJECT
+ Section: .bss
+ Value: 0x4028
+ Size: 0x1
+ - Name: __do_global_dtors_aux_fini_array_entry
+ Type: STT_OBJECT
+ Section: .fini_array
+ Value: 0x3E18
+ - Name: frame_dummy
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x1110
+ - Name: __frame_dummy_init_array_entry
+ Type: STT_OBJECT
+ Section: .init_array
+ Value: 0x3E10
+ - Name: libB.c
+ Type: STT_FILE
+ Index: SHN_ABS
+ - Name: 'crtstuff.c (1)'
+ Type: STT_FILE
+ Index: SHN_ABS
+ - Name: __FRAME_END__
+ Type: STT_OBJECT
+ Section: .eh_frame
+ Value: 0x20D0
+ - Type: STT_FILE
+ Index: SHN_ABS
+ - Name: _fini
+ Type: STT_FUNC
+ Section: .fini
+ Value: 0x1134
+ - Name: __dso_handle
+ Type: STT_OBJECT
+ Section: .data
+ Value: 0x4020
+ - Name: _DYNAMIC
+ Type: STT_OBJECT
+ Section: .dynamic
+ Value: 0x3E20
+ - Name: __GNU_EH_FRAME_HDR
+ Section: .eh_frame_hdr
+ Value: 0x2010
+ - Name: __TMC_END__
+ Type: STT_OBJECT
+ Section: .data
+ Value: 0x4028
+ - Name: _GLOBAL_OFFSET_TABLE_
+ Type: STT_OBJECT
+ Section: .got.plt
+ Value: 0x4000
+ - Name: _init
+ Type: STT_FUNC
+ Section: .init
+ Value: 0x1000
+ - Name: _ITM_deregisterTMCloneTable
+ Binding: STB_WEAK
+ - Name: 'puts@GLIBC_2.2.5'
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: __gmon_start__
+ Binding: STB_WEAK
+ - Name: sayB
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ Value: 0x1119
+ Size: 0x1A
+ - Name: _ITM_registerTMCloneTable
+ Binding: STB_WEAK
+ - Name: '__cxa_finalize@GLIBC_2.2.5'
+ Type: STT_FUNC
+ Binding: STB_WEAK
+DynamicSymbols:
+ - Name: _ITM_deregisterTMCloneTable
+ Binding: STB_WEAK
+ - Name: puts
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: __gmon_start__
+ Binding: STB_WEAK
+ - Name: _ITM_registerTMCloneTable
+ Binding: STB_WEAK
+ - Name: __cxa_finalize
+ Type: STT_FUNC
+ Binding: STB_WEAK
+ - Name: sayB
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ Value: 0x1119
+ Size: 0x1A
+...
diff --git a/llvm/unittests/ExecutionEngine/Orc/Inputs/B/B_macho.yaml b/llvm/unittests/ExecutionEngine/Orc/Inputs/B/B_macho.yaml
new file mode 100644
index 0000000..3d57c4f
--- /dev/null
+++ b/llvm/unittests/ExecutionEngine/Orc/Inputs/B/B_macho.yaml
@@ -0,0 +1,723 @@
+--- !fat-mach-o
+FatHeader:
+ magic: 0xCAFEBABE
+ nfat_arch: 3
+FatArchs:
+ - cputype: 0x1000007
+ cpusubtype: 0x3
+ offset: 0x1000
+ size: 8376
+ align: 12
+ - cputype: 0x100000C
+ cpusubtype: 0x0
+ offset: 0x4000
+ size: 33376
+ align: 14
+ - cputype: 0x100000C
+ cpusubtype: 0x80000002
+ offset: 0x10000
+ size: 33376
+ align: 14
+Slices:
+ - !mach-o
+ FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x1000007
+ cpusubtype: 0x3
+ filetype: 0x6
+ ncmds: 14
+ sizeofcmds: 960
+ flags: 0x100085
+ reserved: 0x0
+ LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 392
+ segname: __TEXT
+ vmaddr: 0
+ vmsize: 4096
+ fileoff: 0
+ filesize: 4096
+ maxprot: 5
+ initprot: 5
+ nsects: 4
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0xF80
+ size: 20
+ offset: 0xF80
+ align: 4
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 554889E5488D3D0F000000B000E8020000005DC3
+ - sectname: __stubs
+ segname: __TEXT
+ addr: 0xF94
+ size: 6
+ offset: 0xF94
+ align: 1
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000408
+ reserved1: 0x0
+ reserved2: 0x6
+ reserved3: 0x0
+ content: FF2566000000
+ - sectname: __cstring
+ segname: __TEXT
+ addr: 0xF9A
+ size: 14
+ offset: 0xF9A
+ align: 0
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x2
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 48656C6C6F2066726F6D20420A00
+ - sectname: __unwind_info
+ segname: __TEXT
+ addr: 0xFA8
+ size: 88
+ offset: 0xFA8
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 010000001C000000000000001C000000000000001C00000002000000800F00004000000040000000940F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000100000000
+ - cmd: LC_SEGMENT_64
+ cmdsize: 152
+ segname: __DATA_CONST
+ vmaddr: 4096
+ vmsize: 4096
+ fileoff: 4096
+ filesize: 4096
+ maxprot: 3
+ initprot: 3
+ nsects: 1
+ flags: 16
+ Sections:
+ - sectname: __got
+ segname: __DATA_CONST
+ addr: 0x1000
+ size: 8
+ offset: 0x1000
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x6
+ reserved1: 0x1
+ reserved2: 0x0
+ reserved3: 0x0
+ content: '0000000000000080'
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __LINKEDIT
+ vmaddr: 8192
+ vmsize: 4096
+ fileoff: 8192
+ filesize: 184
+ maxprot: 1
+ initprot: 1
+ nsects: 0
+ flags: 0
+ - cmd: LC_ID_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 1
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libB.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_DYLD_CHAINED_FIXUPS
+ cmdsize: 16
+ dataoff: 8192
+ datasize: 96
+ - cmd: LC_DYLD_EXPORTS_TRIE
+ cmdsize: 16
+ dataoff: 8288
+ datasize: 24
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 8320
+ nsyms: 2
+ stroff: 8360
+ strsize: 16
+ - cmd: LC_DYSYMTAB
+ cmdsize: 80
+ ilocalsym: 0
+ nlocalsym: 0
+ iextdefsym: 0
+ nextdefsym: 1
+ iundefsym: 1
+ nundefsym: 1
+ tocoff: 0
+ ntoc: 0
+ modtaboff: 0
+ nmodtab: 0
+ extrefsymoff: 0
+ nextrefsyms: 0
+ indirectsymoff: 8352
+ nindirectsyms: 2
+ extreloff: 0
+ nextrel: 0
+ locreloff: 0
+ nlocrel: 0
+ - cmd: LC_UUID
+ cmdsize: 24
+ uuid: 88B60B3C-13D3-3D7E-AEED-5F3E991FDF08
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 1
+ minos: 983040
+ sdk: 983552
+ ntools: 1
+ Tools:
+ - tool: 3
+ version: 73074435
+ - cmd: LC_SOURCE_VERSION
+ cmdsize: 16
+ version: 0
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 88539136
+ compatibility_version: 65536
+ Content: '/usr/lib/libSystem.B.dylib'
+ ZeroPadBytes: 6
+ - cmd: LC_FUNCTION_STARTS
+ cmdsize: 16
+ dataoff: 8312
+ datasize: 8
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 8320
+ datasize: 0
+ LinkEditData:
+ ExportTrie:
+ TerminalSize: 0
+ NodeOffset: 0
+ Name: ''
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 13
+ Name: _sayB
+ Flags: 0x0
+ Address: 0xF80
+ Other: 0x0
+ ImportName: ''
+ NameList:
+ - n_strx: 2
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 3968
+ - n_strx: 8
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 256
+ n_value: 0
+ StringTable:
+ - ' '
+ - _sayB
+ - _printf
+ IndirectSymbols: [ 0x1, 0x1 ]
+ FunctionStarts: [ 0xF80 ]
+ ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48,
+ 0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0,
+ 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x10, 0x6, 0x0,
+ 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x70, 0x72,
+ 0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0 ]
+ - !mach-o
+ FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x100000C
+ cpusubtype: 0x0
+ filetype: 0x6
+ ncmds: 15
+ sizeofcmds: 976
+ flags: 0x100085
+ reserved: 0x0
+ LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 392
+ segname: __TEXT
+ vmaddr: 0
+ vmsize: 16384
+ fileoff: 0
+ filesize: 16384
+ maxprot: 5
+ initprot: 5
+ nsects: 4
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0x3F70
+ size: 28
+ offset: 0x3F70
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: FD7BBFA9FD0300910000009000603E9103000094FD7BC1A8C0035FD6
+ - sectname: __stubs
+ segname: __TEXT
+ addr: 0x3F8C
+ size: 12
+ offset: 0x3F8C
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000408
+ reserved1: 0x0
+ reserved2: 0xC
+ reserved3: 0x0
+ content: 100000B0100240F900021FD6
+ - sectname: __cstring
+ segname: __TEXT
+ addr: 0x3F98
+ size: 14
+ offset: 0x3F98
+ align: 0
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x2
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 48656C6C6F2066726F6D20420A00
+ - sectname: __unwind_info
+ segname: __TEXT
+ addr: 0x3FA8
+ size: 88
+ offset: 0x3FA8
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 010000001C000000000000001C000000000000001C00000002000000703F000040000000400000008C3F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000400000000
+ - cmd: LC_SEGMENT_64
+ cmdsize: 152
+ segname: __DATA_CONST
+ vmaddr: 16384
+ vmsize: 16384
+ fileoff: 16384
+ filesize: 16384
+ maxprot: 3
+ initprot: 3
+ nsects: 1
+ flags: 16
+ Sections:
+ - sectname: __got
+ segname: __DATA_CONST
+ addr: 0x4000
+ size: 8
+ offset: 0x4000
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x6
+ reserved1: 0x1
+ reserved2: 0x0
+ reserved3: 0x0
+ content: '0000000000000080'
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __LINKEDIT
+ vmaddr: 32768
+ vmsize: 16384
+ fileoff: 32768
+ filesize: 608
+ maxprot: 1
+ initprot: 1
+ nsects: 0
+ flags: 0
+ - cmd: LC_ID_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 1
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libB.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_DYLD_CHAINED_FIXUPS
+ cmdsize: 16
+ dataoff: 32768
+ datasize: 96
+ - cmd: LC_DYLD_EXPORTS_TRIE
+ cmdsize: 16
+ dataoff: 32864
+ datasize: 24
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 32896
+ nsyms: 2
+ stroff: 32936
+ strsize: 16
+ - cmd: LC_DYSYMTAB
+ cmdsize: 80
+ ilocalsym: 0
+ nlocalsym: 0
+ iextdefsym: 0
+ nextdefsym: 1
+ iundefsym: 1
+ nundefsym: 1
+ tocoff: 0
+ ntoc: 0
+ modtaboff: 0
+ nmodtab: 0
+ extrefsymoff: 0
+ nextrefsyms: 0
+ indirectsymoff: 32928
+ nindirectsyms: 2
+ extreloff: 0
+ nextrel: 0
+ locreloff: 0
+ nlocrel: 0
+ - cmd: LC_UUID
+ cmdsize: 24
+ uuid: 90C3787A-22E1-35AE-9284-97A4842F88AF
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 1
+ minos: 983040
+ sdk: 983552
+ ntools: 1
+ Tools:
+ - tool: 3
+ version: 73074435
+ - cmd: LC_SOURCE_VERSION
+ cmdsize: 16
+ version: 0
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 88539136
+ compatibility_version: 65536
+ Content: '/usr/lib/libSystem.B.dylib'
+ ZeroPadBytes: 6
+ - cmd: LC_FUNCTION_STARTS
+ cmdsize: 16
+ dataoff: 32888
+ datasize: 8
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 32896
+ datasize: 0
+ - cmd: LC_CODE_SIGNATURE
+ cmdsize: 16
+ dataoff: 32960
+ datasize: 416
+ LinkEditData:
+ ExportTrie:
+ TerminalSize: 0
+ NodeOffset: 0
+ Name: ''
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 13
+ Name: _sayB
+ Flags: 0x0
+ Address: 0x3F70
+ Other: 0x0
+ ImportName: ''
+ NameList:
+ - n_strx: 2
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 16240
+ - n_strx: 8
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 256
+ n_value: 0
+ StringTable:
+ - ' '
+ - _sayB
+ - _printf
+ IndirectSymbols: [ 0x1, 0x1 ]
+ FunctionStarts: [ 0x3F70 ]
+ ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48,
+ 0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0,
+ 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x40, 0x6, 0x0,
+ 0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x70, 0x72,
+ 0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0 ]
+ - !mach-o
+ FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x100000C
+ cpusubtype: 0x80000002
+ filetype: 0x6
+ ncmds: 15
+ sizeofcmds: 976
+ flags: 0x100085
+ reserved: 0x0
+ LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 392
+ segname: __TEXT
+ vmaddr: 0
+ vmsize: 16384
+ fileoff: 0
+ filesize: 16384
+ maxprot: 5
+ initprot: 5
+ nsects: 4
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0x3F68
+ size: 32
+ offset: 0x3F68
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 7F2303D5FD7BBFA9FD0300910000009000603E9103000094FD7BC1A8FF0F5FD6
+ - sectname: __auth_stubs
+ segname: __TEXT
+ addr: 0x3F88
+ size: 16
+ offset: 0x3F88
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000408
+ reserved1: 0x0
+ reserved2: 0x10
+ reserved3: 0x0
+ content: 110000B031020091300240F9110A1FD7
+ - sectname: __cstring
+ segname: __TEXT
+ addr: 0x3F98
+ size: 14
+ offset: 0x3F98
+ align: 0
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x2
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 48656C6C6F2066726F6D20420A00
+ - sectname: __unwind_info
+ segname: __TEXT
+ addr: 0x3FA8
+ size: 88
+ offset: 0x3FA8
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 010000001C000000000000001C000000000000001C00000002000000683F00004000000040000000883F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000400000000
+ - cmd: LC_SEGMENT_64
+ cmdsize: 152
+ segname: __DATA_CONST
+ vmaddr: 16384
+ vmsize: 16384
+ fileoff: 16384
+ filesize: 16384
+ maxprot: 3
+ initprot: 3
+ nsects: 1
+ flags: 16
+ Sections:
+ - sectname: __auth_got
+ segname: __DATA_CONST
+ addr: 0x4000
+ size: 8
+ offset: 0x4000
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x6
+ reserved1: 0x1
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 00000000000001C0
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __LINKEDIT
+ vmaddr: 32768
+ vmsize: 16384
+ fileoff: 32768
+ filesize: 608
+ maxprot: 1
+ initprot: 1
+ nsects: 0
+ flags: 0
+ - cmd: LC_ID_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 1
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libB.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_DYLD_CHAINED_FIXUPS
+ cmdsize: 16
+ dataoff: 32768
+ datasize: 96
+ - cmd: LC_DYLD_EXPORTS_TRIE
+ cmdsize: 16
+ dataoff: 32864
+ datasize: 24
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 32896
+ nsyms: 2
+ stroff: 32936
+ strsize: 16
+ - cmd: LC_DYSYMTAB
+ cmdsize: 80
+ ilocalsym: 0
+ nlocalsym: 0
+ iextdefsym: 0
+ nextdefsym: 1
+ iundefsym: 1
+ nundefsym: 1
+ tocoff: 0
+ ntoc: 0
+ modtaboff: 0
+ nmodtab: 0
+ extrefsymoff: 0
+ nextrefsyms: 0
+ indirectsymoff: 32928
+ nindirectsyms: 2
+ extreloff: 0
+ nextrel: 0
+ locreloff: 0
+ nlocrel: 0
+ - cmd: LC_UUID
+ cmdsize: 24
+ uuid: 76B41B3A-00EC-388B-A432-478A96772CC4
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 1
+ minos: 983040
+ sdk: 983552
+ ntools: 1
+ Tools:
+ - tool: 3
+ version: 73074435
+ - cmd: LC_SOURCE_VERSION
+ cmdsize: 16
+ version: 0
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 88539136
+ compatibility_version: 65536
+ Content: '/usr/lib/libSystem.B.dylib'
+ ZeroPadBytes: 6
+ - cmd: LC_FUNCTION_STARTS
+ cmdsize: 16
+ dataoff: 32888
+ datasize: 8
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 32896
+ datasize: 0
+ - cmd: LC_CODE_SIGNATURE
+ cmdsize: 16
+ dataoff: 32960
+ datasize: 416
+ LinkEditData:
+ ExportTrie:
+ TerminalSize: 0
+ NodeOffset: 0
+ Name: ''
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 13
+ Name: _sayB
+ Flags: 0x0
+ Address: 0x3F68
+ Other: 0x0
+ ImportName: ''
+ NameList:
+ - n_strx: 2
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 16232
+ - n_strx: 8
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 256
+ n_value: 0
+ StringTable:
+ - ' '
+ - _sayB
+ - _printf
+ IndirectSymbols: [ 0x1, 0x1 ]
+ FunctionStarts: [ 0x3F68 ]
+ ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48,
+ 0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0,
+ 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x40, 0xC, 0x0,
+ 0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x70, 0x72,
+ 0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0 ]
+...
diff --git a/llvm/unittests/ExecutionEngine/Orc/Inputs/C/C_linux.yaml b/llvm/unittests/ExecutionEngine/Orc/Inputs/C/C_linux.yaml
new file mode 100644
index 0000000..3fabf9a
--- /dev/null
+++ b/llvm/unittests/ExecutionEngine/Orc/Inputs/C/C_linux.yaml
@@ -0,0 +1,450 @@
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_DYN
+ Machine: EM_X86_64
+ProgramHeaders:
+ - Type: PT_LOAD
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.property
+ LastSec: .rela.plt
+ Align: 0x1000
+ Offset: 0x0
+ - Type: PT_LOAD
+ Flags: [ PF_X, PF_R ]
+ FirstSec: .init
+ LastSec: .fini
+ VAddr: 0x1000
+ Align: 0x1000
+ Offset: 0x1000
+ - Type: PT_LOAD
+ Flags: [ PF_R ]
+ FirstSec: .eh_frame_hdr
+ LastSec: .eh_frame
+ VAddr: 0x2000
+ Align: 0x1000
+ Offset: 0x2000
+ - Type: PT_LOAD
+ Flags: [ PF_W, PF_R ]
+ FirstSec: .init_array
+ LastSec: .bss
+ VAddr: 0x3E10
+ Align: 0x1000
+ Offset: 0x2E10
+ - Type: PT_DYNAMIC
+ Flags: [ PF_W, PF_R ]
+ FirstSec: .dynamic
+ LastSec: .dynamic
+ VAddr: 0x3E20
+ Align: 0x8
+ Offset: 0x2E20
+ - Type: PT_NOTE
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.property
+ LastSec: .note.gnu.property
+ VAddr: 0x2A8
+ Align: 0x8
+ Offset: 0x2A8
+ - Type: PT_NOTE
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.build-id
+ LastSec: .note.gnu.build-id
+ VAddr: 0x2C8
+ Align: 0x4
+ Offset: 0x2C8
+ - Type: PT_GNU_PROPERTY
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.property
+ LastSec: .note.gnu.property
+ VAddr: 0x2A8
+ Align: 0x8
+ Offset: 0x2A8
+ - Type: PT_GNU_EH_FRAME
+ Flags: [ PF_R ]
+ FirstSec: .eh_frame_hdr
+ LastSec: .eh_frame_hdr
+ VAddr: 0x2000
+ Align: 0x4
+ Offset: 0x2000
+ - Type: PT_GNU_STACK
+ Flags: [ PF_W, PF_R ]
+ Align: 0x10
+ Offset: 0x0
+ - Type: PT_GNU_RELRO
+ Flags: [ PF_R ]
+ FirstSec: .init_array
+ LastSec: .got
+ VAddr: 0x3E10
+ Offset: 0x2E10
+Sections:
+ - Name: .note.gnu.property
+ Type: SHT_NOTE
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2A8
+ AddressAlign: 0x8
+ Notes:
+ - Name: GNU
+ Desc: 020000C0040000000300000000000000
+ Type: NT_GNU_PROPERTY_TYPE_0
+ - Name: .note.gnu.build-id
+ Type: SHT_NOTE
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2C8
+ AddressAlign: 0x4
+ Notes:
+ - Name: GNU
+ Desc: 0318D63E46BF31CEFF90D5C7F0475D9F78676EC8
+ Type: NT_PRPSINFO
+ - Name: .gnu.hash
+ Type: SHT_GNU_HASH
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2F0
+ Link: .dynsym
+ AddressAlign: 0x8
+ Header:
+ SymNdx: 0x8
+ Shift2: 0x6
+ BloomFilter: [ 0x400000200000 ]
+ HashBuckets: [ 0x0, 0x8 ]
+ HashValues: [ 0x7C9DCB95 ]
+ - Name: .dynsym
+ Type: SHT_DYNSYM
+ Flags: [ SHF_ALLOC ]
+ Address: 0x318
+ Link: .dynstr
+ AddressAlign: 0x8
+ - Name: .dynstr
+ Type: SHT_STRTAB
+ Flags: [ SHF_ALLOC ]
+ Address: 0x3F0
+ AddressAlign: 0x1
+ Content: "6C6962412E736F006C6962422E736F006C69625A2E736F00244F524947494E2F2E2E2F413A244F524947494E2F2E2E2F423A244F524947494E2F2E2E2F5A"
+ - Name: .rela.dyn
+ Type: SHT_RELA
+ Flags: [ SHF_ALLOC ]
+ Address: 0x498
+ Link: .dynsym
+ AddressAlign: 0x8
+ Relocations:
+ - Offset: 0x3E10
+ Type: R_X86_64_RELATIVE
+ Addend: 4432
+ - Offset: 0x3E18
+ Type: R_X86_64_RELATIVE
+ Addend: 4368
+ - Offset: 0x4030
+ Type: R_X86_64_RELATIVE
+ Addend: 16432
+ - Offset: 0x3FE0
+ Symbol: __cxa_finalize
+ Type: R_X86_64_GLOB_DAT
+ - Offset: 0x3FE8
+ Symbol: _ITM_registerTMCloneTable
+ Type: R_X86_64_GLOB_DAT
+ - Offset: 0x3FF0
+ Symbol: _ITM_deregisterTMCloneTable
+ Type: R_X86_64_GLOB_DAT
+ - Offset: 0x3FF8
+ Symbol: __gmon_start__
+ Type: R_X86_64_GLOB_DAT
+ - Name: .rela.plt
+ Type: SHT_RELA
+ Flags: [ SHF_ALLOC, SHF_INFO_LINK ]
+ Address: 0x540
+ Link: .dynsym
+ AddressAlign: 0x8
+ Info: .got.plt
+ Relocations:
+ - Offset: 0x4018
+ Symbol: sayA
+ Type: R_X86_64_JUMP_SLOT
+ - Offset: 0x4020
+ Symbol: sayB
+ Type: R_X86_64_JUMP_SLOT
+ - Offset: 0x4028
+ Symbol: sayZ
+ Type: R_X86_64_JUMP_SLOT
+ - Name: .init
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1000
+ AddressAlign: 0x4
+ Offset: 0x1000
+ Content: F30F1EFA4883EC08488B05E92F00004885C07402FFD04883C408C3
+ - Name: .plt
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1020
+ AddressAlign: 0x10
+ EntSize: 0x10
+ Content: FF35E22F0000F2FF25E32F00000F1F00F30F1EFA6800000000F2E9E1FFFFFF90F30F1EFA6801000000F2E9D1FFFFFF90F30F1EFA6802000000F2E9C1FFFFFF90
+ - Name: .plt.got
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1060
+ AddressAlign: 0x10
+ EntSize: 0x10
+ Content: F30F1EFAF2FF25752F00000F1F440000
+ - Name: .plt.sec
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1070
+ AddressAlign: 0x10
+ EntSize: 0x10
+ Content: F30F1EFAF2FF259D2F00000F1F440000F30F1EFAF2FF25952F00000F1F440000F30F1EFAF2FF258D2F00000F1F440000
+ - Name: .text
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x10A0
+ AddressAlign: 0x10
+ Content: 488D3D912F0000488D058A2F00004839F87415488B05362F00004885C07409FFE00F1F8000000000C30F1F8000000000488D3D612F0000488D355A2F00004829FE4889F048C1EE3F48C1F8034801C648D1FE7414488B05ED2E00004885C07408FFE0660F1F440000C30F1F8000000000F30F1EFA803D1D2F000000752B5548833DBA2E0000004889E5740C488B3DFE2E0000E829FFFFFFE864FFFFFFC605F52E0000015DC30F1F00C30F1F8000000000F30F1EFAE977FFFFFFF30F1EFA554889E5B800000000E805FFFFFFB800000000E80BFFFFFFB800000000E811FFFFFF905DC3
+ - Name: .fini
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1184
+ AddressAlign: 0x4
+ Content: F30F1EFA4883EC084883C408C3
+ - Name: .eh_frame_hdr
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2000
+ AddressAlign: 0x4
+ Offset: 0x2000
+ Content: 011B033B2C0000000400000020F0FFFF4800000060F0FFFF7000000070F0FFFF8800000059F1FFFFA0000000
+ - Name: .eh_frame
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2030
+ AddressAlign: 0x8
+ Content: 1400000000000000017A5200017810011B0C070890010000240000001C000000D0EFFFFF40000000000E10460E184A0F0B770880003F1A3A2A332422000000001400000044000000E8EFFFFF100000000000000000000000140000005C000000E0EFFFFF3000000000000000000000001C00000074000000B1F0FFFF2900000000450E108602430D06600C070800000000000000
+ - Name: .init_array
+ Type: SHT_INIT_ARRAY
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x3E10
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Offset: 0x2E10
+ Content: '5011000000000000'
+ - Name: .fini_array
+ Type: SHT_FINI_ARRAY
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x3E18
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Content: '1011000000000000'
+ - Name: .dynamic
+ Type: SHT_DYNAMIC
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x3E20
+ Link: .dynstr
+ AddressAlign: 0x8
+ Entries:
+ - Tag: DT_NEEDED
+ Value: 0x0
+ - Tag: DT_NEEDED
+ Value: 0x8
+ - Tag: DT_NEEDED
+ Value: 0x10
+ - Tag: DT_RUNPATH
+ Value: 0x18
+ - Tag: DT_INIT
+ Value: 0x1000
+ - Tag: DT_FINI
+ Value: 0x1184
+ - Tag: DT_INIT_ARRAY
+ Value: 0x3E10
+ - Tag: DT_INIT_ARRAYSZ
+ Value: 0x8
+ - Tag: DT_FINI_ARRAY
+ Value: 0x3E18
+ - Tag: DT_FINI_ARRAYSZ
+ Value: 0x8
+ - Tag: DT_GNU_HASH
+ Value: 0x2F0
+ - Tag: DT_STRTAB
+ Value: 0x3F0
+ - Tag: DT_SYMTAB
+ Value: 0x318
+ - Tag: DT_STRSZ
+ Value: 0xA8
+ - Tag: DT_SYMENT
+ Value: 0x18
+ - Tag: DT_PLTGOT
+ Value: 0x4000
+ - Tag: DT_PLTRELSZ
+ Value: 0x48
+ - Tag: DT_PLTREL
+ Value: 0x7
+ - Tag: DT_JMPREL
+ Value: 0x540
+ - Tag: DT_RELA
+ Value: 0x498
+ - Tag: DT_RELASZ
+ Value: 0xA8
+ - Tag: DT_RELAENT
+ Value: 0x18
+ - Tag: DT_RELACOUNT
+ Value: 0x3
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Name: .got
+ Type: SHT_PROGBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x3FE0
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Content: '0000000000000000000000000000000000000000000000000000000000000000'
+ - Name: .got.plt
+ Type: SHT_PROGBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x4000
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Content: '203E00000000000000000000000000000000000000000000301000000000000040100000000000005010000000000000'
+ - Name: .data
+ Type: SHT_PROGBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x4030
+ AddressAlign: 0x8
+ Content: '3040000000000000'
+ - Name: .bss
+ Type: SHT_NOBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x4038
+ AddressAlign: 0x1
+ Size: 0x8
+ - Name: .comment
+ Type: SHT_PROGBITS
+ Flags: [ SHF_MERGE, SHF_STRINGS ]
+ AddressAlign: 0x1
+ EntSize: 0x1
+ Content: 4743433A20285562756E74752031312E342E302D317562756E7475317E32322E30342E32292031312E342E3000
+Symbols:
+ - Name: crtstuff.c
+ Type: STT_FILE
+ Index: SHN_ABS
+ - Name: deregister_tm_clones
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x10A0
+ - Name: register_tm_clones
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x10D0
+ - Name: __do_global_dtors_aux
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x1110
+ - Name: completed.0
+ Type: STT_OBJECT
+ Section: .bss
+ Value: 0x4038
+ Size: 0x1
+ - Name: __do_global_dtors_aux_fini_array_entry
+ Type: STT_OBJECT
+ Section: .fini_array
+ Value: 0x3E18
+ - Name: frame_dummy
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x1150
+ - Name: __frame_dummy_init_array_entry
+ Type: STT_OBJECT
+ Section: .init_array
+ Value: 0x3E10
+ - Name: libC.c
+ Type: STT_FILE
+ Index: SHN_ABS
+ - Name: 'crtstuff.c (1)'
+ Type: STT_FILE
+ Index: SHN_ABS
+ - Name: __FRAME_END__
+ Type: STT_OBJECT
+ Section: .eh_frame
+ Value: 0x20C0
+ - Type: STT_FILE
+ Index: SHN_ABS
+ - Name: _DYNAMIC
+ Type: STT_OBJECT
+ Section: .dynamic
+ Value: 0x3E20
+ - Name: __TMC_END__
+ Type: STT_OBJECT
+ Section: .data
+ Value: 0x4038
+ - Name: __dso_handle
+ Type: STT_OBJECT
+ Section: .data
+ Value: 0x4030
+ - Name: _init
+ Type: STT_FUNC
+ Section: .init
+ Value: 0x1000
+ - Name: __GNU_EH_FRAME_HDR
+ Section: .eh_frame_hdr
+ Value: 0x2000
+ - Name: _fini
+ Type: STT_FUNC
+ Section: .fini
+ Value: 0x1184
+ - Name: _GLOBAL_OFFSET_TABLE_
+ Type: STT_OBJECT
+ Section: .got.plt
+ Value: 0x4000
+ - Name: __cxa_finalize
+ Binding: STB_WEAK
+ - Name: sayC
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ Value: 0x1159
+ Size: 0x29
+ - Name: _ITM_registerTMCloneTable
+ Binding: STB_WEAK
+ - Name: _ITM_deregisterTMCloneTable
+ Binding: STB_WEAK
+ - Name: sayA
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: sayB
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: sayZ
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: __gmon_start__
+ Binding: STB_WEAK
+DynamicSymbols:
+ - Name: __cxa_finalize
+ Binding: STB_WEAK
+ - Name: _ITM_registerTMCloneTable
+ Binding: STB_WEAK
+ - Name: _ITM_deregisterTMCloneTable
+ Binding: STB_WEAK
+ - Name: sayA
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: sayB
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: sayZ
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: __gmon_start__
+ Binding: STB_WEAK
+ - Name: sayC
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ Value: 0x1159
+ Size: 0x29
+...
diff --git a/llvm/unittests/ExecutionEngine/Orc/Inputs/C/C_macho.yaml b/llvm/unittests/ExecutionEngine/Orc/Inputs/C/C_macho.yaml
new file mode 100644
index 0000000..ba33483
--- /dev/null
+++ b/llvm/unittests/ExecutionEngine/Orc/Inputs/C/C_macho.yaml
@@ -0,0 +1,870 @@
+--- !fat-mach-o
+FatHeader:
+ magic: 0xCAFEBABE
+ nfat_arch: 3
+FatArchs:
+ - cputype: 0x1000007
+ cpusubtype: 0x3
+ offset: 0x1000
+ size: 8456
+ align: 12
+ - cputype: 0x100000C
+ cpusubtype: 0x0
+ offset: 0x4000
+ size: 33456
+ align: 14
+ - cputype: 0x100000C
+ cpusubtype: 0x80000002
+ offset: 0x10000
+ size: 33456
+ align: 14
+Slices:
+ - !mach-o
+ FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x1000007
+ cpusubtype: 0x3
+ filetype: 0x6
+ ncmds: 20
+ sizeofcmds: 1120
+ flags: 0x100085
+ reserved: 0x0
+ LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 312
+ segname: __TEXT
+ vmaddr: 0
+ vmsize: 4096
+ fileoff: 0
+ filesize: 4096
+ maxprot: 5
+ initprot: 5
+ nsects: 3
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0xF70
+ size: 27
+ offset: 0xF70
+ align: 4
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 554889E5B000E811000000B000E810000000B000E80F0000005DC3
+ - sectname: __stubs
+ segname: __TEXT
+ addr: 0xF8C
+ size: 18
+ offset: 0xF8C
+ align: 1
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000408
+ reserved1: 0x0
+ reserved2: 0x6
+ reserved3: 0x0
+ content: FF256E000000FF2570000000FF2572000000
+ - sectname: __unwind_info
+ segname: __TEXT
+ addr: 0xFA0
+ size: 88
+ offset: 0xFA0
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 010000001C000000000000001C000000000000001C00000002000000700F000040000000400000008B0F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000100000000
+ - cmd: LC_SEGMENT_64
+ cmdsize: 152
+ segname: __DATA_CONST
+ vmaddr: 4096
+ vmsize: 4096
+ fileoff: 4096
+ filesize: 4096
+ maxprot: 3
+ initprot: 3
+ nsects: 1
+ flags: 16
+ Sections:
+ - sectname: __got
+ segname: __DATA_CONST
+ addr: 0x1000
+ size: 24
+ offset: 0x1000
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x6
+ reserved1: 0x3
+ reserved2: 0x0
+ reserved3: 0x0
+ content: '000000000000108001000000000010800200000000000080'
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __LINKEDIT
+ vmaddr: 8192
+ vmsize: 4096
+ fileoff: 8192
+ filesize: 264
+ maxprot: 1
+ initprot: 1
+ nsects: 0
+ flags: 0
+ - cmd: LC_ID_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 1
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libC.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_DYLD_CHAINED_FIXUPS
+ cmdsize: 16
+ dataoff: 8192
+ datasize: 112
+ - cmd: LC_DYLD_EXPORTS_TRIE
+ cmdsize: 16
+ dataoff: 8304
+ datasize: 24
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 8336
+ nsyms: 4
+ stroff: 8424
+ strsize: 32
+ - cmd: LC_DYSYMTAB
+ cmdsize: 80
+ ilocalsym: 0
+ nlocalsym: 0
+ iextdefsym: 0
+ nextdefsym: 1
+ iundefsym: 1
+ nundefsym: 3
+ tocoff: 0
+ ntoc: 0
+ modtaboff: 0
+ nmodtab: 0
+ extrefsymoff: 0
+ nextrefsyms: 0
+ indirectsymoff: 8400
+ nindirectsyms: 6
+ extreloff: 0
+ nextrel: 0
+ locreloff: 0
+ nlocrel: 0
+ - cmd: LC_UUID
+ cmdsize: 24
+ uuid: 2AA1F9E9-F250-366F-B382-51A91DE06BED
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 1
+ minos: 983040
+ sdk: 983552
+ ntools: 1
+ Tools:
+ - tool: 3
+ version: 73074435
+ - cmd: LC_SOURCE_VERSION
+ cmdsize: 16
+ version: 0
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libA.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libB.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libZ.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 88539136
+ compatibility_version: 65536
+ Content: '/usr/lib/libSystem.B.dylib'
+ ZeroPadBytes: 6
+ - cmd: LC_RPATH
+ cmdsize: 32
+ path: 12
+ Content: '@loader_path/../A'
+ ZeroPadBytes: 3
+ - cmd: LC_RPATH
+ cmdsize: 32
+ path: 12
+ Content: '@loader_path/../B'
+ ZeroPadBytes: 3
+ - cmd: LC_RPATH
+ cmdsize: 32
+ path: 12
+ Content: '@loader_path/../Z'
+ ZeroPadBytes: 3
+ - cmd: LC_FUNCTION_STARTS
+ cmdsize: 16
+ dataoff: 8328
+ datasize: 8
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 8336
+ datasize: 0
+ LinkEditData:
+ ExportTrie:
+ TerminalSize: 0
+ NodeOffset: 0
+ Name: ''
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 13
+ Name: _sayC
+ Flags: 0x0
+ Address: 0xF70
+ Other: 0x0
+ ImportName: ''
+ NameList:
+ - n_strx: 2
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 3952
+ - n_strx: 8
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 256
+ n_value: 0
+ - n_strx: 14
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 20
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 768
+ n_value: 0
+ StringTable:
+ - ' '
+ - _sayC
+ - _sayA
+ - _sayB
+ - _sayZ
+ - ''
+ - ''
+ - ''
+ - ''
+ - ''
+ - ''
+ IndirectSymbols: [ 0x1, 0x2, 0x3, 0x1, 0x2, 0x3 ]
+ FunctionStarts: [ 0xF70 ]
+ ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48,
+ 0x0, 0x0, 0x0, 0x58, 0x0, 0x0, 0x0, 0x3, 0x0,
+ 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x10, 0x6, 0x0,
+ 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0,
+ 0x0, 0x2, 0xE, 0x0, 0x0, 0x3, 0x1A, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x73, 0x61, 0x79,
+ 0x41, 0x0, 0x5F, 0x73, 0x61, 0x79, 0x42, 0x0,
+ 0x5F, 0x73, 0x61, 0x79, 0x5A, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0 ]
+ - !mach-o
+ FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x100000C
+ cpusubtype: 0x0
+ filetype: 0x6
+ ncmds: 21
+ sizeofcmds: 1136
+ flags: 0x100085
+ reserved: 0x0
+ LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 312
+ segname: __TEXT
+ vmaddr: 0
+ vmsize: 16384
+ fileoff: 0
+ filesize: 16384
+ maxprot: 5
+ initprot: 5
+ nsects: 3
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0x3F68
+ size: 28
+ offset: 0x3F68
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: FD7BBFA9FD030091050000940700009409000094FD7BC1A8C0035FD6
+ - sectname: __stubs
+ segname: __TEXT
+ addr: 0x3F84
+ size: 36
+ offset: 0x3F84
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000408
+ reserved1: 0x0
+ reserved2: 0xC
+ reserved3: 0x0
+ content: 100000B0100240F900021FD6100000B0100640F900021FD6100000B0100A40F900021FD6
+ - sectname: __unwind_info
+ segname: __TEXT
+ addr: 0x3FA8
+ size: 88
+ offset: 0x3FA8
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 010000001C000000000000001C000000000000001C00000002000000683F00004000000040000000843F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000400000000
+ - cmd: LC_SEGMENT_64
+ cmdsize: 152
+ segname: __DATA_CONST
+ vmaddr: 16384
+ vmsize: 16384
+ fileoff: 16384
+ filesize: 16384
+ maxprot: 3
+ initprot: 3
+ nsects: 1
+ flags: 16
+ Sections:
+ - sectname: __got
+ segname: __DATA_CONST
+ addr: 0x4000
+ size: 24
+ offset: 0x4000
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x6
+ reserved1: 0x3
+ reserved2: 0x0
+ reserved3: 0x0
+ content: '000000000000108001000000000010800200000000000080'
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __LINKEDIT
+ vmaddr: 32768
+ vmsize: 16384
+ fileoff: 32768
+ filesize: 688
+ maxprot: 1
+ initprot: 1
+ nsects: 0
+ flags: 0
+ - cmd: LC_ID_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 1
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libC.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_DYLD_CHAINED_FIXUPS
+ cmdsize: 16
+ dataoff: 32768
+ datasize: 112
+ - cmd: LC_DYLD_EXPORTS_TRIE
+ cmdsize: 16
+ dataoff: 32880
+ datasize: 24
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 32912
+ nsyms: 4
+ stroff: 33000
+ strsize: 32
+ - cmd: LC_DYSYMTAB
+ cmdsize: 80
+ ilocalsym: 0
+ nlocalsym: 0
+ iextdefsym: 0
+ nextdefsym: 1
+ iundefsym: 1
+ nundefsym: 3
+ tocoff: 0
+ ntoc: 0
+ modtaboff: 0
+ nmodtab: 0
+ extrefsymoff: 0
+ nextrefsyms: 0
+ indirectsymoff: 32976
+ nindirectsyms: 6
+ extreloff: 0
+ nextrel: 0
+ locreloff: 0
+ nlocrel: 0
+ - cmd: LC_UUID
+ cmdsize: 24
+ uuid: 02B69690-925D-35EE-A8AB-6D99813D2A16
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 1
+ minos: 983040
+ sdk: 983552
+ ntools: 1
+ Tools:
+ - tool: 3
+ version: 73074435
+ - cmd: LC_SOURCE_VERSION
+ cmdsize: 16
+ version: 0
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libA.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libB.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libZ.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 88539136
+ compatibility_version: 65536
+ Content: '/usr/lib/libSystem.B.dylib'
+ ZeroPadBytes: 6
+ - cmd: LC_RPATH
+ cmdsize: 32
+ path: 12
+ Content: '@loader_path/../A'
+ ZeroPadBytes: 3
+ - cmd: LC_RPATH
+ cmdsize: 32
+ path: 12
+ Content: '@loader_path/../B'
+ ZeroPadBytes: 3
+ - cmd: LC_RPATH
+ cmdsize: 32
+ path: 12
+ Content: '@loader_path/../Z'
+ ZeroPadBytes: 3
+ - cmd: LC_FUNCTION_STARTS
+ cmdsize: 16
+ dataoff: 32904
+ datasize: 8
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 32912
+ datasize: 0
+ - cmd: LC_CODE_SIGNATURE
+ cmdsize: 16
+ dataoff: 33040
+ datasize: 416
+ LinkEditData:
+ ExportTrie:
+ TerminalSize: 0
+ NodeOffset: 0
+ Name: ''
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 13
+ Name: _sayC
+ Flags: 0x0
+ Address: 0x3F68
+ Other: 0x0
+ ImportName: ''
+ NameList:
+ - n_strx: 2
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 16232
+ - n_strx: 8
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 256
+ n_value: 0
+ - n_strx: 14
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 20
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 768
+ n_value: 0
+ StringTable:
+ - ' '
+ - _sayC
+ - _sayA
+ - _sayB
+ - _sayZ
+ - ''
+ - ''
+ - ''
+ - ''
+ - ''
+ - ''
+ IndirectSymbols: [ 0x1, 0x2, 0x3, 0x1, 0x2, 0x3 ]
+ FunctionStarts: [ 0x3F68 ]
+ ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48,
+ 0x0, 0x0, 0x0, 0x58, 0x0, 0x0, 0x0, 0x3, 0x0,
+ 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x40, 0x6, 0x0,
+ 0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0,
+ 0x0, 0x2, 0xE, 0x0, 0x0, 0x3, 0x1A, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x73, 0x61, 0x79,
+ 0x41, 0x0, 0x5F, 0x73, 0x61, 0x79, 0x42, 0x0,
+ 0x5F, 0x73, 0x61, 0x79, 0x5A, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0 ]
+ - !mach-o
+ FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x100000C
+ cpusubtype: 0x80000002
+ filetype: 0x6
+ ncmds: 21
+ sizeofcmds: 1136
+ flags: 0x100085
+ reserved: 0x0
+ LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 312
+ segname: __TEXT
+ vmaddr: 0
+ vmsize: 16384
+ fileoff: 0
+ filesize: 16384
+ maxprot: 5
+ initprot: 5
+ nsects: 3
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0x3F58
+ size: 32
+ offset: 0x3F58
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 7F2303D5FD7BBFA9FD03009105000094080000940B000094FD7BC1A8FF0F5FD6
+ - sectname: __auth_stubs
+ segname: __TEXT
+ addr: 0x3F78
+ size: 48
+ offset: 0x3F78
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000408
+ reserved1: 0x0
+ reserved2: 0x10
+ reserved3: 0x0
+ content: 110000B031020091300240F9110A1FD7110000B031220091300240F9110A1FD7110000B031420091300240F9110A1FD7
+ - sectname: __unwind_info
+ segname: __TEXT
+ addr: 0x3FA8
+ size: 88
+ offset: 0x3FA8
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 010000001C000000000000001C000000000000001C00000002000000583F00004000000040000000783F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000400000000
+ - cmd: LC_SEGMENT_64
+ cmdsize: 152
+ segname: __DATA_CONST
+ vmaddr: 16384
+ vmsize: 16384
+ fileoff: 16384
+ filesize: 16384
+ maxprot: 3
+ initprot: 3
+ nsects: 1
+ flags: 16
+ Sections:
+ - sectname: __auth_got
+ segname: __DATA_CONST
+ addr: 0x4000
+ size: 24
+ offset: 0x4000
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x6
+ reserved1: 0x3
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 00000000000009C001000000000009C002000000000001C0
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __LINKEDIT
+ vmaddr: 32768
+ vmsize: 16384
+ fileoff: 32768
+ filesize: 688
+ maxprot: 1
+ initprot: 1
+ nsects: 0
+ flags: 0
+ - cmd: LC_ID_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 1
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libC.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_DYLD_CHAINED_FIXUPS
+ cmdsize: 16
+ dataoff: 32768
+ datasize: 112
+ - cmd: LC_DYLD_EXPORTS_TRIE
+ cmdsize: 16
+ dataoff: 32880
+ datasize: 24
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 32912
+ nsyms: 4
+ stroff: 33000
+ strsize: 32
+ - cmd: LC_DYSYMTAB
+ cmdsize: 80
+ ilocalsym: 0
+ nlocalsym: 0
+ iextdefsym: 0
+ nextdefsym: 1
+ iundefsym: 1
+ nundefsym: 3
+ tocoff: 0
+ ntoc: 0
+ modtaboff: 0
+ nmodtab: 0
+ extrefsymoff: 0
+ nextrefsyms: 0
+ indirectsymoff: 32976
+ nindirectsyms: 6
+ extreloff: 0
+ nextrel: 0
+ locreloff: 0
+ nlocrel: 0
+ - cmd: LC_UUID
+ cmdsize: 24
+ uuid: F54076AA-8888-3DED-8BDF-BC7FB3E6FE8A
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 1
+ minos: 983040
+ sdk: 983552
+ ntools: 1
+ Tools:
+ - tool: 3
+ version: 73074435
+ - cmd: LC_SOURCE_VERSION
+ cmdsize: 16
+ version: 0
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libA.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libB.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libZ.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 88539136
+ compatibility_version: 65536
+ Content: '/usr/lib/libSystem.B.dylib'
+ ZeroPadBytes: 6
+ - cmd: LC_RPATH
+ cmdsize: 32
+ path: 12
+ Content: '@loader_path/../A'
+ ZeroPadBytes: 3
+ - cmd: LC_RPATH
+ cmdsize: 32
+ path: 12
+ Content: '@loader_path/../B'
+ ZeroPadBytes: 3
+ - cmd: LC_RPATH
+ cmdsize: 32
+ path: 12
+ Content: '@loader_path/../Z'
+ ZeroPadBytes: 3
+ - cmd: LC_FUNCTION_STARTS
+ cmdsize: 16
+ dataoff: 32904
+ datasize: 8
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 32912
+ datasize: 0
+ - cmd: LC_CODE_SIGNATURE
+ cmdsize: 16
+ dataoff: 33040
+ datasize: 416
+ LinkEditData:
+ ExportTrie:
+ TerminalSize: 0
+ NodeOffset: 0
+ Name: ''
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 13
+ Name: _sayC
+ Flags: 0x0
+ Address: 0x3F58
+ Other: 0x0
+ ImportName: ''
+ NameList:
+ - n_strx: 2
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 16216
+ - n_strx: 8
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 256
+ n_value: 0
+ - n_strx: 14
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 20
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 768
+ n_value: 0
+ StringTable:
+ - ' '
+ - _sayC
+ - _sayA
+ - _sayB
+ - _sayZ
+ - ''
+ - ''
+ - ''
+ - ''
+ - ''
+ - ''
+ IndirectSymbols: [ 0x1, 0x2, 0x3, 0x1, 0x2, 0x3 ]
+ FunctionStarts: [ 0x3F58 ]
+ ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48,
+ 0x0, 0x0, 0x0, 0x58, 0x0, 0x0, 0x0, 0x3, 0x0,
+ 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x40, 0xC, 0x0,
+ 0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0,
+ 0x0, 0x2, 0xE, 0x0, 0x0, 0x3, 0x1A, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x73, 0x61, 0x79,
+ 0x41, 0x0, 0x5F, 0x73, 0x61, 0x79, 0x42, 0x0,
+ 0x5F, 0x73, 0x61, 0x79, 0x5A, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0 ]
+...
diff --git a/llvm/unittests/ExecutionEngine/Orc/Inputs/Z/Z_linux.yaml b/llvm/unittests/ExecutionEngine/Orc/Inputs/Z/Z_linux.yaml
new file mode 100644
index 0000000..5561f29
--- /dev/null
+++ b/llvm/unittests/ExecutionEngine/Orc/Inputs/Z/Z_linux.yaml
@@ -0,0 +1,460 @@
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_DYN
+ Machine: EM_X86_64
+ProgramHeaders:
+ - Type: PT_LOAD
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.property
+ LastSec: .rela.plt
+ Align: 0x1000
+ Offset: 0x0
+ - Type: PT_LOAD
+ Flags: [ PF_X, PF_R ]
+ FirstSec: .init
+ LastSec: .fini
+ VAddr: 0x1000
+ Align: 0x1000
+ Offset: 0x1000
+ - Type: PT_LOAD
+ Flags: [ PF_R ]
+ FirstSec: .rodata
+ LastSec: .eh_frame
+ VAddr: 0x2000
+ Align: 0x1000
+ Offset: 0x2000
+ - Type: PT_LOAD
+ Flags: [ PF_W, PF_R ]
+ FirstSec: .init_array
+ LastSec: .bss
+ VAddr: 0x3E10
+ Align: 0x1000
+ Offset: 0x2E10
+ - Type: PT_DYNAMIC
+ Flags: [ PF_W, PF_R ]
+ FirstSec: .dynamic
+ LastSec: .dynamic
+ VAddr: 0x3E20
+ Align: 0x8
+ Offset: 0x2E20
+ - Type: PT_NOTE
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.property
+ LastSec: .note.gnu.property
+ VAddr: 0x2A8
+ Align: 0x8
+ Offset: 0x2A8
+ - Type: PT_NOTE
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.build-id
+ LastSec: .note.gnu.build-id
+ VAddr: 0x2C8
+ Align: 0x4
+ Offset: 0x2C8
+ - Type: PT_GNU_PROPERTY
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.property
+ LastSec: .note.gnu.property
+ VAddr: 0x2A8
+ Align: 0x8
+ Offset: 0x2A8
+ - Type: PT_GNU_EH_FRAME
+ Flags: [ PF_R ]
+ FirstSec: .eh_frame_hdr
+ LastSec: .eh_frame_hdr
+ VAddr: 0x2010
+ Align: 0x4
+ Offset: 0x2010
+ - Type: PT_GNU_STACK
+ Flags: [ PF_W, PF_R ]
+ Align: 0x10
+ Offset: 0x0
+ - Type: PT_GNU_RELRO
+ Flags: [ PF_R ]
+ FirstSec: .init_array
+ LastSec: .got
+ VAddr: 0x3E10
+ Offset: 0x2E10
+Sections:
+ - Name: .note.gnu.property
+ Type: SHT_NOTE
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2A8
+ AddressAlign: 0x8
+ Notes:
+ - Name: GNU
+ Desc: 020000C0040000000300000000000000
+ Type: NT_GNU_PROPERTY_TYPE_0
+ - Name: .note.gnu.build-id
+ Type: SHT_NOTE
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2C8
+ AddressAlign: 0x4
+ Notes:
+ - Name: GNU
+ Desc: 640A4A3AC0DF6BA3DAC3B51CCD727245117E0B30
+ Type: NT_PRPSINFO
+ - Name: .gnu.hash
+ Type: SHT_GNU_HASH
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2F0
+ Link: .dynsym
+ AddressAlign: 0x8
+ Header:
+ SymNdx: 0x6
+ Shift2: 0x6
+ BloomFilter: [ 0x500000000000 ]
+ HashBuckets: [ 0x6, 0x0 ]
+ HashValues: [ 0x7C9DCBAD ]
+ - Name: .dynsym
+ Type: SHT_DYNSYM
+ Flags: [ SHF_ALLOC ]
+ Address: 0x318
+ Link: .dynstr
+ AddressAlign: 0x8
+ - Name: .dynstr
+ Type: SHT_STRTAB
+ Flags: [ SHF_ALLOC ]
+ Address: 0x3C0
+ AddressAlign: 0x1
+ - Name: .gnu.version
+ Type: SHT_GNU_versym
+ Flags: [ SHF_ALLOC ]
+ Address: 0x436
+ Link: .dynsym
+ AddressAlign: 0x2
+ Entries: [ 0, 1, 2, 1, 1, 2, 1 ]
+ - Name: .gnu.version_r
+ Type: SHT_GNU_verneed
+ Flags: [ SHF_ALLOC ]
+ Address: 0x448
+ Link: .dynstr
+ AddressAlign: 0x8
+ Dependencies:
+ - Version: 1
+ File: libc.so.6
+ Entries:
+ - Name: GLIBC_2.2.5
+ Hash: 157882997
+ Flags: 0
+ Other: 2
+ - Name: .rela.dyn
+ Type: SHT_RELA
+ Flags: [ SHF_ALLOC ]
+ Address: 0x468
+ Link: .dynsym
+ AddressAlign: 0x8
+ Relocations:
+ - Offset: 0x3E10
+ Type: R_X86_64_RELATIVE
+ Addend: 4368
+ - Offset: 0x3E18
+ Type: R_X86_64_RELATIVE
+ Addend: 4304
+ - Offset: 0x4020
+ Type: R_X86_64_RELATIVE
+ Addend: 16416
+ - Offset: 0x3FE0
+ Symbol: _ITM_deregisterTMCloneTable
+ Type: R_X86_64_GLOB_DAT
+ - Offset: 0x3FE8
+ Symbol: __gmon_start__
+ Type: R_X86_64_GLOB_DAT
+ - Offset: 0x3FF0
+ Symbol: _ITM_registerTMCloneTable
+ Type: R_X86_64_GLOB_DAT
+ - Offset: 0x3FF8
+ Symbol: __cxa_finalize
+ Type: R_X86_64_GLOB_DAT
+ - Name: .rela.plt
+ Type: SHT_RELA
+ Flags: [ SHF_ALLOC, SHF_INFO_LINK ]
+ Address: 0x510
+ Link: .dynsym
+ AddressAlign: 0x8
+ Info: .got.plt
+ Relocations:
+ - Offset: 0x4018
+ Symbol: puts
+ Type: R_X86_64_JUMP_SLOT
+ - Name: .init
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1000
+ AddressAlign: 0x4
+ Offset: 0x1000
+ Content: F30F1EFA4883EC08488B05D92F00004885C07402FFD04883C408C3
+ - Name: .plt
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1020
+ AddressAlign: 0x10
+ EntSize: 0x10
+ Content: FF35E22F0000F2FF25E32F00000F1F00F30F1EFA6800000000F2E9E1FFFFFF90
+ - Name: .plt.got
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1040
+ AddressAlign: 0x10
+ EntSize: 0x10
+ Content: F30F1EFAF2FF25AD2F00000F1F440000
+ - Name: .plt.sec
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1050
+ AddressAlign: 0x10
+ EntSize: 0x10
+ Content: F30F1EFAF2FF25BD2F00000F1F440000
+ - Name: .text
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1060
+ AddressAlign: 0x10
+ Content: 488D3DC12F0000488D05BA2F00004839F87415488B05662F00004885C07409FFE00F1F8000000000C30F1F8000000000488D3D912F0000488D358A2F00004829FE4889F048C1EE3F48C1F8034801C648D1FE7414488B05352F00004885C07408FFE0660F1F440000C30F1F8000000000F30F1EFA803D4D2F000000752B5548833D122F0000004889E5740C488B3D2E2F0000E849FFFFFFE864FFFFFFC605252F0000015DC30F1F00C30F1F8000000000F30F1EFAE977FFFFFFF30F1EFA554889E5488D05D80E00004889C7E820FFFFFF905DC3
+ - Name: .fini
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1134
+ AddressAlign: 0x4
+ Content: F30F1EFA4883EC084883C408C3
+ - Name: .rodata
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2000
+ AddressAlign: 0x1
+ Offset: 0x2000
+ Content: 48656C6C6F2066726F6D205A00
+ - Name: .eh_frame_hdr
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2010
+ AddressAlign: 0x4
+ Content: 011B033B2C0000000400000010F0FFFF4800000030F0FFFF7000000040F0FFFF8800000009F1FFFFA0000000
+ - Name: .eh_frame
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2040
+ AddressAlign: 0x8
+ Content: 1400000000000000017A5200017810011B0C070890010000240000001C000000C0EFFFFF20000000000E10460E184A0F0B770880003F1A3A2A332422000000001400000044000000B8EFFFFF100000000000000000000000140000005C000000B0EFFFFF1000000000000000000000001C0000007400000061F0FFFF1A00000000450E108602430D06510C070800000000000000
+ - Name: .init_array
+ Type: SHT_INIT_ARRAY
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x3E10
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Offset: 0x2E10
+ Content: '1011000000000000'
+ - Name: .fini_array
+ Type: SHT_FINI_ARRAY
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x3E18
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Content: D010000000000000
+ - Name: .dynamic
+ Type: SHT_DYNAMIC
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x3E20
+ Link: .dynstr
+ AddressAlign: 0x8
+ Entries:
+ - Tag: DT_NEEDED
+ Value: 0x5F
+ - Tag: DT_INIT
+ Value: 0x1000
+ - Tag: DT_FINI
+ Value: 0x1134
+ - Tag: DT_INIT_ARRAY
+ Value: 0x3E10
+ - Tag: DT_INIT_ARRAYSZ
+ Value: 0x8
+ - Tag: DT_FINI_ARRAY
+ Value: 0x3E18
+ - Tag: DT_FINI_ARRAYSZ
+ Value: 0x8
+ - Tag: DT_GNU_HASH
+ Value: 0x2F0
+ - Tag: DT_STRTAB
+ Value: 0x3C0
+ - Tag: DT_SYMTAB
+ Value: 0x318
+ - Tag: DT_STRSZ
+ Value: 0x75
+ - Tag: DT_SYMENT
+ Value: 0x18
+ - Tag: DT_PLTGOT
+ Value: 0x4000
+ - Tag: DT_PLTRELSZ
+ Value: 0x18
+ - Tag: DT_PLTREL
+ Value: 0x7
+ - Tag: DT_JMPREL
+ Value: 0x510
+ - Tag: DT_RELA
+ Value: 0x468
+ - Tag: DT_RELASZ
+ Value: 0xA8
+ - Tag: DT_RELAENT
+ Value: 0x18
+ - Tag: DT_VERNEED
+ Value: 0x448
+ - Tag: DT_VERNEEDNUM
+ Value: 0x1
+ - Tag: DT_VERSYM
+ Value: 0x436
+ - Tag: DT_RELACOUNT
+ Value: 0x3
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Name: .got
+ Type: SHT_PROGBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x3FE0
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Content: '0000000000000000000000000000000000000000000000000000000000000000'
+ - Name: .got.plt
+ Type: SHT_PROGBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x4000
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Content: '203E000000000000000000000000000000000000000000003010000000000000'
+ - Name: .data
+ Type: SHT_PROGBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x4020
+ AddressAlign: 0x8
+ Content: '2040000000000000'
+ - Name: .bss
+ Type: SHT_NOBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x4028
+ AddressAlign: 0x1
+ Size: 0x8
+ - Name: .comment
+ Type: SHT_PROGBITS
+ Flags: [ SHF_MERGE, SHF_STRINGS ]
+ AddressAlign: 0x1
+ EntSize: 0x1
+ Content: 4743433A20285562756E74752031312E342E302D317562756E7475317E32322E30342E32292031312E342E3000
+Symbols:
+ - Name: crtstuff.c
+ Type: STT_FILE
+ Index: SHN_ABS
+ - Name: deregister_tm_clones
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x1060
+ - Name: register_tm_clones
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x1090
+ - Name: __do_global_dtors_aux
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x10D0
+ - Name: completed.0
+ Type: STT_OBJECT
+ Section: .bss
+ Value: 0x4028
+ Size: 0x1
+ - Name: __do_global_dtors_aux_fini_array_entry
+ Type: STT_OBJECT
+ Section: .fini_array
+ Value: 0x3E18
+ - Name: frame_dummy
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x1110
+ - Name: __frame_dummy_init_array_entry
+ Type: STT_OBJECT
+ Section: .init_array
+ Value: 0x3E10
+ - Name: libZ.c
+ Type: STT_FILE
+ Index: SHN_ABS
+ - Name: 'crtstuff.c (1)'
+ Type: STT_FILE
+ Index: SHN_ABS
+ - Name: __FRAME_END__
+ Type: STT_OBJECT
+ Section: .eh_frame
+ Value: 0x20D0
+ - Type: STT_FILE
+ Index: SHN_ABS
+ - Name: _fini
+ Type: STT_FUNC
+ Section: .fini
+ Value: 0x1134
+ - Name: __dso_handle
+ Type: STT_OBJECT
+ Section: .data
+ Value: 0x4020
+ - Name: _DYNAMIC
+ Type: STT_OBJECT
+ Section: .dynamic
+ Value: 0x3E20
+ - Name: __GNU_EH_FRAME_HDR
+ Section: .eh_frame_hdr
+ Value: 0x2010
+ - Name: __TMC_END__
+ Type: STT_OBJECT
+ Section: .data
+ Value: 0x4028
+ - Name: _GLOBAL_OFFSET_TABLE_
+ Type: STT_OBJECT
+ Section: .got.plt
+ Value: 0x4000
+ - Name: _init
+ Type: STT_FUNC
+ Section: .init
+ Value: 0x1000
+ - Name: _ITM_deregisterTMCloneTable
+ Binding: STB_WEAK
+ - Name: 'puts@GLIBC_2.2.5'
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: __gmon_start__
+ Binding: STB_WEAK
+ - Name: sayZ
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ Value: 0x1119
+ Size: 0x1A
+ - Name: _ITM_registerTMCloneTable
+ Binding: STB_WEAK
+ - Name: '__cxa_finalize@GLIBC_2.2.5'
+ Type: STT_FUNC
+ Binding: STB_WEAK
+DynamicSymbols:
+ - Name: _ITM_deregisterTMCloneTable
+ Binding: STB_WEAK
+ - Name: puts
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: __gmon_start__
+ Binding: STB_WEAK
+ - Name: _ITM_registerTMCloneTable
+ Binding: STB_WEAK
+ - Name: __cxa_finalize
+ Type: STT_FUNC
+ Binding: STB_WEAK
+ - Name: sayZ
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ Value: 0x1119
+ Size: 0x1A
+...
diff --git a/llvm/unittests/ExecutionEngine/Orc/Inputs/Z/Z_macho.yaml b/llvm/unittests/ExecutionEngine/Orc/Inputs/Z/Z_macho.yaml
new file mode 100644
index 0000000..c0c1826
--- /dev/null
+++ b/llvm/unittests/ExecutionEngine/Orc/Inputs/Z/Z_macho.yaml
@@ -0,0 +1,723 @@
+--- !fat-mach-o
+FatHeader:
+ magic: 0xCAFEBABE
+ nfat_arch: 3
+FatArchs:
+ - cputype: 0x1000007
+ cpusubtype: 0x3
+ offset: 0x1000
+ size: 8376
+ align: 12
+ - cputype: 0x100000C
+ cpusubtype: 0x0
+ offset: 0x4000
+ size: 33376
+ align: 14
+ - cputype: 0x100000C
+ cpusubtype: 0x80000002
+ offset: 0x10000
+ size: 33376
+ align: 14
+Slices:
+ - !mach-o
+ FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x1000007
+ cpusubtype: 0x3
+ filetype: 0x6
+ ncmds: 14
+ sizeofcmds: 960
+ flags: 0x100085
+ reserved: 0x0
+ LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 392
+ segname: __TEXT
+ vmaddr: 0
+ vmsize: 4096
+ fileoff: 0
+ filesize: 4096
+ maxprot: 5
+ initprot: 5
+ nsects: 4
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0xF80
+ size: 20
+ offset: 0xF80
+ align: 4
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 554889E5488D3D0F000000B000E8020000005DC3
+ - sectname: __stubs
+ segname: __TEXT
+ addr: 0xF94
+ size: 6
+ offset: 0xF94
+ align: 1
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000408
+ reserved1: 0x0
+ reserved2: 0x6
+ reserved3: 0x0
+ content: FF2566000000
+ - sectname: __cstring
+ segname: __TEXT
+ addr: 0xF9A
+ size: 14
+ offset: 0xF9A
+ align: 0
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x2
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 48656C6C6F2066726F6D205A0A00
+ - sectname: __unwind_info
+ segname: __TEXT
+ addr: 0xFA8
+ size: 88
+ offset: 0xFA8
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 010000001C000000000000001C000000000000001C00000002000000800F00004000000040000000940F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000100000000
+ - cmd: LC_SEGMENT_64
+ cmdsize: 152
+ segname: __DATA_CONST
+ vmaddr: 4096
+ vmsize: 4096
+ fileoff: 4096
+ filesize: 4096
+ maxprot: 3
+ initprot: 3
+ nsects: 1
+ flags: 16
+ Sections:
+ - sectname: __got
+ segname: __DATA_CONST
+ addr: 0x1000
+ size: 8
+ offset: 0x1000
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x6
+ reserved1: 0x1
+ reserved2: 0x0
+ reserved3: 0x0
+ content: '0000000000000080'
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __LINKEDIT
+ vmaddr: 8192
+ vmsize: 4096
+ fileoff: 8192
+ filesize: 184
+ maxprot: 1
+ initprot: 1
+ nsects: 0
+ flags: 0
+ - cmd: LC_ID_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 1
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libZ.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_DYLD_CHAINED_FIXUPS
+ cmdsize: 16
+ dataoff: 8192
+ datasize: 96
+ - cmd: LC_DYLD_EXPORTS_TRIE
+ cmdsize: 16
+ dataoff: 8288
+ datasize: 24
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 8320
+ nsyms: 2
+ stroff: 8360
+ strsize: 16
+ - cmd: LC_DYSYMTAB
+ cmdsize: 80
+ ilocalsym: 0
+ nlocalsym: 0
+ iextdefsym: 0
+ nextdefsym: 1
+ iundefsym: 1
+ nundefsym: 1
+ tocoff: 0
+ ntoc: 0
+ modtaboff: 0
+ nmodtab: 0
+ extrefsymoff: 0
+ nextrefsyms: 0
+ indirectsymoff: 8352
+ nindirectsyms: 2
+ extreloff: 0
+ nextrel: 0
+ locreloff: 0
+ nlocrel: 0
+ - cmd: LC_UUID
+ cmdsize: 24
+ uuid: 399E203C-FF9A-3B80-872C-85F3A759A78B
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 1
+ minos: 983040
+ sdk: 983552
+ ntools: 1
+ Tools:
+ - tool: 3
+ version: 73074435
+ - cmd: LC_SOURCE_VERSION
+ cmdsize: 16
+ version: 0
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 88539136
+ compatibility_version: 65536
+ Content: '/usr/lib/libSystem.B.dylib'
+ ZeroPadBytes: 6
+ - cmd: LC_FUNCTION_STARTS
+ cmdsize: 16
+ dataoff: 8312
+ datasize: 8
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 8320
+ datasize: 0
+ LinkEditData:
+ ExportTrie:
+ TerminalSize: 0
+ NodeOffset: 0
+ Name: ''
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 13
+ Name: _sayZ
+ Flags: 0x0
+ Address: 0xF80
+ Other: 0x0
+ ImportName: ''
+ NameList:
+ - n_strx: 2
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 3968
+ - n_strx: 8
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 256
+ n_value: 0
+ StringTable:
+ - ' '
+ - _sayZ
+ - _printf
+ IndirectSymbols: [ 0x1, 0x1 ]
+ FunctionStarts: [ 0xF80 ]
+ ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48,
+ 0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0,
+ 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x10, 0x6, 0x0,
+ 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x70, 0x72,
+ 0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0 ]
+ - !mach-o
+ FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x100000C
+ cpusubtype: 0x0
+ filetype: 0x6
+ ncmds: 15
+ sizeofcmds: 976
+ flags: 0x100085
+ reserved: 0x0
+ LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 392
+ segname: __TEXT
+ vmaddr: 0
+ vmsize: 16384
+ fileoff: 0
+ filesize: 16384
+ maxprot: 5
+ initprot: 5
+ nsects: 4
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0x3F70
+ size: 28
+ offset: 0x3F70
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: FD7BBFA9FD0300910000009000603E9103000094FD7BC1A8C0035FD6
+ - sectname: __stubs
+ segname: __TEXT
+ addr: 0x3F8C
+ size: 12
+ offset: 0x3F8C
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000408
+ reserved1: 0x0
+ reserved2: 0xC
+ reserved3: 0x0
+ content: 100000B0100240F900021FD6
+ - sectname: __cstring
+ segname: __TEXT
+ addr: 0x3F98
+ size: 14
+ offset: 0x3F98
+ align: 0
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x2
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 48656C6C6F2066726F6D205A0A00
+ - sectname: __unwind_info
+ segname: __TEXT
+ addr: 0x3FA8
+ size: 88
+ offset: 0x3FA8
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 010000001C000000000000001C000000000000001C00000002000000703F000040000000400000008C3F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000400000000
+ - cmd: LC_SEGMENT_64
+ cmdsize: 152
+ segname: __DATA_CONST
+ vmaddr: 16384
+ vmsize: 16384
+ fileoff: 16384
+ filesize: 16384
+ maxprot: 3
+ initprot: 3
+ nsects: 1
+ flags: 16
+ Sections:
+ - sectname: __got
+ segname: __DATA_CONST
+ addr: 0x4000
+ size: 8
+ offset: 0x4000
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x6
+ reserved1: 0x1
+ reserved2: 0x0
+ reserved3: 0x0
+ content: '0000000000000080'
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __LINKEDIT
+ vmaddr: 32768
+ vmsize: 16384
+ fileoff: 32768
+ filesize: 608
+ maxprot: 1
+ initprot: 1
+ nsects: 0
+ flags: 0
+ - cmd: LC_ID_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 1
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libZ.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_DYLD_CHAINED_FIXUPS
+ cmdsize: 16
+ dataoff: 32768
+ datasize: 96
+ - cmd: LC_DYLD_EXPORTS_TRIE
+ cmdsize: 16
+ dataoff: 32864
+ datasize: 24
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 32896
+ nsyms: 2
+ stroff: 32936
+ strsize: 16
+ - cmd: LC_DYSYMTAB
+ cmdsize: 80
+ ilocalsym: 0
+ nlocalsym: 0
+ iextdefsym: 0
+ nextdefsym: 1
+ iundefsym: 1
+ nundefsym: 1
+ tocoff: 0
+ ntoc: 0
+ modtaboff: 0
+ nmodtab: 0
+ extrefsymoff: 0
+ nextrefsyms: 0
+ indirectsymoff: 32928
+ nindirectsyms: 2
+ extreloff: 0
+ nextrel: 0
+ locreloff: 0
+ nlocrel: 0
+ - cmd: LC_UUID
+ cmdsize: 24
+ uuid: 6E8E78AF-EDB2-3830-BE1E-013390302CC5
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 1
+ minos: 983040
+ sdk: 983552
+ ntools: 1
+ Tools:
+ - tool: 3
+ version: 73074435
+ - cmd: LC_SOURCE_VERSION
+ cmdsize: 16
+ version: 0
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 88539136
+ compatibility_version: 65536
+ Content: '/usr/lib/libSystem.B.dylib'
+ ZeroPadBytes: 6
+ - cmd: LC_FUNCTION_STARTS
+ cmdsize: 16
+ dataoff: 32888
+ datasize: 8
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 32896
+ datasize: 0
+ - cmd: LC_CODE_SIGNATURE
+ cmdsize: 16
+ dataoff: 32960
+ datasize: 416
+ LinkEditData:
+ ExportTrie:
+ TerminalSize: 0
+ NodeOffset: 0
+ Name: ''
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 13
+ Name: _sayZ
+ Flags: 0x0
+ Address: 0x3F70
+ Other: 0x0
+ ImportName: ''
+ NameList:
+ - n_strx: 2
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 16240
+ - n_strx: 8
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 256
+ n_value: 0
+ StringTable:
+ - ' '
+ - _sayZ
+ - _printf
+ IndirectSymbols: [ 0x1, 0x1 ]
+ FunctionStarts: [ 0x3F70 ]
+ ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48,
+ 0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0,
+ 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x40, 0x6, 0x0,
+ 0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x70, 0x72,
+ 0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0 ]
+ - !mach-o
+ FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x100000C
+ cpusubtype: 0x80000002
+ filetype: 0x6
+ ncmds: 15
+ sizeofcmds: 976
+ flags: 0x100085
+ reserved: 0x0
+ LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 392
+ segname: __TEXT
+ vmaddr: 0
+ vmsize: 16384
+ fileoff: 0
+ filesize: 16384
+ maxprot: 5
+ initprot: 5
+ nsects: 4
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0x3F68
+ size: 32
+ offset: 0x3F68
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 7F2303D5FD7BBFA9FD0300910000009000603E9103000094FD7BC1A8FF0F5FD6
+ - sectname: __auth_stubs
+ segname: __TEXT
+ addr: 0x3F88
+ size: 16
+ offset: 0x3F88
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000408
+ reserved1: 0x0
+ reserved2: 0x10
+ reserved3: 0x0
+ content: 110000B031020091300240F9110A1FD7
+ - sectname: __cstring
+ segname: __TEXT
+ addr: 0x3F98
+ size: 14
+ offset: 0x3F98
+ align: 0
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x2
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 48656C6C6F2066726F6D205A0A00
+ - sectname: __unwind_info
+ segname: __TEXT
+ addr: 0x3FA8
+ size: 88
+ offset: 0x3FA8
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 010000001C000000000000001C000000000000001C00000002000000683F00004000000040000000883F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000400000000
+ - cmd: LC_SEGMENT_64
+ cmdsize: 152
+ segname: __DATA_CONST
+ vmaddr: 16384
+ vmsize: 16384
+ fileoff: 16384
+ filesize: 16384
+ maxprot: 3
+ initprot: 3
+ nsects: 1
+ flags: 16
+ Sections:
+ - sectname: __auth_got
+ segname: __DATA_CONST
+ addr: 0x4000
+ size: 8
+ offset: 0x4000
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x6
+ reserved1: 0x1
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 00000000000001C0
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __LINKEDIT
+ vmaddr: 32768
+ vmsize: 16384
+ fileoff: 32768
+ filesize: 608
+ maxprot: 1
+ initprot: 1
+ nsects: 0
+ flags: 0
+ - cmd: LC_ID_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 1
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libZ.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_DYLD_CHAINED_FIXUPS
+ cmdsize: 16
+ dataoff: 32768
+ datasize: 96
+ - cmd: LC_DYLD_EXPORTS_TRIE
+ cmdsize: 16
+ dataoff: 32864
+ datasize: 24
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 32896
+ nsyms: 2
+ stroff: 32936
+ strsize: 16
+ - cmd: LC_DYSYMTAB
+ cmdsize: 80
+ ilocalsym: 0
+ nlocalsym: 0
+ iextdefsym: 0
+ nextdefsym: 1
+ iundefsym: 1
+ nundefsym: 1
+ tocoff: 0
+ ntoc: 0
+ modtaboff: 0
+ nmodtab: 0
+ extrefsymoff: 0
+ nextrefsyms: 0
+ indirectsymoff: 32928
+ nindirectsyms: 2
+ extreloff: 0
+ nextrel: 0
+ locreloff: 0
+ nlocrel: 0
+ - cmd: LC_UUID
+ cmdsize: 24
+ uuid: E74F368D-238F-31FA-BF40-FA2964FED986
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 1
+ minos: 983040
+ sdk: 983552
+ ntools: 1
+ Tools:
+ - tool: 3
+ version: 73074435
+ - cmd: LC_SOURCE_VERSION
+ cmdsize: 16
+ version: 0
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 88539136
+ compatibility_version: 65536
+ Content: '/usr/lib/libSystem.B.dylib'
+ ZeroPadBytes: 6
+ - cmd: LC_FUNCTION_STARTS
+ cmdsize: 16
+ dataoff: 32888
+ datasize: 8
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 32896
+ datasize: 0
+ - cmd: LC_CODE_SIGNATURE
+ cmdsize: 16
+ dataoff: 32960
+ datasize: 416
+ LinkEditData:
+ ExportTrie:
+ TerminalSize: 0
+ NodeOffset: 0
+ Name: ''
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 13
+ Name: _sayZ
+ Flags: 0x0
+ Address: 0x3F68
+ Other: 0x0
+ ImportName: ''
+ NameList:
+ - n_strx: 2
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 16232
+ - n_strx: 8
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 256
+ n_value: 0
+ StringTable:
+ - ' '
+ - _sayZ
+ - _printf
+ IndirectSymbols: [ 0x1, 0x1 ]
+ FunctionStarts: [ 0x3F68 ]
+ ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48,
+ 0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0,
+ 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x40, 0xC, 0x0,
+ 0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x70, 0x72,
+ 0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0 ]
+...
diff --git a/llvm/unittests/ExecutionEngine/Orc/LibraryResolverTest.cpp b/llvm/unittests/ExecutionEngine/Orc/LibraryResolverTest.cpp
new file mode 100644
index 0000000..2a396da
--- /dev/null
+++ b/llvm/unittests/ExecutionEngine/Orc/LibraryResolverTest.cpp
@@ -0,0 +1,764 @@
+//===- LibraryResolverTest.cpp - Unit tests for LibraryResolver -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h"
+#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
+#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h"
+#include "llvm/ObjectYAML/MachOYAML.h"
+#include "llvm/ObjectYAML/yaml2obj.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/YAMLParser.h"
+#include "llvm/Support/YAMLTraits.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include "llvm/Testing/Support/SupportHelpers.h"
+
+#include "gtest/gtest.h"
+
+#include <algorithm>
+#include <optional>
+#include <string>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::orc;
+
+// Disabled due to test setup issue — YAML to shared library creation seems
+// invalid on some build bots. (PR #165360) Not related to code logic.
+#if 0
+// TODO: Add COFF (Windows) support for these tests.
+// this facility also works correctly on Windows (COFF),
+// so we should eventually enable and run these tests for that platform as well.
+namespace {
+
+#if defined(__APPLE__)
+constexpr const char *ext = ".dylib";
+#elif defined(_WIN32)
+constexpr const char *ext = ".dll";
+#else
+constexpr const char *ext = ".so";
+#endif
+
+bool EnvReady = false;
+
+Triple getTargetTriple() {
+ auto JTMB = JITTargetMachineBuilder::detectHost();
+ if (!JTMB) {
+ consumeError(JTMB.takeError());
+ return Triple();
+ }
+ return JTMB->getTargetTriple();
+}
+
+static bool CheckHostSupport() {
+ auto Triple = getTargetTriple();
+ // TODO: Extend support to COFF (Windows) once test setup and YAML conversion
+ // are verified.
+ if (!Triple.isOSBinFormatMachO() &&
+ !(Triple.isOSBinFormatELF() && Triple.getArch() == Triple::x86_64))
+ return false;
+
+ return true;
+}
+
+std::string getYamlFilePlatformExt() {
+ auto Triple = getTargetTriple();
+ if (Triple.isOSBinFormatMachO())
+ return "_macho";
+ else if (Triple.isOSBinFormatELF())
+ return "_linux";
+
+ return "";
+}
+
+unsigned getYamlDocNum() {
+ // auto Triple = getTargetTriple();
+ // if (Triple.isOSBinFormatELF())
+ // return 1;
+
+ return 1;
+}
+
+class LibraryTestEnvironment : public ::testing::Environment {
+ std::vector<std::string> CreatedDylibsDir;
+ std::vector<std::string> CreatedDylibs;
+ SmallVector<char, 128> DirPath;
+
+public:
+ void SetUp() override {
+ if (!CheckHostSupport()) {
+ EnvReady = false;
+ return;
+ }
+
+ StringRef ThisFile = __FILE__;
+ SmallVector<char, 128> InputDirPath(ThisFile.begin(), ThisFile.end());
+ sys::path::remove_filename(InputDirPath);
+ sys::path::append(InputDirPath, "Inputs");
+ if (!sys::fs::exists(InputDirPath))
+ return;
+
+ SmallString<128> UniqueDir;
+ sys::path::append(UniqueDir, InputDirPath);
+ std::error_code EC = sys::fs::createUniqueDirectory(UniqueDir, DirPath);
+
+ if (EC)
+ return;
+
+ // given yamlPath + DylibPath, validate + convert
+ auto processYamlToDylib = [&](const SmallVector<char, 128> &YamlPath,
+ const SmallVector<char, 128> &DylibPath,
+ unsigned DocNum) -> bool {
+ if (!sys::fs::exists(YamlPath)) {
+ errs() << "YAML file missing: "
+ << StringRef(YamlPath.data(), YamlPath.size()) << "\n";
+ EnvReady = false;
+ return false;
+ }
+
+ auto BufOrErr = MemoryBuffer::getFile(YamlPath);
+ if (!BufOrErr) {
+ errs() << "Failed to read "
+ << StringRef(YamlPath.data(), YamlPath.size()) << ": "
+ << BufOrErr.getError().message() << "\n";
+ EnvReady = false;
+ return false;
+ }
+
+ yaml::Input yin(BufOrErr->get()->getBuffer());
+ std::error_code EC;
+ raw_fd_ostream outFile(StringRef(DylibPath.data(), DylibPath.size()), EC,
+ sys::fs::OF_None);
+
+ if (EC) {
+ errs() << "Failed to open "
+ << StringRef(DylibPath.data(), DylibPath.size())
+ << " for writing: " << EC.message() << "\n";
+ EnvReady = false;
+ return false;
+ }
+
+ if (!yaml::convertYAML(
+ yin, outFile,
+ [](const Twine &M) {
+ // Handle or ignore errors here
+ errs() << "Yaml Error :" << M << "\n";
+ },
+ DocNum)) {
+ errs() << "Failed to convert "
+ << StringRef(YamlPath.data(), YamlPath.size()) << " to "
+ << StringRef(DylibPath.data(), DylibPath.size()) << "\n";
+ EnvReady = false;
+ return false;
+ }
+
+ CreatedDylibsDir.push_back(std::string(sys::path::parent_path(
+ StringRef(DylibPath.data(), DylibPath.size()))));
+ CreatedDylibs.push_back(std::string(DylibPath.begin(), DylibPath.end()));
+ return true;
+ };
+
+ std::vector<const char *> LibDirs = {"Z", "A", "B", "C"};
+
+ unsigned DocNum = getYamlDocNum();
+ std::string YamlPltExt = getYamlFilePlatformExt();
+ for (const auto &LibdirName : LibDirs) {
+ // YAML path
+ SmallVector<char, 128> YamlPath(InputDirPath.begin(), InputDirPath.end());
+ SmallVector<char, 128> YamlFileName;
+ YamlFileName.append(LibdirName, LibdirName + strlen(LibdirName));
+ YamlFileName.append(YamlPltExt.begin(), YamlPltExt.end());
+ sys::path::append(YamlPath, LibdirName, YamlFileName);
+ sys::path::replace_extension(YamlPath, ".yaml");
+
+ // dylib path
+ SmallVector<char, 128> DylibPath(DirPath.begin(), DirPath.end());
+ SmallVector<char, 128> DylibFileName;
+ StringRef prefix("lib");
+ DylibFileName.append(prefix.begin(), prefix.end());
+ DylibFileName.append(LibdirName, LibdirName + strlen(LibdirName));
+
+ sys::path::append(DylibPath, LibdirName);
+ if (!sys::fs::exists(DylibPath)) {
+ auto EC = sys::fs::create_directory(DylibPath);
+ if (EC)
+ return;
+ }
+ sys::path::append(DylibPath, DylibFileName);
+ sys::path::replace_extension(DylibPath, ext);
+ if (!processYamlToDylib(YamlPath, DylibPath, DocNum))
+ return;
+ }
+
+ EnvReady = true;
+ }
+
+ void TearDown() override { sys::fs::remove_directories(DirPath); }
+
+ std::string getBaseDir() const {
+ return std::string(DirPath.begin(), DirPath.end());
+ }
+
+ std::vector<std::string> getDylibPaths() const { return CreatedDylibs; }
+};
+
+static LibraryTestEnvironment *GlobalEnv =
+ static_cast<LibraryTestEnvironment *>(
+ ::testing::AddGlobalTestEnvironment(new LibraryTestEnvironment()));
+
+inline std::string libPath(const std::string &BaseDir,
+ const std::string &name) {
+#if defined(__APPLE__)
+ return BaseDir + "/" + name + ".dylib";
+#elif defined(_WIN32)
+ return BaseDir + "/" + name + ".dll";
+#else
+ return BaseDir + "/" + name + ".so";
+#endif
+}
+
+inline std::string withext(const std::string &lib) {
+ SmallString<128> P(lib);
+ sys::path::replace_extension(P, ext);
+ return P.str().str();
+}
+
+inline std::string platformSymbolName(const std::string &name) {
+#if defined(__APPLE__)
+ return "_" + name; // macOS prepends underscore
+#else
+ return name;
+#endif
+}
+
+struct TestLibrary {
+ std::string path;
+ std::vector<std::string> Syms;
+};
+
+class LibraryResolverIT : public ::testing::Test {
+protected:
+ std::string BaseDir;
+ std::unordered_map<std::string, TestLibrary> libs;
+
+ void addLib(const std::string &name) {
+ SmallString<512> path;
+ std::error_code EC =
+ sys::fs::real_path(libPath(BaseDir, name + "/lib" + name), path);
+ if (EC || path.empty() || !sys::fs::exists(path))
+ GTEST_SKIP();
+ libs[name] = {path.str().str(), {platformSymbolName("say" + name)}};
+ }
+
+ void SetUp() override {
+ if (!EnvReady || GlobalEnv == nullptr)
+ GTEST_SKIP() << "Skipping test: environment setup failed.";
+
+ {
+ SmallString<512> path;
+ std::error_code EC = sys::fs::real_path(GlobalEnv->getBaseDir(), path);
+ if (path.empty() || EC)
+ GTEST_SKIP() << "Base directory resolution failed: " << EC.message();
+ BaseDir = path.str().str();
+ }
+
+ for (const auto &P : GlobalEnv->getDylibPaths()) {
+ if (!sys::fs::exists(P))
+ GTEST_SKIP() << "Missing dylib path: " << P;
+ }
+
+ const std::vector<std::string> libNames = {"A", "B", "C", "Z"};
+ for (const auto &name : libNames)
+ addLib(name);
+
+ if (!EnvReady)
+ GTEST_SKIP() << "Skipping test: environment setup failed.";
+ }
+
+ const std::vector<std::string> &sym(const std::string &key) {
+ return libs[key].Syms;
+ }
+ const std::string &lib(const std::string &key) { return libs[key].path; }
+ const std::string libdir(const std::string &key) {
+ SmallString<512> P(libs[key].path);
+ sys::path::remove_filename(P);
+ return P.str().str();
+ }
+ const std::string libname(const std::string &key) {
+ return sys::path::filename(libs[key].path).str();
+ }
+};
+
+// Helper: allow either "sayA" or "_sayA" depending on how your
+// SymbolEnumerator reports.
+static bool matchesEitherUnderscore(const std::string &got,
+ const std::string &bare) {
+ return got == bare || got == ("_" + bare);
+}
+
+// Helper: normalize path ending check (we only care that it resolved to the
+// right dylib)
+static bool endsWith(const std::string &s, const std::string &suffix) {
+ if (s.size() < suffix.size())
+ return false;
+ return std::equal(suffix.rbegin(), suffix.rend(), s.rbegin());
+}
+
+TEST_F(LibraryResolverIT, EnumerateSymbols_ExportsOnly_DefaultFlags) {
+ const std::string libC = lib("C");
+ SymbolEnumeratorOptions Opts = SymbolEnumeratorOptions::defaultOptions();
+
+ std::vector<std::string> seen;
+ auto onEach = [&](llvm::StringRef sym) -> EnumerateResult {
+ seen.emplace_back(sym.str());
+ return EnumerateResult::Continue;
+ };
+
+ ASSERT_TRUE(SymbolEnumerator::enumerateSymbols(libC, onEach, Opts));
+
+ // sayC is exported, others are undefined → only sayC expected
+ EXPECT_TRUE(any_of(seen, [&](const std::string &s) {
+ return matchesEitherUnderscore(s, "sayC");
+ }));
+ EXPECT_FALSE(any_of(seen, [&](const std::string &s) {
+ return matchesEitherUnderscore(s, "sayA");
+ }));
+ EXPECT_FALSE(any_of(seen, [&](const std::string &s) {
+ return matchesEitherUnderscore(s, "sayB");
+ }));
+}
+
+TEST_F(LibraryResolverIT, EnumerateSymbols_IncludesUndefineds) {
+ const std::string libC = lib("C");
+
+ SymbolEnumeratorOptions Opts;
+ Opts.FilterFlags =
+ SymbolEnumeratorOptions::IgnoreWeak |
+ SymbolEnumeratorOptions::IgnoreIndirect; // no IgnoreUndefined
+
+ std::vector<std::string> seen;
+ auto onEach = [&](llvm::StringRef sym) -> EnumerateResult {
+ seen.emplace_back(sym.str());
+ return EnumerateResult::Continue;
+ };
+
+ ASSERT_TRUE(SymbolEnumerator::enumerateSymbols(libC, onEach, Opts));
+
+ // Now we should see both sayC (export) and the undefined refs sayA, sayB,
+ // sayZ
+ EXPECT_TRUE(any_of(seen, [&](const std::string &s) {
+ return matchesEitherUnderscore(s, "sayC");
+ }));
+ EXPECT_TRUE(any_of(seen, [&](const std::string &s) {
+ return matchesEitherUnderscore(s, "sayA");
+ }));
+ EXPECT_TRUE(any_of(seen, [&](const std::string &s) {
+ return matchesEitherUnderscore(s, "sayB");
+ }));
+}
+
+// Full resolution via LibraryResolutionDriver/LibraryResolver ---
+TEST_F(LibraryResolverIT, DriverResolvesSymbolsToCorrectLibraries) {
+ // Create the resolver from real base paths (our fixtures dir)
+ auto Stup = LibraryResolver::Setup::create({BaseDir});
+
+ // Full system behavior: no mocks
+ auto Driver = LibraryResolutionDriver::create(Stup);
+ ASSERT_NE(Driver, nullptr);
+
+ // Tell the Driver about the scan path kinds (User/System) as your
+ // production code expects.
+ Driver->addScanPath(libdir("A"), PathType::User);
+ Driver->addScanPath(libdir("B"), PathType::User);
+ Driver->addScanPath(libdir("Z"), PathType::User);
+
+ // Symbols to resolve (bare names; class handles underscore differences
+ // internally)
+ std::vector<std::string> Syms = {platformSymbolName("sayA"),
+ platformSymbolName("sayB"),
+ platformSymbolName("sayZ")};
+
+ bool CallbackRan = false;
+ Driver->resolveSymbols(Syms, [&](SymbolQuery &Q) {
+ CallbackRan = true;
+
+ // sayA should resolve to A.dylib
+ {
+ auto lib = Q.getResolvedLib(platformSymbolName("sayA"));
+ ASSERT_TRUE(lib.has_value()) << "sayA should be resolved";
+ EXPECT_TRUE(endsWith(lib->str(), libname("A")))
+ << "sayA resolved to: " << lib->str();
+ }
+
+ // sayB should resolve to B.dylib
+ {
+ auto lib = Q.getResolvedLib(platformSymbolName("sayB"));
+ ASSERT_TRUE(lib.has_value()) << "sayB should be resolved";
+ EXPECT_TRUE(endsWith(lib->str(), libname("B")))
+ << "sayB resolved to: " << lib->str();
+ }
+
+ // sayZ should resolve to B.dylib
+ {
+ auto lib = Q.getResolvedLib(platformSymbolName("sayZ"));
+ ASSERT_TRUE(lib.has_value()) << "sayZ should be resolved";
+ EXPECT_TRUE(endsWith(lib->str(), libname("Z")))
+ << "sayZ resolved to: " << lib->str();
+ }
+
+ EXPECT_TRUE(Q.allResolved());
+ });
+
+ EXPECT_TRUE(CallbackRan);
+}
+
+// stress SymbolQuery with the real resolve flow
+// And resolve libC dependency libA, libB, libZ ---
+TEST_F(LibraryResolverIT, ResolveManySymbols) {
+ auto Stup = LibraryResolver::Setup::create({BaseDir});
+ auto Driver = LibraryResolutionDriver::create(Stup);
+ ASSERT_NE(Driver, nullptr);
+ Driver->addScanPath(libdir("C"), PathType::User);
+
+ // Many duplicates to provoke concurrent updates inside SymbolQuery
+ std::vector<std::string> Syms = {
+ platformSymbolName("sayA"), platformSymbolName("sayB"),
+ platformSymbolName("sayA"), platformSymbolName("sayB"),
+ platformSymbolName("sayZ"), platformSymbolName("sayZ"),
+ platformSymbolName("sayZ"), platformSymbolName("sayZ"),
+ platformSymbolName("sayA"), platformSymbolName("sayB"),
+ platformSymbolName("sayA"), platformSymbolName("sayB")};
+
+ bool CallbackRan = false;
+ Driver->resolveSymbols(Syms, [&](SymbolQuery &Q) {
+ CallbackRan = true;
+ EXPECT_TRUE(Q.isResolved(platformSymbolName("sayA")));
+ EXPECT_TRUE(Q.isResolved(platformSymbolName("sayB")));
+ EXPECT_TRUE(Q.isResolved(platformSymbolName("sayZ")));
+
+ auto A = Q.getResolvedLib(platformSymbolName("sayA"));
+ auto B = Q.getResolvedLib(platformSymbolName("sayB"));
+ auto Z = Q.getResolvedLib(platformSymbolName("sayZ"));
+ ASSERT_TRUE(A.has_value());
+ ASSERT_TRUE(B.has_value());
+ ASSERT_TRUE(Z.has_value());
+ EXPECT_TRUE(endsWith(A->str(), libname("A")));
+ EXPECT_TRUE(endsWith(B->str(), libname("B")));
+ EXPECT_TRUE(endsWith(Z->str(), libname("Z")));
+ EXPECT_TRUE(Q.allResolved());
+ });
+
+ EXPECT_TRUE(CallbackRan);
+}
+
+TEST_F(LibraryResolverIT, ScanAndResolveDependencyGraph) {
+ auto LibPathCache = std::make_shared<LibraryPathCache>();
+ auto PResolver = std::make_shared<PathResolver>(LibPathCache);
+ LibraryScanHelper ScanH({}, LibPathCache, PResolver);
+
+ ScanH.addBasePath(libdir("C"), PathType::User);
+
+ LibraryManager LibMgr;
+ LibraryScanner Scanner(ScanH, LibMgr);
+
+ Scanner.scanNext(PathType::User, 0);
+
+ size_t numLibs = 0;
+ LibMgr.forEachLibrary([&](const LibraryInfo &L) {
+ numLibs++;
+ return true;
+ });
+
+ EXPECT_GT(numLibs, 0u) << "Expected at least one library scanned";
+
+ // Validate that each scanned library path is resolvable
+ std::error_code EC;
+ LibMgr.forEachLibrary([&](const LibraryInfo &L) {
+ auto R = PResolver->resolve(L.getFullPath(), EC);
+ EXPECT_TRUE(R.has_value());
+ EXPECT_FALSE(EC);
+ return true;
+ });
+}
+
+TEST_F(LibraryResolverIT, ScanEmptyPath) {
+ auto LibPathCache = std::make_shared<LibraryPathCache>();
+ auto PResolver = std::make_shared<PathResolver>(LibPathCache);
+ LibraryScanHelper ScanH({}, LibPathCache, PResolver);
+
+ ScanH.addBasePath("/tmp/empty", PathType::User);
+
+ LibraryManager LibMgr;
+ LibraryScanner Scanner(ScanH, LibMgr);
+
+ Scanner.scanNext(PathType::User, 0);
+
+ size_t count = 0;
+ LibMgr.forEachLibrary([&](const LibraryInfo &) {
+ count++;
+ return true;
+ });
+ EXPECT_EQ(count, 0u);
+}
+
+TEST_F(LibraryResolverIT, PathResolverResolvesKnownPaths) {
+ auto LibPathCache = std::make_shared<LibraryPathCache>();
+ auto PResolver = std::make_shared<PathResolver>(LibPathCache);
+
+ std::error_code EC;
+ auto Missing = PResolver->resolve("temp/foo/bar", EC);
+ EXPECT_FALSE(Missing.has_value()) << "Unexpectedly resolved a bogus path";
+ EXPECT_TRUE(EC) << "Expected error resolving path";
+
+ auto DirPath = PResolver->resolve(BaseDir, EC);
+ ASSERT_TRUE(DirPath.has_value());
+ EXPECT_FALSE(EC) << "Expected no error resolving path";
+ EXPECT_EQ(*DirPath, BaseDir);
+
+ auto DylibPath = PResolver->resolve(lib("C"), EC);
+ ASSERT_TRUE(DylibPath.has_value());
+ EXPECT_FALSE(EC) << "Expected no error resolving path";
+ EXPECT_EQ(*DylibPath, lib("C"));
+}
+
+TEST_F(LibraryResolverIT, PathResolverNormalizesDotAndDotDot) {
+ auto LibPathCache = std::make_shared<LibraryPathCache>();
+ auto PResolver = std::make_shared<PathResolver>(LibPathCache);
+
+ std::error_code EC;
+
+ // e.g. BaseDir + "/./C/../C/C.dylib" → BaseDir + "/C.dylib"
+ std::string Messy = BaseDir + "/C/./../C/./libC" + ext;
+ auto Resolved = PResolver->resolve(Messy, EC);
+ ASSERT_TRUE(Resolved.has_value());
+ EXPECT_FALSE(EC);
+ EXPECT_EQ(*Resolved, lib("C")) << "Expected realpath to collapse . and ..";
+}
+
+#if !defined(_WIN32)
+TEST_F(LibraryResolverIT, PathResolverFollowsSymlinks) {
+ auto LibPathCache = std::make_shared<LibraryPathCache>();
+ auto PResolver = std::make_shared<PathResolver>(LibPathCache);
+
+ std::error_code EC;
+
+ // Create a symlink temp -> BaseDir (only if filesystem allows it)
+ std::string linkName = BaseDir + withext("/link_to_C");
+ std::string target = lib("C");
+ if (::symlink(target.c_str(), linkName.c_str()) != 0)
+ GTEST_SKIP() << "Failed to create symlink: " << strerror(errno);
+
+ auto resolved = PResolver->resolve(linkName, EC);
+ ASSERT_TRUE(resolved.has_value());
+ EXPECT_FALSE(EC);
+ EXPECT_EQ(*resolved, target);
+
+ (void)::unlink(linkName.c_str()); // cleanup
+}
+
+TEST_F(LibraryResolverIT, PathResolverCachesResults) {
+ auto LibPathCache = std::make_shared<LibraryPathCache>();
+ auto PResolver = std::make_shared<PathResolver>(LibPathCache);
+
+ SmallString<128> TmpDylib;
+ std::error_code EC;
+ EC = sys::fs::createUniqueFile(withext("A-copy"), TmpDylib);
+ if (EC)
+ GTEST_SKIP() << "Failed to create temp dylib" << EC.message();
+
+ EC = sys::fs::copy_file(lib("A"), TmpDylib);
+ if (EC)
+ GTEST_SKIP() << "Failed to copy libA: " << EC.message();
+ EC.clear();
+
+ // First resolve -> should populate LibPathCache
+ auto first = PResolver->resolve(TmpDylib, EC);
+ ASSERT_TRUE(first.has_value());
+
+ // Forcefully remove the file from disk
+ (void)::unlink(TmpDylib.c_str());
+
+ // Second resolve -> should still succeed from LibPathCache
+ auto second = PResolver->resolve(TmpDylib, EC);
+ EXPECT_TRUE(second.has_value());
+ EXPECT_EQ(*second, *first);
+}
+#endif
+
+TEST_F(LibraryResolverIT, LoaderPathSubstitutionAndResolve) {
+ auto LibPathCache = std::make_shared<LibraryPathCache>();
+ auto PResolver = std::make_shared<PathResolver>(LibPathCache);
+
+ DylibSubstitutor substitutor;
+ substitutor.configure(libdir("C"));
+#if defined(__APPLE__)
+ // Substitute @loader_path with BaseDir
+ std::string substituted =
+ substitutor.substitute(withext("@loader_path/libC"));
+#elif defined(__linux__)
+ // Substitute $origin with BaseDir
+ std::string substituted = substitutor.substitute(withext("$ORIGIN/libC"));
+#endif
+ ASSERT_FALSE(substituted.empty());
+ EXPECT_EQ(substituted, lib("C"));
+
+ // Now try resolving the substituted path
+ std::error_code EC;
+ auto resolved = PResolver->resolve(substituted, EC);
+ ASSERT_TRUE(resolved.has_value()) << "Expected to resolve substituted dylib";
+ EXPECT_EQ(*resolved, lib("C"));
+ EXPECT_FALSE(EC) << "Expected no error resolving substituted dylib";
+}
+
+TEST_F(LibraryResolverIT, ResolveFromUsrOrSystemPaths) {
+ auto LibPathCache = std::make_shared<LibraryPathCache>();
+ auto PResolver = std::make_shared<PathResolver>(LibPathCache);
+
+ DylibPathValidator validator(*PResolver);
+
+ std::vector<std::string> Paths = {"/foo/bar/", "temp/foo", libdir("C"),
+ libdir("A"), libdir("B"), libdir("Z")};
+
+ SmallVector<StringRef> P(Paths.begin(), Paths.end());
+
+ DylibResolver Resolver(validator);
+ Resolver.configure("", {{P, SearchPathType::UsrOrSys}});
+
+ // Check "C"
+ auto ValOptC = Resolver.resolve("libC", true);
+ EXPECT_TRUE(ValOptC.has_value());
+ EXPECT_EQ(*ValOptC, lib("C"));
+
+ auto ValOptCdylib = Resolver.resolve(withext("libC"));
+ EXPECT_TRUE(ValOptCdylib.has_value());
+ EXPECT_EQ(*ValOptCdylib, lib("C"));
+
+ // Check "A"
+ auto ValOptA = Resolver.resolve("libA", true);
+ EXPECT_TRUE(ValOptA.has_value());
+ EXPECT_EQ(*ValOptA, lib("A"));
+
+ auto ValOptAdylib = Resolver.resolve(withext("libA"));
+ EXPECT_TRUE(ValOptAdylib.has_value());
+ EXPECT_EQ(*ValOptAdylib, lib("A"));
+
+ // Check "B"
+ auto ValOptB = Resolver.resolve("libB", true);
+ EXPECT_TRUE(ValOptB.has_value());
+ EXPECT_EQ(*ValOptB, lib("B"));
+
+ auto ValOptBdylib = Resolver.resolve(withext("libB"));
+ EXPECT_TRUE(ValOptBdylib.has_value());
+ EXPECT_EQ(*ValOptBdylib, lib("B"));
+
+ // Check "Z"
+ auto ValOptZ = Resolver.resolve("libZ", true);
+ EXPECT_TRUE(ValOptZ.has_value());
+ EXPECT_EQ(*ValOptZ, lib("Z"));
+
+ auto ValOptZdylib = Resolver.resolve(withext("libZ"));
+ EXPECT_TRUE(ValOptZdylib.has_value());
+ EXPECT_EQ(*ValOptZdylib, lib("Z"));
+}
+
+#if defined(__APPLE__)
+TEST_F(LibraryResolverIT, ResolveViaLoaderPathAndRPathSubstitution) {
+ auto LibPathCache = std::make_shared<LibraryPathCache>();
+ auto PResolver = std::make_shared<PathResolver>(LibPathCache);
+
+ DylibPathValidator validator(*PResolver);
+
+ std::vector<std::string> Paths = {"@loader_path/../A", "@loader_path/../B",
+ "@loader_path/../C", "@loader_path/../Z"};
+
+ SmallVector<StringRef> P(Paths.begin(), Paths.end());
+
+ DylibResolver Resolver(validator);
+
+ // Use only RPath config
+ Resolver.configure(lib("C"), {{P, SearchPathType::RPath}});
+
+ // --- Check A ---
+ auto ValOptA = Resolver.resolve("@rpath/libA", true);
+ EXPECT_TRUE(ValOptA.has_value());
+ EXPECT_EQ(*ValOptA, lib("A"));
+
+ auto ValOptAdylib = Resolver.resolve(withext("@rpath/libA"));
+ EXPECT_TRUE(ValOptAdylib.has_value());
+ EXPECT_EQ(*ValOptAdylib, lib("A"));
+
+ // --- Check B ---
+ auto ValOptB = Resolver.resolve("@rpath/libB", true);
+ EXPECT_TRUE(ValOptB.has_value());
+ EXPECT_EQ(*ValOptB, lib("B"));
+
+ auto ValOptBdylib = Resolver.resolve(withext("@rpath/libB"));
+ EXPECT_TRUE(ValOptBdylib.has_value());
+ EXPECT_EQ(*ValOptBdylib, lib("B"));
+
+ // --- Check Z ---
+ auto ValOptZ = Resolver.resolve("@rpath/libZ", true);
+ EXPECT_TRUE(ValOptZ.has_value());
+ EXPECT_EQ(*ValOptZ, lib("Z"));
+
+ auto ValOptZdylib = Resolver.resolve(withext("@rpath/libZ"));
+ EXPECT_TRUE(ValOptZdylib.has_value());
+ EXPECT_EQ(*ValOptZdylib, lib("Z"));
+}
+#endif
+
+#if defined(__linux__)
+TEST_F(LibraryResolverIT, ResolveViaOriginAndRPathSubstitution) {
+ auto LibPathCache = std::make_shared<LibraryPathCache>();
+ auto PResolver = std::make_shared<PathResolver>(LibPathCache);
+
+ DylibPathValidator validator(*PResolver);
+
+ // On Linux, $ORIGIN works like @loader_path
+ std::vector<std::string> Paths = {"$ORIGIN/../A", "$ORIGIN/../B",
+ "$ORIGIN/../C", "$ORIGIN/../Z"};
+
+ SmallVector<StringRef> P(Paths.begin(), Paths.end());
+
+ DylibResolver Resolver(validator);
+
+ // Use only RPath config
+ Resolver.configure(lib("C"), {{P, SearchPathType::RunPath}});
+
+ // --- Check A ---
+ auto ValOptA = Resolver.resolve("libA", true);
+ EXPECT_TRUE(ValOptA.has_value());
+ EXPECT_EQ(*ValOptA, lib("A"));
+
+ auto valOptASO = Resolver.resolve(withext("libA"));
+ EXPECT_TRUE(valOptASO.has_value());
+ EXPECT_EQ(*valOptASO, lib("A"));
+
+ // --- Check B ---
+ auto ValOptB = Resolver.resolve("libB", true);
+ EXPECT_TRUE(ValOptB.has_value());
+ EXPECT_EQ(*ValOptB, lib("B"));
+
+ auto valOptBSO = Resolver.resolve(withext("libB"));
+ EXPECT_TRUE(valOptBSO.has_value());
+ EXPECT_EQ(*valOptBSO, lib("B"));
+
+ // --- Check Z ---
+ auto ValOptZ = Resolver.resolve("libZ", true);
+ EXPECT_TRUE(ValOptZ.has_value());
+ EXPECT_EQ(*ValOptZ, lib("Z"));
+
+ auto valOptZSO = Resolver.resolve(withext("libZ"));
+ EXPECT_TRUE(valOptZSO.has_value());
+ EXPECT_EQ(*valOptZSO, lib("Z"));
+}
+#endif
+} // namespace
+#endif // defined(__APPLE__)
diff --git a/llvm/unittests/Support/AlignOfTest.cpp b/llvm/unittests/Support/AlignOfTest.cpp
index 979f2cf..53358a28 100644
--- a/llvm/unittests/Support/AlignOfTest.cpp
+++ b/llvm/unittests/Support/AlignOfTest.cpp
@@ -79,14 +79,14 @@ struct V8 : V5, virtual V6, V7 { double zz;
double S6::f() { return 0.0; }
float D2::g() { return 0.0f; }
-V1::~V1() {}
-V2::~V2() {}
-V3::~V3() {}
-V4::~V4() {}
-V5::~V5() {}
-V6::~V6() {}
-V7::~V7() {}
-V8::~V8() {}
+V1::~V1() = default;
+V2::~V2() = default;
+V3::~V3() = default;
+V4::~V4() = default;
+V5::~V5() = default;
+V6::~V6() = default;
+V7::~V7() = default;
+V8::~V8() = default;
template <typename M> struct T { M m; };
diff --git a/llvm/unittests/Support/AllocatorTest.cpp b/llvm/unittests/Support/AllocatorTest.cpp
index 1069e43..2337f34 100644
--- a/llvm/unittests/Support/AllocatorTest.cpp
+++ b/llvm/unittests/Support/AllocatorTest.cpp
@@ -235,7 +235,7 @@ class MockSlabAllocator {
static size_t LastSlabSize;
public:
- ~MockSlabAllocator() { }
+ ~MockSlabAllocator() = default;
void *Allocate(size_t Size, size_t /*Alignment*/) {
// Allocate space for the alignment, the slab, and a void* that goes right
diff --git a/llvm/unittests/Support/BinaryStreamTest.cpp b/llvm/unittests/Support/BinaryStreamTest.cpp
index 70cd403..06ed12b 100644
--- a/llvm/unittests/Support/BinaryStreamTest.cpp
+++ b/llvm/unittests/Support/BinaryStreamTest.cpp
@@ -110,7 +110,7 @@ constexpr uint32_t NumStreams = 2 * NumEndians;
class BinaryStreamTest : public testing::Test {
public:
- BinaryStreamTest() {}
+ BinaryStreamTest() = default;
void SetUp() override {
Streams.clear();
diff --git a/llvm/unittests/Support/Casting.cpp b/llvm/unittests/Support/Casting.cpp
index 18327f6..7906750 100644
--- a/llvm/unittests/Support/Casting.cpp
+++ b/llvm/unittests/Support/Casting.cpp
@@ -23,7 +23,7 @@ template <typename T> IllegalCast *cast(...) { return nullptr; }
// with conversion facility
//
struct bar {
- bar() {}
+ bar() = default;
bar(const bar &) = delete;
struct foo *baz();
struct foo *caz();
@@ -36,7 +36,7 @@ struct foo {
};
struct base {
- virtual ~base() {}
+ virtual ~base() = default;
};
struct derived : public base {
@@ -375,12 +375,12 @@ namespace inferred_upcasting {
class Base {
public:
// No classof. We are testing that the upcast is inferred.
- Base() {}
+ Base() = default;
};
class Derived : public Base {
public:
- Derived() {}
+ Derived() = default;
};
// Even with no explicit classof() in Base, we should still be able to cast
@@ -529,7 +529,7 @@ TEST(CastingTest, smart_dyn_cast_or_null) {
#ifndef NDEBUG
namespace assertion_checks {
struct Base {
- virtual ~Base() {}
+ virtual ~Base() = default;
};
struct Derived : public Base {
diff --git a/llvm/unittests/Support/InstructionCostTest.cpp b/llvm/unittests/Support/InstructionCostTest.cpp
index efe8388..5392689 100644
--- a/llvm/unittests/Support/InstructionCostTest.cpp
+++ b/llvm/unittests/Support/InstructionCostTest.cpp
@@ -14,7 +14,7 @@ using namespace llvm;
namespace {
struct CostTest : public testing::Test {
- CostTest() {}
+ CostTest() = default;
};
} // namespace
diff --git a/llvm/unittests/Support/OptimizedStructLayoutTest.cpp b/llvm/unittests/Support/OptimizedStructLayoutTest.cpp
index e8cd5f4..0bcae0d 100644
--- a/llvm/unittests/Support/OptimizedStructLayoutTest.cpp
+++ b/llvm/unittests/Support/OptimizedStructLayoutTest.cpp
@@ -25,7 +25,7 @@ class LayoutTest {
bool Verified = false;
public:
- LayoutTest() {}
+ LayoutTest() = default;
LayoutTest(const LayoutTest &) = delete;
LayoutTest &operator=(const LayoutTest &) = delete;
~LayoutTest() { assert(Verified); }
diff --git a/llvm/unittests/Support/YAMLIOTest.cpp b/llvm/unittests/Support/YAMLIOTest.cpp
index 283e5f8..7446c07 100644
--- a/llvm/unittests/Support/YAMLIOTest.cpp
+++ b/llvm/unittests/Support/YAMLIOTest.cpp
@@ -3221,12 +3221,12 @@ template <> struct TaggedScalarTraits<Scalar> {
template <> struct CustomMappingTraits<Map> {
static void inputOne(IO &IO, StringRef Key, Map &M) {
- IO.mapRequired(Key.str().c_str(), M[Key]);
+ IO.mapRequired(Key, M[Key]);
}
static void output(IO &IO, Map &M) {
for (auto &N : M)
- IO.mapRequired(N.getKey().str().c_str(), N.getValue());
+ IO.mapRequired(N.getKey(), N.getValue());
}
};
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp
index 50ad4d5..4680282 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp
@@ -21,7 +21,7 @@ using VPVerifierTest = VPlanTestBase;
namespace {
TEST_F(VPVerifierTest, VPInstructionUseBeforeDefSameBB) {
VPlan &Plan = getPlan();
- VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0));
+ VPValue *Zero = Plan.getConstantInt(32, 0);
VPInstruction *DefI = new VPInstruction(Instruction::Add, {Zero});
VPInstruction *UseI = new VPInstruction(Instruction::Sub, {DefI});
auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
@@ -56,7 +56,7 @@ TEST_F(VPVerifierTest, VPInstructionUseBeforeDefSameBB) {
TEST_F(VPVerifierTest, VPInstructionUseBeforeDefDifferentBB) {
VPlan &Plan = getPlan();
- VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0));
+ VPValue *Zero = Plan.getConstantInt(32, 0);
VPInstruction *DefI = new VPInstruction(Instruction::Add, {Zero});
VPInstruction *UseI = new VPInstruction(Instruction::Sub, {DefI});
auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
@@ -184,7 +184,7 @@ TEST_F(VPVerifierTest, VPPhiIncomingValueDoesntDominateIncomingBlock) {
TEST_F(VPVerifierTest, DuplicateSuccessorsOutsideRegion) {
VPlan &Plan = getPlan();
- VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0));
+ VPValue *Zero = Plan.getConstantInt(32, 0);
VPInstruction *I1 = new VPInstruction(Instruction::Add, {Zero});
auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
VPInstruction *BranchOnCond =
@@ -218,7 +218,7 @@ TEST_F(VPVerifierTest, DuplicateSuccessorsOutsideRegion) {
TEST_F(VPVerifierTest, DuplicateSuccessorsInsideRegion) {
VPlan &Plan = getPlan();
- VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0));
+ VPValue *Zero = Plan.getConstantInt(32, 0);
VPInstruction *I1 = new VPInstruction(Instruction::Add, {Zero});
auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
VPInstruction *BranchOnCond =
@@ -259,7 +259,7 @@ TEST_F(VPVerifierTest, BlockOutsideRegionWithParent) {
VPBasicBlock *VPBB1 = Plan.getEntry();
VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
- VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0));
+ VPValue *Zero = Plan.getConstantInt(32, 0);
auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
VPBB2->appendRecipe(CanIV);
@@ -288,7 +288,7 @@ TEST_F(VPVerifierTest, BlockOutsideRegionWithParent) {
TEST_F(VPVerifierTest, NonHeaderPHIInHeader) {
VPlan &Plan = getPlan();
- VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0));
+ VPValue *Zero = Plan.getConstantInt(32, 0);
auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
auto *BranchOnCond = new VPInstruction(VPInstruction::BranchOnCond, {CanIV});
@@ -351,8 +351,7 @@ TEST_F(VPIRVerifierTest, testVerifyIRPhi) {
BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
auto Plan = buildVPlan(LoopHeader);
- Plan->getExitBlocks()[0]->front().addOperand(
- Plan->getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(*Ctx), 0)));
+ Plan->getExitBlocks()[0]->front().addOperand(Plan->getConstantInt(32, 0));
#if GTEST_HAS_STREAM_REDIRECTION
::testing::internal::CaptureStderr();
diff --git a/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp b/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp
index ed802e2..6a36f47 100644
--- a/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp
+++ b/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp
@@ -154,7 +154,7 @@ public:
Provides = ProvideMap.lookup(ProvidesDef);
}
- ~RuntimeLibcallImpl() {}
+ ~RuntimeLibcallImpl() = default;
const Record *getDef() const { return TheDef; }
diff --git a/llvm/utils/TableGen/Basic/TargetFeaturesEmitter.h b/llvm/utils/TableGen/Basic/TargetFeaturesEmitter.h
index 99e4820..412f323 100644
--- a/llvm/utils/TableGen/Basic/TargetFeaturesEmitter.h
+++ b/llvm/utils/TableGen/Basic/TargetFeaturesEmitter.h
@@ -43,7 +43,7 @@ public:
void printFeatureKeyValues(raw_ostream &OS, const FeatureMapTy &FeatureMap);
void printCPUKeyValues(raw_ostream &OS, const FeatureMapTy &FeatureMap);
virtual void run(raw_ostream &O);
- virtual ~TargetFeaturesEmitter() {};
+ virtual ~TargetFeaturesEmitter() = default;
};
} // namespace llvm
#endif
diff --git a/llvm/utils/TableGen/Common/CodeGenTarget.cpp b/llvm/utils/TableGen/Common/CodeGenTarget.cpp
index 3db0d07..1e93788 100644
--- a/llvm/utils/TableGen/Common/CodeGenTarget.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenTarget.cpp
@@ -80,7 +80,7 @@ CodeGenTarget::CodeGenTarget(const RecordKeeper &records)
MacroFusions = Records.getAllDerivedDefinitions("Fusion");
}
-CodeGenTarget::~CodeGenTarget() {}
+CodeGenTarget::~CodeGenTarget() = default;
StringRef CodeGenTarget::getName() const { return TargetRec->getName(); }
diff --git a/llvm/utils/TableGen/Common/DAGISelMatcher.h b/llvm/utils/TableGen/Common/DAGISelMatcher.h
index f87de75..a19f444 100644
--- a/llvm/utils/TableGen/Common/DAGISelMatcher.h
+++ b/llvm/utils/TableGen/Common/DAGISelMatcher.h
@@ -105,7 +105,7 @@ protected:
Matcher(KindTy K) : Kind(K) {}
public:
- virtual ~Matcher() {}
+ virtual ~Matcher() = default;
unsigned getSize() const { return Size; }
void setSize(unsigned sz) { Size = sz; }
diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
index 5d49715..7af757c 100644
--- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
+++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
@@ -457,7 +457,7 @@ std::optional<LLTCodeGen> llvm::gi::MVTToLLT(MVT::SimpleValueType SVT) {
void Matcher::optimize() {}
-Matcher::~Matcher() {}
+Matcher::~Matcher() = default;
//===- GroupMatcher -------------------------------------------------------===//
@@ -1150,11 +1150,11 @@ void RuleMatcher::insnmatchers_pop_front() { Matchers.erase(Matchers.begin()); }
//===- PredicateMatcher ---------------------------------------------------===//
-PredicateMatcher::~PredicateMatcher() {}
+PredicateMatcher::~PredicateMatcher() = default;
//===- OperandPredicateMatcher --------------------------------------------===//
-OperandPredicateMatcher::~OperandPredicateMatcher() {}
+OperandPredicateMatcher::~OperandPredicateMatcher() = default;
bool OperandPredicateMatcher::isHigherPriorityThan(
const OperandPredicateMatcher &B) const {
@@ -1941,7 +1941,7 @@ bool InstructionOperandMatcher::isHigherPriorityThan(
//===- OperandRenderer ----------------------------------------------------===//
-OperandRenderer::~OperandRenderer() {}
+OperandRenderer::~OperandRenderer() = default;
//===- CopyRenderer -------------------------------------------------------===//
diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h
index 0f1241e..fdcca1d 100644
--- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h
+++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h
@@ -1375,7 +1375,7 @@ class InstructionPredicateMatcher : public PredicateMatcher {
public:
InstructionPredicateMatcher(PredicateKind Kind, unsigned InsnVarID)
: PredicateMatcher(Kind, InsnVarID) {}
- ~InstructionPredicateMatcher() override {}
+ ~InstructionPredicateMatcher() override = default;
/// Compare the priority of this object and B.
///
@@ -2319,7 +2319,7 @@ public:
ActionKind getKind() const { return Kind; }
- virtual ~MatchAction() {}
+ virtual ~MatchAction() = default;
// Some actions may need to add extra predicates to ensure they can run.
virtual void emitAdditionalPredicates(MatchTable &Table,
diff --git a/llvm/utils/TableGen/FastISelEmitter.cpp b/llvm/utils/TableGen/FastISelEmitter.cpp
index e0be104..c4dbb14 100644
--- a/llvm/utils/TableGen/FastISelEmitter.cpp
+++ b/llvm/utils/TableGen/FastISelEmitter.cpp
@@ -85,7 +85,7 @@ struct OperandsSignature {
char Repr = OK_Invalid;
public:
- OpKind() {}
+ OpKind() = default;
bool operator<(OpKind RHS) const { return Repr < RHS.Repr; }
bool operator==(OpKind RHS) const { return Repr == RHS.Repr; }
diff --git a/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp b/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp
index 043bc628..50e63a4 100644
--- a/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp
+++ b/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp
@@ -2441,7 +2441,7 @@ public:
explicit GICombinerEmitter(const RecordKeeper &RK,
const CodeGenTarget &Target, StringRef Name,
const Record *Combiner);
- ~GICombinerEmitter() override {}
+ ~GICombinerEmitter() override = default;
void run(raw_ostream &OS);
};
diff --git a/llvm/utils/TableGen/X86DisassemblerTables.cpp b/llvm/utils/TableGen/X86DisassemblerTables.cpp
index 3414190..b8c3c02 100644
--- a/llvm/utils/TableGen/X86DisassemblerTables.cpp
+++ b/llvm/utils/TableGen/X86DisassemblerTables.cpp
@@ -708,7 +708,7 @@ DisassemblerTables::DisassemblerTables() {
HasConflicts = false;
}
-DisassemblerTables::~DisassemblerTables() {}
+DisassemblerTables::~DisassemblerTables() = default;
void DisassemblerTables::emitModRMDecision(raw_ostream &o1, raw_ostream &o2,
unsigned &i1, unsigned &i2,
diff --git a/llvm/utils/TableGen/X86ModRMFilters.h b/llvm/utils/TableGen/X86ModRMFilters.h
index 7bf111f..4eb57b0 100644
--- a/llvm/utils/TableGen/X86ModRMFilters.h
+++ b/llvm/utils/TableGen/X86ModRMFilters.h
@@ -28,7 +28,7 @@ class ModRMFilter {
public:
/// Destructor - Override as necessary.
- virtual ~ModRMFilter() {}
+ virtual ~ModRMFilter() = default;
/// isDumb - Indicates whether this filter returns the same value for
/// any value of the ModR/M byte.
diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py
index 119303c..2dad16a 100644
--- a/llvm/utils/UpdateTestChecks/common.py
+++ b/llvm/utils/UpdateTestChecks/common.py
@@ -29,7 +29,7 @@ Version changelog:
'none' and 'all'. 'smart' is the default.
5: Basic block labels are matched by FileCheck expressions
6: The semantics of TBAA checks has been incorporated in the check lines.
-7: Indent switch-cases correctly.
+7: Indent switch-cases correctly; emit CHECK-EMPTY instead of skipping blank lines.
"""
DEFAULT_VERSION = 6
@@ -2280,6 +2280,14 @@ def add_checks(
# For IR output, change all defs to FileCheck variables, so we're immune
# to variable naming fashions.
else:
+ if ginfo.get_version() >= 7:
+ # Record the indices of blank lines in the function body preemptively.
+ blank_line_indices = {
+ i for i, line in enumerate(func_body) if line.strip() == ""
+ }
+ else:
+ blank_line_indices = set()
+
func_body = generalize_check_lines(
func_body,
ginfo,
@@ -2305,9 +2313,18 @@ def add_checks(
is_blank_line = False
- for func_line in func_body:
+ for idx, func_line in enumerate(func_body):
if func_line.strip() == "":
- is_blank_line = True
+ # We should distinguish whether the line is a 'fake' blank line generated by
+ # generalize_check_lines removing comments.
+ # Fortunately, generalize_check_lines does not change the index of each line,
+ # so we can record the indices of blank lines preemptively.
+ if idx in blank_line_indices:
+ output_lines.append(
+ "{} {}-EMPTY:".format(comment_marker, checkprefix)
+ )
+ else:
+ is_blank_line = True
continue
if not check_inst_comments:
# Do not waste time checking IR comments unless necessary.
diff --git a/mlir/include/mlir/Support/Timing.h b/mlir/include/mlir/Support/Timing.h
index a8a4bfd..3d61a0a 100644
--- a/mlir/include/mlir/Support/Timing.h
+++ b/mlir/include/mlir/Support/Timing.h
@@ -44,7 +44,7 @@ class DefaultTimingManagerImpl;
/// This is a POD type with pointer size, so it should be passed around by
/// value. The underlying data is owned by the `TimingManager`.
class TimingIdentifier {
- using EntryType = llvm::StringMapEntry<std::nullopt_t>;
+ using EntryType = llvm::StringMapEntry<llvm::EmptyStringSetTag>;
public:
TimingIdentifier(const TimingIdentifier &) = default;
diff --git a/mlir/lib/Support/Timing.cpp b/mlir/lib/Support/Timing.cpp
index 16306d7..2e92d9c 100644
--- a/mlir/lib/Support/Timing.cpp
+++ b/mlir/lib/Support/Timing.cpp
@@ -50,7 +50,8 @@ public:
llvm::sys::SmartRWMutex<true> identifierMutex;
/// A thread local cache of identifiers to reduce lock contention.
- ThreadLocalCache<llvm::StringMap<llvm::StringMapEntry<std::nullopt_t> *>>
+ ThreadLocalCache<
+ llvm::StringMap<llvm::StringMapEntry<llvm::EmptyStringSetTag> *>>
localIdentifierCache;
TimingManagerImpl() : identifiers(identifierAllocator) {}
diff --git a/runtimes/cmake/Modules/HandleLibC.cmake b/runtimes/cmake/Modules/HandleLibC.cmake
index 51fbf04..01da5b2 100644
--- a/runtimes/cmake/Modules/HandleLibC.cmake
+++ b/runtimes/cmake/Modules/HandleLibC.cmake
@@ -30,6 +30,7 @@ elseif (RUNTIMES_USE_LIBC STREQUAL "llvm-libc")
check_cxx_compiler_flag(-nostdlibinc CXX_SUPPORTS_NOSTDLIBINC_FLAG)
if(CXX_SUPPORTS_NOSTDLIBINC_FLAG)
target_compile_options(runtimes-libc-headers INTERFACE "-nostdlibinc")
+ target_compile_options(runtimes-libc-headers INTERFACE "-idirafter${LIBC_KERNEL_HEADERS}")
endif()
add_library(runtimes-libc-static INTERFACE)